#!/usr/bin/env python # -*- coding: utf-8 -*- """diskover - Elasticsearch file system crawler diskover is a file system crawler that index's your file metadata into Elasticsearch. See README.md or https://github.com/shirosaidev/diskover for more information. Copyright (C) Chris Park 2017-2020 diskover is released under the Apache 2.0 license. See LICENSE for the full license text. """ from scandir import scandir from rq import SimpleWorker, Queue from rq.registry import StartedJobRegistry from datetime import datetime from random import randint try: import configparser as ConfigParser except ImportError: import ConfigParser from multiprocessing import cpu_count from threading import Thread, Lock try: from queue import Queue as PyQueue except ImportError: from Queue import Queue as PyQueue import progressbar import argparse import logging import importlib import time import math import re import os import sys import json import requests version = '1.5.1.0' __version__ = version IS_PY3 = sys.version_info >= (3, 0) def print_banner(version): """This is the print banner function. It prints a random banner. """ c = randint(1, 4) if c == 1: color = '31m' elif c == 2: color = '32m' elif c == 3: color = '33m' elif c == 4: color = '35m' b = randint(1, 4) if b == 1: banner = """\033[%s ________ .__ __ \______ \ |__| _____| | _________ __ ___________ | | \| |/ ___/ |/ / _ \ \/ // __ \_ __ \\ /)___(\\ | ` \ |\___ \| < <_> ) /\ ___/| | \/ (='.'=) /_______ /__/____ >__|_ \____/ \_/ \___ >__| (\\")_(\\") \/ \/ \/ \/ v%s https://shirosaidev.github.io/diskover Crawling all your stuff. Support diskover on Patreon or PayPal :)\033[0m """ % (color, version) elif b == 2: banner = """\033[%s ___ ___ ___ ___ ___ ___ ___ ___ /\ \ /\ \ /\ \ /\__\ /\ \ /\__\ /\ \ /\ \\ /::\ \ _\:\ \ /::\ \ /:/ _/_ /::\ \ /:/ _/_ /::\ \ /::\ \\ /:/\:\__\ /\/::\__\ /\:\:\__\ /::-"\__\ /:/\:\__\ |::L/\__\ /::\:\__\ /::\:\__\\ \:\/:/ / \::/\/__/ \:\:\/__/ \;:;-",-" \:\/:/ / |::::/ / \:\:\/ / \;:::/ / \::/ / \:\__\ \::/ / |:| | \::/ / L;;/__/ \:\/ / |:\/__/ \/__/ \/__/ \/__/ \|__| \/__/ \/__/ \|__| v%s https://shirosaidev.github.io/diskover Bringing light to the darkness. Support diskover on Patreon or PayPal :)\033[0m """ % (color, version) elif b == 3: banner = """\033[%s _/_/_/ _/ _/ _/ _/ _/_/_/ _/ _/ _/_/ _/ _/ _/_/ _/ _/_/ _/ _/ _/ _/_/ _/_/ _/ _/ _/ _/ _/_/_/_/ _/_/ _/ _/ _/ _/_/ _/ _/ _/ _/ _/ _/ _/ _/ _/_/_/ _/ _/_/_/ _/ _/ _/_/ _/ _/_/_/ _/ v%s https://shirosaidev.github.io/diskover "I didn't even know that was there." Support diskover on Patreon or PayPal :)\033[0m """ % (color, version) elif b == 4: banner = """\033[%s __ __ /\ \ __ /\ \\ \_\ \/\_\ ____\ \ \/'\\ ___ __ __ __ _ __ // /'_` \/\ \ /',__\\\ \ , < / __`\/\ \/\ \ /'__`\/\`'__\\ ('> /\ \L\ \ \ \/\__, `\\\ \ \\\`\ /\ \L\ \ \ \_/ |/\ __/\ \ \/ /rr \ \___,_\ \_\/\____/ \ \_\ \_\ \____/\ \___/ \ \____\\\ \\_\\ *\))_ \/__,_ /\/_/\/___/ \/_/\/_/\/___/ \/__/ \/____/ \\/_/ v%s https://shirosaidev.github.io/diskover "Holy s*i# there are so many temp files." Support diskover on Patreon or PayPal :)\033[0m """ % (color, version) sys.stdout.write(banner) sys.stdout.write('\n') sys.stdout.flush() def load_config(): """This is the load config function. It checks for config file and loads in the config settings. 
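    The config file path can also be set with the DISKOVER_CONFIG env var.
    A minimal illustrative excerpt of diskover.cfg (the section and option
    names are the ones read below; the values are examples only, not
    defaults):

        [excludes]
        dirs = .snapshot,.git
        files = Thumbs.db,.DS_Store

        [elasticsearch]
        host = localhost
        port = 9200
        indexname = diskover-myindex

        [redis]
        host = localhost
        port = 6379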
""" configsettings = {} config = ConfigParser.ConfigParser() dir_path = os.path.dirname(os.path.realpath(__file__)) # check if env var for config file and use that try: configfile = os.environ['DISKOVER_CONFIG'] except KeyError: configfile = '%s/diskover.cfg' % dir_path pass # Check for config file if not os.path.isfile(configfile): print('Config file %s not found, exiting.' % configfile) sys.exit(1) config.read(configfile) # Check if any sections missing from config and exit if there is try: # check if env var for auth token and use that try: configsettings['auth_token'] = os.environ['DISKOVER_AUTH_TOKEN'] except KeyError: try: configsettings['auth_token'] = config.get('diskoverspace.com', 'auth_token') except ConfigParser.NoOptionError: print('ERROR: Can\'t find auth token, check config or env var is set.') print('See https://github.com/shirosaidev/diskover/wiki/Auth-token') sys.exit(1) # verify auth token auth_token = configsettings['auth_token'] if auth_token == "": print('ERROR: Can\'t find auth token, check config or env var is set.') print('See https://github.com/shirosaidev/diskover/wiki/Auth-token') sys.exit(1) payload = {'token': auth_token} r = requests.get('https://diskoverspace.com/diskover/verifytoken.php', params=payload) if r.status_code != 200: print('ERROR: Issue verifying token, status %s' % r.status_code) print('See https://github.com/shirosaidev/diskover/wiki/Auth-token') sys.exit(1) if auth_token.encode('utf-8') not in r.content: print('ERROR: Issue verifying token caused by %s' % r.text) print('See https://github.com/shirosaidev/diskover/wiki/Auth-token') sys.exit(1) try: d = config.get('excludes', 'dirs') dirs = d.split(',') configsettings['excluded_dirs'] = set(dirs) except ConfigParser.NoOptionError: configsettings['excluded_dirs'] = set([]) try: f = config.get('excludes', 'files') files = f.split(',') configsettings['excluded_files'] = set(files) except ConfigParser.NoOptionError: configsettings['excluded_files'] = set([]) try: d = config.get('includes', 'dirs') dirs = d.split(',') configsettings['included_dirs'] = set(dirs) except (ConfigParser.NoOptionError): configsettings['included_dirs'] = set([]) try: f = config.get('includes', 'files') files = f.split(',') configsettings['included_files'] = set(files) except ConfigParser.NoOptionError: configsettings['included_files'] = set([]) try: configsettings['ownersgroups_uidgidonly'] = config.get('ownersgroups', 'uidgidonly').lower() except ConfigParser.NoOptionError: configsettings['ownersgroups_uidgidonly'] = "false" try: configsettings['ownersgroups_domain'] = config.get('ownersgroups', 'domain').lower() except ConfigParser.NoOptionError: configsettings['ownersgroups_domain'] = "false" try: configsettings['ownersgroups_domainsep'] = config.get('ownersgroups', 'domainsep') except ConfigParser.NoOptionError: configsettings['ownersgroups_domainsep'] = "\\" try: configsettings['ownersgroups_keepdomain'] = config.get('ownersgroups', 'keepdomain').lower() except ConfigParser.NoOptionError: configsettings['ownersgroups_keepdomain'] = "false" try: t = config.get('autotag', 'files') if os.path.isfile("%s/%s" % (os.getcwd(),t)): atf = json.loads(open("%s/%s" % (os.getcwd(),t)).read()) else: atf = json.loads(t) configsettings['autotag_files'] = atf except ValueError as e: raise ValueError("Error in config autotag files: %s" % e) except ConfigParser.NoOptionError: configsettings['autotag_files'] = [] try: t = config.get('autotag', 'dirs') if os.path.isfile("%s/%s" % (os.getcwd(),t)): atd = json.loads(open("%s/%s" % 
(os.getcwd(),t)).read()) else: atd = json.loads(t) configsettings['autotag_dirs'] = atd except ValueError as e: raise ValueError("Error in config autotag dirs: %s" % e) except ConfigParser.NoOptionError: configsettings['autotag_dirs'] = [] try: configsettings['aws'] = config.get('elasticsearch', 'aws').lower() except ConfigParser.NoOptionError: configsettings['aws'] = "false" try: h = config.get('elasticsearch', 'host') hosts = h.split(',') configsettings['es_host'] = hosts except ConfigParser.NoOptionError: configsettings['es_host'] = ['localhost'] try: configsettings['es_port'] = int(config.get('elasticsearch', 'port')) except ConfigParser.NoOptionError: configsettings['es_port'] = 9200 try: configsettings['es_user'] = config.get('elasticsearch', 'user') except ConfigParser.NoOptionError: configsettings['es_user'] = "" try: configsettings['es_password'] = config.get('elasticsearch', 'password') except ConfigParser.NoOptionError: configsettings['es_password'] = "" try: configsettings['index'] = config.get('elasticsearch', 'indexname') except ConfigParser.NoOptionError: configsettings['index'] = "" try: configsettings['es_timeout'] = int(config.get('elasticsearch', 'timeout')) except ConfigParser.NoOptionError: configsettings['es_timeout'] = 10 try: configsettings['es_maxsize'] = int(config.get('elasticsearch', 'maxsize')) except ConfigParser.NoOptionError: configsettings['es_maxsize'] = 10 try: configsettings['es_max_retries'] = int(config.get('elasticsearch', 'maxretries')) except ConfigParser.NoOptionError: configsettings['es_max_retries'] = 0 try: configsettings['es_wait_status_yellow'] = config.get('elasticsearch', 'wait').lower() except ConfigParser.NoOptionError: configsettings['es_wait_status_yellow'] = "false" try: configsettings['es_chunksize'] = int(config.get('elasticsearch', 'chunksize')) except ConfigParser.NoOptionError: configsettings['es_chunksize'] = 500 try: configsettings['index_shards'] = int(config.get('elasticsearch', 'shards')) except ConfigParser.NoOptionError: configsettings['index_shards'] = 5 try: configsettings['index_replicas'] = int(config.get('elasticsearch', 'replicas')) except ConfigParser.NoOptionError: configsettings['index_replicas'] = 1 try: configsettings['index_refresh'] = config.get('elasticsearch', 'indexrefresh') except ConfigParser.NoOptionError: configsettings['index_refresh'] = "1s" try: configsettings['disable_replicas'] = config.get('elasticsearch', 'disablereplicas').lower() except ConfigParser.NoOptionError: configsettings['disable_replicas'] = "false" try: configsettings['index_translog_size'] = config.get('elasticsearch', 'translogsize') except ConfigParser.NoOptionError: configsettings['index_translog_size'] = "512mb" try: configsettings['es_scrollsize'] = int(config.get('elasticsearch', 'scrollsize')) except ConfigParser.NoOptionError: configsettings['es_scrollsize'] = 100 try: configsettings['redis_host'] = config.get('redis', 'host') except ConfigParser.NoOptionError: configsettings['redis_host'] = "localhost" try: configsettings['redis_port'] = int(config.get('redis', 'port')) except ConfigParser.NoOptionError: configsettings['redis_port'] = 6379 try: configsettings['redis_socket'] = config.get('redis', 'socket') except ConfigParser.NoOptionError: configsettings['redis_socket'] = "" try: configsettings['redis_password'] = config.get('redis', 'password') except ConfigParser.NoOptionError: configsettings['redis_password'] = "" try: configsettings['redis_db'] = int(config.get('redis', 'db')) except ConfigParser.NoOptionError: 
            configsettings['redis_db'] = 0
        try:
            configsettings['redis_rq_timeout'] = int(config.get('redis', 'timeout'))
        except ConfigParser.NoOptionError:
            configsettings['redis_rq_timeout'] = 180
        try:
            configsettings['redis_ttl'] = int(config.get('redis', 'ttl'))
        except ConfigParser.NoOptionError:
            configsettings['redis_ttl'] = 500
        try:
            configsettings['redis_queue'] = config.get('redis', 'queue')
        except ConfigParser.NoOptionError:
            configsettings['redis_queue'] = "diskover"
        try:
            configsettings['redis_queue_crawl'] = config.get('redis', 'queuecrawl')
        except ConfigParser.NoOptionError:
            configsettings['redis_queue_crawl'] = "diskover_crawl"
        try:
            configsettings['redis_queue_calcdir'] = config.get('redis', 'queuecalcdir')
        except ConfigParser.NoOptionError:
            configsettings['redis_queue_calcdir'] = "diskover_calcdir"
        try:
            configsettings['adaptivebatch_startsize'] = int(config.get('adaptivebatch', 'startsize'))
        except ConfigParser.NoOptionError:
            configsettings['adaptivebatch_startsize'] = 50
        try:
            configsettings['adaptivebatch_maxsize'] = int(config.get('adaptivebatch', 'maxsize'))
        except ConfigParser.NoOptionError:
            configsettings['adaptivebatch_maxsize'] = 500
        try:
            configsettings['adaptivebatch_stepsize'] = int(config.get('adaptivebatch', 'stepsize'))
        except ConfigParser.NoOptionError:
            configsettings['adaptivebatch_stepsize'] = 10
        try:
            configsettings['adaptivebatch_maxfiles'] = int(config.get('adaptivebatch', 'maxfiles'))
        except ConfigParser.NoOptionError:
            configsettings['adaptivebatch_maxfiles'] = 50000
        try:
            configsettings['listener_host'] = config.get('socketlistener', 'host')
        except ConfigParser.NoOptionError:
            configsettings['listener_host'] = "localhost"
        try:
            configsettings['listener_port'] = int(config.get('socketlistener', 'port'))
        except ConfigParser.NoOptionError:
            configsettings['listener_port'] = 9999
        try:
            configsettings['listener_maxconnections'] = int(config.get('socketlistener', 'maxconnections'))
        except ConfigParser.NoOptionError:
            configsettings['listener_maxconnections'] = 5
        try:
            configsettings['listener_twcport'] = int(config.get('socketlistener', 'twcport'))
        except ConfigParser.NoOptionError:
            configsettings['listener_twcport'] = 9998
        try:
            configsettings['diskover_path'] = config.get('paths', 'diskoverpath')
        except ConfigParser.NoOptionError:
            configsettings['diskover_path'] = "./diskover.py"
        try:
            configsettings['python_path'] = config.get('paths', 'pythonpath')
        except ConfigParser.NoOptionError:
            configsettings['python_path'] = "python"
        try:
            configsettings['md5_readsize'] = int(config.get('dupescheck', 'readsize'))
        except ConfigParser.NoOptionError:
            configsettings['md5_readsize'] = 65536
        try:
            configsettings['dupes_maxsize'] = int(config.get('dupescheck', 'maxsize'))
        except ConfigParser.NoOptionError:
            configsettings['dupes_maxsize'] = 1073741824
        try:
            configsettings['dupes_checkbytes'] = int(config.get('dupescheck', 'checkbytes'))
        except ConfigParser.NoOptionError:
            configsettings['dupes_checkbytes'] = 64
        try:
            configsettings['dupes_restoretimes'] = config.get('dupescheck', 'restoretimes').lower()
        except ConfigParser.NoOptionError:
            configsettings['dupes_restoretimes'] = "false"
        try:
            configsettings['dupes_threads'] = int(config.get('dupescheck', 'threads'))
        except ConfigParser.NoOptionError:
            configsettings['dupes_threads'] = 8
        try:
            configsettings['gource_maxfilelag'] = float(config.get('gource', 'maxfilelag'))
        except ConfigParser.NoOptionError:
            configsettings['gource_maxfilelag'] = 5
        try:
            configsettings['api_url'] = config.get('crawlapi', 'url')
        except ConfigParser.NoOptionError:
            configsettings['api_url'] =
"" try: configsettings['api_user'] = config.get('crawlapi', 'user') except ConfigParser.NoOptionError: configsettings['api_user'] = "" try: configsettings['api_password'] = config.get('crawlapi', 'password') except ConfigParser.NoOptionError: configsettings['api_password'] = "" try: configsettings['api_pagesize'] = config.get('crawlapi', 'pagesize') except ConfigParser.NoOptionError: configsettings['api_pagesize'] = "" except ConfigParser.NoSectionError as e: print('Missing section from diskover.cfg, check diskover.cfg.sample and copy over, exiting. (%s)' % e) sys.exit(1) return configsettings, configfile def get_plugins_info(): """This is the get plugins info function. It gets a list of python plugins info (modules) in the plugins directory and returns the plugins information. """ plugin_dir = os.path.dirname(os.path.realpath(__file__)) + "/plugins" main_module = "__init__" plugins_info = [] possible_plugins = os.listdir(plugin_dir) for i in possible_plugins: location = os.path.join(plugin_dir, i) if not os.path.isdir(location) or not main_module + ".py" \ in os.listdir(location): continue if IS_PY3: spec = importlib.machinery.PathFinder().find_spec(main_module, [location]) else: import imp spec = imp.find_module(main_module, [location]) plugins_info.append({"name": i, "spec": spec}) return plugins_info def load_plugins(): """This is the load plugins function. It dynamically load the plugins and return them in a list """ loaded_plugins = [] plugins_info = get_plugins_info() for plugin_info in plugins_info: if IS_PY3: plugin_module = importlib.util.module_from_spec(plugin_info["spec"]) plugin_info["spec"].loader.exec_module(plugin_module) else: import imp plugin_module = imp.load_module(plugin_info["name"], *plugin_info["spec"]) loaded_plugins.append(plugin_module) return loaded_plugins def list_plugins(): """This is the list plugins function. It prints the name of all the available plugins """ plugins_info = get_plugins_info() for plugin_info in plugins_info: print(plugin_info["name"]) def user_prompt(question): """ Prompt the yes/no-*question* to the user. """ from distutils.util import strtobool while True: try: if IS_PY3: user_input = input(question + " [y/n]: ").lower() else: user_input = raw_input(question + " [y/n]: ").lower() result = strtobool(user_input) return result except ValueError: print("Please use y/n or yes/no.\n") except KeyboardInterrupt: print("Ctrl-c keyboard interrupt, shutting down...") sys.exit(0) def scandir_check(): try: import _scandir except ImportError: _scandir = None try: import ctypes except ImportError: ctypes = None if _scandir is None and ctypes is None: print("ERROR: scandir can't find the compiled _scandir C module or ctypes") sys.exit(1) import warnings with warnings.catch_warnings(record=True): import scandir if scandir.scandir_c is None: print("ERROR: Compiled C version of scandir not found!") sys.exit(1) def index_create(indexname): """This is the es index create function. It checks for existing index and deletes if there is one with same name. It also creates the new index and sets up mappings. 
""" logger.info('Checking es index: %s', indexname) # check for existing es index if es.indices.exists(index=indexname): # check if reindex cli argument and don't delete existing index if cliargs['reindex']: logger.info('Reindexing (non-recursive, preserving tags)') return elif cliargs['reindexrecurs']: logger.info('Reindexing (recursive, preserving tags)') return # delete existing index else: if cliargs['forcedropexisting']: logger.warning('es index exists, deleting') es.indices.delete(index=indexname, ignore=[400, 404]) else: if user_prompt("Drop existing index?"): logger.warning('es index exists, deleting') es.indices.delete(index=indexname, ignore=[400, 404]) else: logger.info("Cannot continue with index. Exiting.") sys.exit(1) # set up es index mappings and create new index mappings = { "settings": { "index" : { "number_of_shards": config['index_shards'], "number_of_replicas": config['index_replicas'] } }, "mappings": { "diskspace": { "properties": { "path": { "type": "keyword" }, "total": { "type": "long" }, "used": { "type": "long" }, "free": { "type": "long" }, "available": { "type": "long" }, "indexing_date": { "type": "date" } } }, "crawlstat": { "properties": { "path": { "type": "keyword" }, "state": { "type": "text" }, "crawl_time": { "type": "float" }, "indexing_date": { "type": "date" } } }, "worker": { "properties": { "worker_name": { "type": "keyword" }, "dir_count": { "type": "integer" }, "file_count": { "type": "integer" }, "bulk_time": { "type": "float" }, "crawl_time": { "type": "float" }, "indexing_date": { "type": "date" } } }, "directory": { "properties": { "filename": { "type": "keyword" }, "path_parent": { "type": "keyword" }, "filesize": { "type": "long" }, "items": { "type": "long" }, "items_files": { "type": "long" }, "items_subdirs": { "type": "long" }, "owner": { "type": "keyword" }, "group": { "type": "keyword" }, "last_modified": { "type": "date" }, "last_access": { "type": "date" }, "last_change": { "type": "date" }, "hardlinks": { "type": "integer" }, "inode": { "type": "keyword" }, "tag": { "type": "keyword" }, "tag_custom": { "type": "keyword" }, "crawl_time": { "type": "float" }, "change_percent_filesize": { "type": "float" }, "change_percent_items": { "type": "float" }, "change_percent_items_files": { "type": "float" }, "change_percent_items_subdirs": { "type": "float" }, "worker_name": { "type": "keyword" }, "indexing_date": { "type": "date" } } }, "file": { "properties": { "filename": { "type": "keyword" }, "extension": { "type": "keyword" }, "path_parent": { "type": "keyword" }, "filesize": { "type": "long" }, "owner": { "type": "keyword" }, "group": { "type": "keyword" }, "last_modified": { "type": "date" }, "last_access": { "type": "date" }, "last_change": { "type": "date" }, "hardlinks": { "type": "integer" }, "inode": { "type": "keyword" }, "filehash": { "type": "keyword" }, "tag": { "type": "keyword" }, "tag_custom": { "type": "keyword" }, "dupe_md5": { "type": "keyword" }, "worker_name": { "type": "keyword" }, "indexing_date": { "type": "date" } } } } } # check plugins for additional mappings for plugin in plugins: mappings = (plugin.add_mappings(mappings)) logger.info('Creating es index') es.indices.create(index=indexname, body=mappings) time.sleep(.5) def index_bulk_add(es, doclist, config, cliargs): """This is the es index bulk add function. It bulk adds/updates/removes using file/directory meta data lists from worker's crawl results. 
""" if config['es_wait_status_yellow'] == "true": # wait for es health to be at least yellow es.cluster.health(wait_for_status='yellow', request_timeout=config['es_timeout']) # bulk load data to Elasticsearch index try: helpers.bulk(es, doclist, index=cliargs['index'], chunk_size=config['es_chunksize'], request_timeout=config['es_timeout']) except helpers.BulkIndexError: # bulk index error can happen occasionaly when using splitfiles and # if a doc trying be updated has a version mismatch, try to bulk # upload the update list items again time.sleep(.5) if cliargs['splitfiles']: update_doclist = [] for d in doclist: try: if d['_op_type']: update_doclist.append(d) except KeyError: pass doclist = update_doclist index_bulk_add(es, doclist, config, cliargs) def index_delete_path(path, cliargs, logger, reindex_dict, recursive=False): """This is the es delete path bulk function. It finds all file and directory docs in path and deletes them from es including the directory (path). Recursive will also find and delete all docs in subdirs of path. Stores any existing tags in reindex_dict. Returns reindex_dict. """ file_id_list = [] dir_id_list = [] file_delete_list = [] dir_delete_list = [] # refresh index es.indices.refresh(index=cliargs['index']) # escape special characters newpath = escape_chars(path) # create wildcard string and check for / (root) path if newpath == '\/': newpathwildcard = '\/*' else: newpathwildcard = newpath + '\/*' # file doc search if recursive: data = { "query": { "query_string": { "query": "path_parent: " + newpath + " OR " "path_parent: " + newpathwildcard, "analyze_wildcard": "true" } } } else: data = { "query": { "query_string": { "query": "path_parent: " + newpath } } } logger.info('Searching for all files in %s' % path) # search es and start scroll res = es.search(index=cliargs['index'], doc_type='file', scroll='1m', size=config['es_scrollsize'], body=data, request_timeout=config['es_timeout']) while res['hits']['hits'] and len(res['hits']['hits']) > 0: for hit in res['hits']['hits']: # add doc id to file_id_list file_id_list.append(hit['_id']) # add file path info inc. 
tags to reindex_file_list reindex_dict['file'].append((hit['_source']['path_parent'] + '/' + hit['_source']['filename'], hit['_source']['tag'], hit['_source']['tag_custom'])) # get es scroll id scroll_id = res['_scroll_id'] # use es scroll api res = es.scroll(scroll_id=scroll_id, scroll='1m', request_timeout=config['es_timeout']) logger.info('Found %s files for %s' % (len(file_id_list), path)) # add file id's to delete_list for i in file_id_list: d = { '_op_type': 'delete', '_index': cliargs['index'], '_type': 'file', '_id': i } file_delete_list.append(d) if len(file_delete_list) > 0: # bulk delete files in es logger.info('Bulk deleting files in es index') index_bulk_add(es, file_delete_list, config, cliargs) # directory doc search if recursive: data = { 'query': { 'query_string': { 'query': '(path_parent: ' + newpath + ') OR ' '(path_parent: ' + newpathwildcard + ') OR (filename: "' + os.path.basename(path) + '" AND path_parent: "' + os.path.abspath(os.path.join(path, os.pardir)) + '")', 'analyze_wildcard': 'true' } } } else: data = { 'query': { 'query_string': { 'query': '(path_parent: ' + newpath + ') OR (filename: "' + os.path.basename(path) + '" AND path_parent: "' + os.path.abspath(os.path.join(path, os.pardir)) + '")' } } } logger.info('Searching for all directories in %s' % path) # search es and start scroll res = es.search(index=cliargs['index'], doc_type='directory', scroll='1m', size=config['es_scrollsize'], body=data, request_timeout=config['es_timeout']) while res['hits']['hits'] and len(res['hits']['hits']) > 0: for hit in res['hits']['hits']: # add directory doc id to dir_id_list dir_id_list.append(hit['_id']) # add directory path info inc. tags, filesize, items to reindex_dir_list reindex_dict['directory'].append((hit['_source']['path_parent'] + '/' + hit['_source']['filename'], hit['_source']['tag'], hit['_source']['tag_custom'])) # get es scroll id scroll_id = res['_scroll_id'] # use es scroll api res = es.scroll(scroll_id=scroll_id, scroll='1m', request_timeout=config['es_timeout']) logger.info('Found %s directories for %s' % (len(dir_id_list), path)) # add dir id's to delete_list for i in dir_id_list: d = { '_op_type': 'delete', '_index': cliargs['index'], '_type': 'directory', '_id': i } dir_delete_list.append(d) if len(dir_delete_list) > 0: # bulk delete directories in es logger.info('Bulk deleting directories in es index') index_bulk_add(es, dir_delete_list, config, cliargs) return reindex_dict def index_get_docs(cliargs, logger, doctype='directory', copytags=False, hotdirs=False, index=None, path=None, sort=False, maxdepth=None, pathid=False): """This is the es get docs function. It finds all docs (by doctype) in es and returns doclist which contains doc id, fullpath and mtime for all docs. If copytags is True will return tags from previous index. If path is specified will return just documents in and under directory path. If sort is True, will return paths in asc path order. if pathid is True, will return dict with path and their id. 
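    Illustrative return shapes (ids, paths and values are made up):

        default:  [('AVrx...', '/mnt/share/dir1', 1577836800.0, 'directory'), ...]
        copytags: [('/mnt/share/dir1', 'archive', '', 'directory'), ...]
        hotdirs:  [('AVrx...', '/mnt/share/dir1', 1024, 10, 8, 2), ...]
        pathid:   {'./dir1': 'AVrx...', ...}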
""" data = _index_get_docs_data(index, cliargs, logger, doctype=doctype, path=path, maxdepth=maxdepth, sort=sort) # refresh index es.indices.refresh(index) # search es and start scroll res = es.search(index=index, doc_type=doctype, scroll='1m', size=config['es_scrollsize'], body=data, request_timeout=config['es_timeout']) doclist = [] pathdict = {} doccount = 0 while res['hits']['hits'] and len(res['hits']['hits']) > 0: for hit in res['hits']['hits']: fullpath = os.path.abspath(os.path.join(hit['_source']['path_parent'], hit['_source']['filename'])) if copytags: doclist.append((fullpath, hit['_source']['tag'], hit['_source']['tag_custom'], doctype)) elif hotdirs: doclist.append((hit['_id'], fullpath, hit['_source']['filesize'], hit['_source']['items'], hit['_source']['items_files'], hit['_source']['items_subdirs'])) elif pathid: rel_path = fullpath.replace(rootdir_path, ".") pathdict[rel_path] = hit['_id'] else: # convert es time to unix time format mtime = time.mktime(datetime.strptime( hit['_source']['last_modified'], '%Y-%m-%dT%H:%M:%S').timetuple()) doclist.append((hit['_id'], fullpath, mtime, doctype)) doccount += 1 # use es scroll api res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m', request_timeout=config['es_timeout']) logger.info('Found %s %s docs' % (str(doccount), doctype)) if pathid: return pathdict else: return doclist def _index_get_docs_data(index, cliargs, logger, doctype='directory', path=None, maxdepth=None, sort=False): if cliargs['copytags']: logger.info('Searching for all %s docs with tags in %s...', doctype, index) data = { '_source': ['path_parent', 'filename', 'tag', 'tag_custom'], 'query': { 'query_string': { 'query': 'tag:(NOT "") OR tag_custom:(NOT "")' } } } elif cliargs['hotdirs']: logger.info('Searching for all %s docs in %s...', doctype, index) data = { '_source': ['path_parent', 'filename', 'filesize', 'items', 'items_files', 'items_subdirs'], 'query': { 'match_all': {} } } else: if path is None: if maxdepth is None: logger.info('Searching for all %s docs in %s...', doctype, index) data = { '_source': ['path_parent', 'filename', 'last_modified', 'last_access', 'last_change'], 'query': { 'match_all': {} } } else: # depth at rootdir num_sep = cliargs['rootdir'].count(os.path.sep) n = num_sep + maxdepth - 1 regexp = '(/[^/]+){1,' + str(n) + '}|/?' logger.info('Searching for all %s docs in %s (maxdepth %s)...', doctype, index, maxdepth) data = { '_source': ['path_parent', 'filename', 'last_modified', 'last_access', 'last_change'], 'query': { 'regexp': {'path_parent': regexp} } } else: # escape special characters newpath = escape_chars(path) # create wildcard string and check for / (root) path if newpath == '\/': newpathwildcard = '\/*' else: newpathwildcard = newpath + '\/*' logger.info('Searching for all %s docs in %s for path %s...', doctype, index, path) data = { '_source': ['path_parent', 'filename', 'last_modified', 'last_access', 'last_change'], 'query': { 'query_string': { 'query': '(path_parent: ' + newpath + ') OR ' '(path_parent: ' + newpathwildcard + ') OR (filename: "' + os.path.basename(path) + '" AND path_parent: "' + os.path.abspath(os.path.join(path, os.pardir)) + '")', } } } if sort: data['sort'] = [{'path_parent': {'order': 'desc'}}] return data def replace_path(path): """This is the replace path function. It replaces paths and drive letters sent to bots. 
""" frompath = cliargs['replacepath'][0] topath = cliargs['replacepath'][1] path = path.replace(frompath, topath) # change any windows path separators (for bots running in linux) path = path.replace('\\', '/') return path def split_list(a, n): """Generator that splits list a evenly into n pieces """ if IS_PY3: xrange = range k, m = divmod(len(a), n) return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) def add_diskspace(index, logger, path): """This is the add disk space function. It adds total, used, free and available disk space for a path to es. """ try: # linux statvfs = os.statvfs(path) # Size of filesystem in bytes total = statvfs.f_frsize * statvfs.f_blocks # Actual number of free bytes free = statvfs.f_frsize * statvfs.f_bfree # Number of free bytes that ordinary users are allowed # to use (excl. reserved space) available = statvfs.f_frsize * statvfs.f_bavail except AttributeError: # windows import ctypes total_bytes = ctypes.c_ulonglong(0) free_bytes = ctypes.c_ulonglong(0) available_bytes = ctypes.c_ulonglong(0) ctypes.windll.kernel32.GetDiskFreeSpaceExW(ctypes.c_wchar_p(path), ctypes.pointer(available_bytes), ctypes.pointer(total_bytes), ctypes.pointer(free_bytes)) total = total_bytes.value free = free_bytes.value available = available_bytes.value if cliargs['replacepath']: path = replace_path(path) used = total - free indextime_utc = datetime.utcnow().isoformat() data = { "path": path, "total": total, "used": used, "free": free, "available": available, "indexing_date": indextime_utc } # add to es logger.info('Adding disk space info to es index') es.index(index=index, doc_type='diskspace', body=data) def add_crawl_stats(es, index, path, crawltime, state): """This is the add crawl stats function. It adds crawl stats info to es when crawl starts and finishes. """ data = { "path": path, "state": state, # running, finished_crawl, finished_dircalc "crawl_time": round(crawltime, 6), "indexing_date": datetime.utcnow().isoformat() } es.index(index=index, doc_type='crawlstat', body=data) def dir_excluded(path, config, cliargs): """Return True if path in excluded_dirs set, False if not in the list""" name = os.path.basename(path) # return if directory in included list (whitelist) if name in config['included_dirs'] or path in config['included_dirs']: return False # skip any dirs in excluded_dirs if name in config['excluded_dirs'] or path in config['excluded_dirs']: if cliargs['verbose']: logger.info('Skipping (excluded dir) %s', path) return True # skip any dirs which start with . 
(dot) and in excluded_dirs if name.startswith('.') and u'.*' in config['excluded_dirs']: if cliargs['verbose']: logger.info('Skipping (.* dir) %s', path) return True # skip any dirs that are found in reg exp checks including wildcard searches found_dir = False found_path = False for d in config['excluded_dirs']: if d == '.*': continue if d.startswith('*') and d.endswith('*'): d = d.replace('*', '') if re.search(d, name): found_dir = True break elif re.search(d, path): found_path = True break elif d.startswith('*'): d = d + '$' if re.search(d, name): found_dir = True break elif re.search(d, path): found_path = True break elif d.endswith('*'): d = '^' + d if re.search(d, name): found_dir = True break elif re.search(d, path): found_path = True break else: if d == name: found_dir = True break elif d == path: found_path = True break if found_dir or found_path: if cliargs['verbose']: logger.info('Skipping (excluded dir) %s', path) return True return False def escape_chars(text): """This is the escape special characters function. It returns escaped path strings for es queries. """ # escape any backslash chars text = text.replace('\\', '\\\\') # escape any characters in chr_dict chr_dict = {'\n': '\\n', '\t': '\\t', '/': '\\/', '(': '\\(', ')': '\\)', '[': '\\[', ']': '\\]', '$': '\\$', ' ': '\\ ', '&': '\\&', '<': '\\<', '>': '\\>', '+': '\\+', '-': '\\-', '|': '\\|', '!': '\\!', '{': '\\{', '}': '\\}', '^': '\\^', '~': '\\~', '?': '\\?', ':': '\\:', '=': '\\=', '\'': '\\\'', '"': '\\"', '@': '\\@', '.': '\\.', '#': '\\#', '*': '\\*', ' ': '\\ '} def char_trans(text, chr_dict): for key, value in chr_dict.items(): text = text.replace(key, value) return text if IS_PY3: text_esc = text.translate(str.maketrans(chr_dict)) else: text_esc = char_trans(text, chr_dict) return text_esc def get_time(seconds): """This is the get time function It returns human readable time format for stats. """ m, s = divmod(seconds, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) return "%dd:%dh:%02dm:%02ds" % (d, h, m, s) def convert_size(size_bytes): """This is the convert size function It returns human readable file sizes. """ if size_bytes == 0: return "0B" size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) s = round(size_bytes / p, 2) return "%s %s" % (s, size_name[i]) def parse_cli_args(indexname): """This is the parse CLI arguments function. It parses command line arguments. 
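    An illustrative invocation (the index name must follow the
    diskover-<string> convention enforced in __main__):

        python diskover.py -d /mnt/share -i diskover-share -a -A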
""" parser = argparse.ArgumentParser() parser.add_argument("-d", "--rootdir", metavar='ROOTDIR', default=".", help="Directory to start crawling from (default: .)") parser.add_argument("-m", "--mtime", metavar='DAYS', default=0, type=int, help="Minimum (+num) / maximum (-num) days ago for file modified time (default: 0)") parser.add_argument("-s", "--minsize", metavar='BYTES', default=1, type=int, help="Minimum file size in Bytes (default: 1 Bytes)") parser.add_argument("-e", "--indexemptydirs", action="store_true", help="Index empty directories (default: don't index)") parser.add_argument("-i", "--index", default=indexname, help="Elasticsearch index name (default: from config)") parser.add_argument("-M", "--maxdepth", type=int, default=None, help="Maximum directory depth to crawl (default: None)") parser.add_argument("-c", "--maxdcdepth", type=int, default=None, help="Maximum directory depth to calculate directory sizes/items (default: None)") parser.add_argument("-b", "--batchsize", type=int, default=50, help="Batch size (dir count) for sending to worker bots, \ setting too low could impact performance (default: 50)") parser.add_argument("-a", "--adaptivebatch", action="store_true", help="Adaptive batch size for sending to worker bots (intelligent crawl)") parser.add_argument("-f", "--filehashsizeonly", action="store_true", help="Use filesize only for generating filehash field (default: use filesize + last_modified)") parser.add_argument("-T", "--walkthreads", type=int, default=cpu_count()*2, help="Number of threads for treewalk (default: cpu core count x 2)") parser.add_argument("-A", "--autotag", action="store_true", help="Get bots to auto-tag files/dirs based on patterns in config") parser.add_argument("-S", "--sizeondisk", action="store_true", help="Store size on disk (disk usage size) using block count x blocksize instead of file size") parser.add_argument("-B", "--blocksize", type=int, metavar='BLOCKSIZE', default=512, help="Blocksize (in bytes) used for --sizeondisk (default: 512)") parser.add_argument("-O", "--optimizeindex", action="store_true", help="Optimize index at end of crawl (reduce size)") parser.add_argument("-r", "--reindex", action="store_true", help="Reindex directory (non-recursive), data is added to existing index") parser.add_argument("-R", "--reindexrecurs", action="store_true", help="Reindex directory and all subdirs (recursive), data is added to existing index") parser.add_argument("-F", "--forcedropexisting", action="store_true", help="Silently drop an existing index (if present)") parser.add_argument("-D", "--finddupes", action="store_true", help="Find duplicate files in existing index and update their dupe_md5 field") parser.add_argument("-C", "--copytags", metavar='INDEX2', help="Copy tags from index2 to index") parser.add_argument("-H", "--hotdirs", metavar='INDEX2', help="Find hot dirs by calculating change percents from index2 (prev index) and update \ change_percent fields in index") parser.add_argument("-l", "--listen", action="store_true", help="Start tcp socket server and listen for remote commands") parser.add_argument("-L", "--listentwc", action="store_true", help="Start tcp socket server and listen for messages from diskover treewalk client") parser.add_argument("--twcport", type=int, metavar='PORT', help="Port number for tree walk client socket server (default: from config)") parser.add_argument("--replacepath", nargs=2, metavar="PATH", help="Replace path, example: --replacepath Z:\\ /mnt/share/") parser.add_argument("--splitfiles", 
action="store_true", help="For directories with lots of files, split meta collecting of files amongst bots") parser.add_argument("--splitfilesnum", type=int, metavar='NUMFILES', default=10000, help="Minimum number of files required in directory to cause file split for --splitfiles, \ setting too low could impact performance (default: 10000)") parser.add_argument("--chunkfiles", action="store_true", help="Chunk file lists and send to worker bots when scanning large directories") parser.add_argument("--chunkfilesnum", type=int, metavar='NUMFILES', default=1000, help="Minimum number of files required in directory to cause file chunking, \ setting too low could impact performance (default: 1000)") parser.add_argument("--noworkerdocs", action="store_true", help="Don't add worker docs (worker crawl stats) into Elasticsearch, could help to reduce index \ size for very large indexes") parser.add_argument("--nowait", action="store_true", help="Don't wait for worker bots to be running before enqueuing crawl jobs in RQ") parser.add_argument("--inchardlinks", action="store_true", help="Include any number of hardlinked files when finding dupes with --finddupes \ (default: no files with hardlink count > 1)") parser.add_argument("--crawlapi", action="store_true", help="Use storage Restful API instead of scandir") parser.add_argument("--storagent", metavar='HOST', nargs='+', help="Use diskover Storage Agent instead of scandir") parser.add_argument("--dircalcsonly", action="store_true", help="Calculate sizes and item counts for each directory doc in existing index \ (done automatically after each crawl)") parser.add_argument("--gourcert", action="store_true", help="Get realtime crawl data from ES for gource") parser.add_argument("--gourcemt", action="store_true", help="Get file mtime data from ES for gource") parser.add_argument("-q", "--quiet", action="store_true", help="Runs with no output") parser.add_argument("-v", "--verbose", action="store_true", help="Increase output verbosity") parser.add_argument("--debug", action="store_true", help="Debug message output") parser.add_argument("--listplugins", action="store_true", help="List plugins") parser.add_argument("-V", "--version", action="version", version="diskover v%s" % version, help="Prints version and exits") args = parser.parse_args() if args.index: args.index = args.index.lower() return args def log_setup(cliargs): """This is the log set up function. It configures log output for diskover. 
""" diskover_logger = logging.getLogger('diskover') diskover_logger.setLevel(logging.INFO) es_logger = logging.getLogger('elasticsearch') es_logger.setLevel(logging.WARNING) urllib3_logger = logging.getLogger('urllib3') urllib3_logger.setLevel(logging.WARNING) requests_logger = logging.getLogger('requests') requests_logger.setLevel(logging.WARNING) logging.addLevelName( logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) logging.addLevelName( logging.WARNING, "\033[1;31m%s\033[1;0m" % logging.getLevelName(logging.WARNING)) logging.addLevelName( logging.ERROR, "\033[1;41m%s\033[1;0m" % logging.getLevelName(logging.ERROR)) logging.addLevelName( logging.DEBUG, "\033[1;33m%s\033[1;0m" % logging.getLevelName(logging.DEBUG)) logformatter = '%(asctime)s [%(levelname)s][%(name)s] %(message)s' loglevel = logging.INFO logging.basicConfig(format=logformatter, level=loglevel) if cliargs['verbose']: diskover_logger.setLevel(logging.INFO) es_logger.setLevel(logging.INFO) urllib3_logger.setLevel(logging.INFO) requests_logger.setLevel(logging.INFO) if cliargs['debug']: diskover_logger.setLevel(logging.DEBUG) es_logger.setLevel(logging.DEBUG) urllib3_logger.setLevel(logging.DEBUG) requests_logger.setLevel(logging.DEBUG) if cliargs['quiet']: diskover_logger.disabled = True es_logger.disabled = True urllib3_logger.disabled = True requests_logger.disabled = True return diskover_logger def progress_bar(event): if event == 'Checking' or event == 'Calculating': widgets = [progressbar.AnimatedMarker(), ' ', event + ' (Queue: ', progressbar.Counter(), ') ', progressbar.Timer()] bar = progressbar.ProgressBar(widgets=widgets, max_value=progressbar.UnknownLength) else: widgets = [event + ' ', progressbar.Bar(), progressbar.Percentage(), ' (', progressbar.Timer(), ', ', progressbar.ETA(), ')'] bar = progressbar.ProgressBar(widgets=widgets, max_value=100) return bar def adaptive_batch(q, cliargs, batchsize): """This is the adaptive batch function. It auto adjusts the batch size sent to rq. Could be made better :) """ q_len = len(q) if q_len == 0: if (batchsize - ab_step) >= ab_start: batchsize = batchsize - ab_step elif q_len > 0: if (batchsize + ab_step) <= ab_max: batchsize = batchsize + ab_step cliargs['batchsize'] = batchsize return batchsize def calc_dir_sizes(cliargs, logger, path=None): from diskover_bot_module import calc_dir_size jobcount = 0 # max depth to calc dir sizes maxdepth = cliargs['maxdcdepth'] index = cliargs['index'] try: # wait for worker bots to be idle and all queues are empty logger.info('Waiting for diskover worker bots to be done with any jobs in rq...') while worker_bots_busy([q, q_crawl, q_calc]): time.sleep(1) if cliargs['adaptivebatch']: batchsize = ab_start else: batchsize = cliargs['batchsize'] if cliargs['verbose'] or cliargs['debug']: logger.info('Batch size: %s' % batchsize) # use generator and yield docs while scrolling index in es logger.info('Getting diskover bots to calculate directory sizes (maxdepth %s)...' 
% maxdepth) if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']: bar = progress_bar('Calculating') bar.start() else: bar = None data = _index_get_docs_data(index, cliargs, logger, path=path, maxdepth=maxdepth) # refresh index es.indices.refresh(index) starttime = time.time() # search es and start scroll res = es.search(index=index, doc_type='directory', scroll='1m', size=config['es_scrollsize'], body=data, request_timeout=config['es_timeout']) dirlist = [] dircount = 0 bartimestamp = time.time() while res['hits']['hits'] and len(res['hits']['hits']) > 0: for hit in res['hits']['hits']: fullpath = os.path.join(hit['_source']['path_parent'], hit['_source']['filename']) # convert es time to unix time format mtime = time.mktime(datetime.strptime(hit['_source']['last_modified'], '%Y-%m-%dT%H:%M:%S').timetuple()) atime = time.mktime(datetime.strptime(hit['_source']['last_access'], '%Y-%m-%dT%H:%M:%S').timetuple()) ctime = time.mktime(datetime.strptime(hit['_source']['last_change'], '%Y-%m-%dT%H:%M:%S').timetuple()) dirlist.append((hit['_id'], fullpath, mtime, atime, ctime)) dircount += 1 dirlist_len = len(dirlist) if dirlist_len >= batchsize: q_calc.enqueue(calc_dir_size, args=(dirlist, cliargs,), result_ttl=config['redis_ttl']) jobcount += 1 if cliargs['debug'] or cliargs['verbose']: logger.info("enqueued batchsize: %s (batchsize: %s)" % (dirlist_len, batchsize)) del dirlist[:] if cliargs['adaptivebatch']: batchsize = adaptive_batch(q_crawl, cliargs, batchsize) if cliargs['debug'] or cliargs['verbose']: logger.info("batchsize set to: %s" % batchsize) # update progress bar if bar: if time.time() - bartimestamp >= 2: try: bar.update(len(q_calc)) except (ZeroDivisionError, ValueError): bar.update(0) finally: bartimestamp = time.time() # use es scroll api res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m', request_timeout=config['es_timeout']) # enqueue dir calc job for any remaining in dirlist if len(dirlist) > 0: q_calc.enqueue(calc_dir_size, args=(dirlist, cliargs,), result_ttl=config['redis_ttl']) jobcount += 1 logger.info('Found %s directory docs' % str(dircount)) # set up progress bar with time remaining if bar: bar.finish() bar_max_val = len(q_calc) bar = progressbar.ProgressBar(max_value=bar_max_val) bar.start() # update progress bar until all worker bots are idle and q_calc queue is empty while worker_bots_busy([q_calc]): if bar: q_len = len(q_calc) try: bar.update(bar_max_val - q_len) except (ZeroDivisionError, ValueError): bar.update(0) time.sleep(1) if bar: bar.finish() elapsed = get_time(time.time() - starttime) logger.info('Finished calculating %s directory sizes in %s' % (dircount, elapsed)) except KeyboardInterrupt: print("Ctrl-c keyboard interrupt, shutting down...") sys.exit(0) def scandirwalk_worker(threadn, num_sep, level, cliargs, logger): dirs = [] nondirs = [] # check if we are using storage agent and make connection if cliargs['storagent']: stor_agent = True hostlist = cliargs['storagent'] stor_agent_conn = diskover_agent.AgentConnection(hosts=hostlist) stor_agent_conn.connect() if cliargs['debug'] or cliargs['verbose']: logger.info("[thread-%s] Connected to Storage Agent host: %s" % (threadn, stor_agent_conn.conn_host())) else: stor_agent = False while True: path = q_paths.get() try: q_paths_in_progress.put(path) if cliargs['debug'] or cliargs['verbose']: logger.info("[thread-%s] scandirwalk_worker: %s" % (threadn, path)) if cliargs['crawlapi']: root, api_dirs, api_nondirs = api_listdir(path, api_ses) path = root for d in api_dirs: # check if 
at maxdepth level to not enqueue subdirs and # descend further down the tree if cliargs['maxdepth']: num_sep_this = path.count(os.path.sep) if num_sep + level < num_sep_this: break if not dir_excluded(d[0], config, cliargs): q_paths.put(d[0]) dirs.append(d) for f in api_nondirs: if cliargs['maxdepth']: num_sep_this = f[0].count(os.path.sep) if num_sep + level < num_sep_this: break nondirs.append(f) del api_dirs[:] del api_nondirs[:] elif stor_agent: # grab dir list from storage agent server dir_list = stor_agent_conn.listdir(path) logger.debug("[thread-%s] scandirwalk_worker: Storage Agent host response time: %s" % (threadn, stor_agent_conn.response_time())) path, dirs_noexcl, nondirs = dir_list for d in dirs_noexcl: if not dir_excluded(d, config, cliargs): dirs.append(d) else: item_count = 0 f_count = 0 for entry in scandir(path): # check if at maxdepth level to not enqueue subdirs and # descend further down the tree if cliargs['maxdepth']: num_sep_this = entry.path.count(os.path.sep) if num_sep + level < num_sep_this: break if entry.is_dir(follow_symlinks=False) and not dir_excluded(entry.path, config, cliargs): q_paths.put(entry.path) dirs.append(entry.name) elif entry.is_file(follow_symlinks=False): nondirs.append(entry.name) f_count += 1 if item_count == 10000 and (cliargs['debug'] or cliargs['verbose']): logger.info("[thread-%s] scandirwalk_worker: processing directory with many items: %s" % (threadn, path)) if cliargs['chunkfiles'] and f_count > cliargs['chunkfilesnum']: if cliargs['debug'] or cliargs['verbose']: logger.info("[thread-%s] scandirwalk_worker: chunksize reached, sending partial dirlist to worker bots: %s" % (threadn, path)) q_paths_results.put(((path, 'dchunk'), dirs[:], nondirs[:])) del dirs[:] del nondirs[:] f_count = 0 item_count += 1 q_paths_results.put((path, dirs[:], nondirs[:])) except (OSError, IOError) as e: logger.warning("[thread-%s] OS/IO Exception caused by: %s" % (threadn, e)) pass except UnicodeDecodeError as e: logger.warning("[thread-%s] Unicode Decode Exception caused by: %s (path: %s)" % (threadn, e, path)) pass except Exception as e: logger.error("[thread-%s] Exception caused by: %s" % (threadn, e)) raise finally: q_paths_in_progress.get() del dirs[:] del nondirs[:] q_paths.task_done() def scandirwalk(path, cliargs, logger): q_paths.put(path) while True: entry = q_paths_results.get() root, dirs, nondirs = entry if cliargs['debug'] or cliargs['verbose']: if cliargs['crawlapi']: logger.info("apiwalk: %s (dircount: %s, filecount: %s)" % (root[0], str(len(dirs)), str(len(nondirs)))) else: logger.info("scandirwalk: %s (dircount: %s, filecount: %s)" % (root, str(len(dirs)), str(len(nondirs)))) yield root, dirs, nondirs q_paths_results.task_done() if q_paths_results.qsize() == 0 and q_paths.qsize() == 0: time.sleep(.5) if q_paths_results.qsize() == 0 and q_paths.qsize() == 0 and q_paths_in_progress.qsize() == 0: break def treewalk(top, num_sep, level, batchsize, cliargs, logger, reindex_dict): """This is the tree walk function. It walks the tree and adds tuple of directory and it's items to redis queue for rq worker bots to scrape meta and upload to ES index after batch size (dir count) has been reached. 
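    Each batch entry is a (root, dirs, files) tuple from scandirwalk (for
    chunked large directories root is the tuple (path, 'dchunk')). When a
    batch is full it is handed to the worker bots roughly like this, with
    illustrative paths:

        batch = [('/mnt/share/dir1', ['sub1'], ['a.txt', 'b.txt'])]
        q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,),
                        result_ttl=config['redis_ttl'])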
""" from diskover_bot_module import scrape_tree_meta batch = [] dircount = 0 totaldirs = 0 totalfiles = 0 starttime = time.time() # set up threads for tree walk for i in range(cliargs['walkthreads']): t = Thread(target=scandirwalk_worker, args=(i, num_sep, level, cliargs, logger,)) t.daemon = True t.start() # set up progress bar if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']: widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ', progressbar.Counter(), progressbar.FormatLabel(''), ') ', progressbar.Timer()] bar = progressbar.ProgressBar(widgets=widgets, max_value=progressbar.UnknownLength) bar.start() else: bar = None bartimestamp = time.time() for root, dirs, files in scandirwalk(top, cliargs, logger): if type(root) is tuple: _root = root[0] if root[1] == 'dchunk': dirchunk = True statsembeded = False else: statsembeded = True dirchunk = False else: _root = root dirchunk = False statsembeded = False if not dirchunk or statsembeded: dircount += 1 totaldirs += 1 files_len = len(files) dirs_len = len(dirs) # check for empty dirs if not cliargs['indexemptydirs']: if dirs_len == 0 and files_len == 0: if cliargs['debug'] or cliargs['verbose']: logger.info("skipping empty dir: %s" % _root) continue totalfiles += files_len # replace path if cliarg if cliargs['replacepath']: if dirchunk: root[0] = replace_path(root[0]) else: root = replace_path(root) # add directory and it's items to batch list batch.append((root, dirs, files)) batch_len = len(batch) if batch_len >= batchsize or (cliargs['adaptivebatch'] and totalfiles >= config['adaptivebatch_maxfiles']): q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,), result_ttl=config['redis_ttl']) if cliargs['debug'] or cliargs['verbose']: logger.info("enqueued batchsize: %s (batchsize: %s)" % (batch_len, batchsize)) del batch[:] totalfiles = 0 if cliargs['adaptivebatch']: batchsize = adaptive_batch(q_crawl, cliargs, batchsize) if cliargs['debug'] or cliargs['verbose']: logger.info("batchsize set to: %s" % batchsize) # update progress bar if bar: if time.time() - bartimestamp >= 2: try: elapsed = round(time.time() - bartimestamp, 3) dirspersec = round(dircount / elapsed, 3) widgets[4] = progressbar.FormatLabel(', ' + str(dirspersec) + ' dirs/sec) ') bar.update(len(q_crawl)) except (ZeroDivisionError, ValueError): bar.update(0) finally: bartimestamp = time.time() dircount = 0 # add any remaining in batch to queue if len(batch) > 0: q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,), result_ttl=config['redis_ttl']) # set up progress bar with time remaining if bar: bar.finish() bar_max_val = len(q_crawl) bar = progressbar.ProgressBar(max_value=bar_max_val) bar.start() # update progress bar until bots are idle and queue is empty while worker_bots_busy([q_crawl]): if bar: q_len = len(q_crawl) try: bar.update(bar_max_val - q_len) except (ZeroDivisionError, ValueError): bar.update(0) time.sleep(1) if bar: bar.finish() elapsed = time.time() - starttime dirspersec = round(totaldirs / elapsed, 3) elapsed = get_time(elapsed) logger.info("Finished crawling in %s, dirs walked %s (%s dirs/sec)" % (elapsed, totaldirs, dirspersec)) def crawl_tree(path, cliargs, logger, reindex_dict): """This is the crawl tree function. It sets up the directory tree walking. 
""" try: wait_for_worker_bots(logger) logger.info('Enqueueing crawl to diskover worker bots for %s...', path) if cliargs['autotag']: logger.info("Worker bots set to auto-tag (-A)") if cliargs['sizeondisk']: logger.info("Storing on disk size instead of file size using a blocksize of %s (-S)" % cliargs['blocksize']) if cliargs['adaptivebatch']: batchsize = ab_start cliargs['batchsize'] = batchsize logger.info("Sending adaptive batches to worker bots (-a)") if cliargs['verbose'] or cliargs['debug']: logger.info('Batch size: %s' % batchsize) else: batchsize = cliargs['batchsize'] if cliargs['verbose'] or cliargs['debug']: logger.info('Batch size: %s' % batchsize) logger.info("Sending batches of %s to worker bots", batchsize) if batchsize < 25: logger.warning("Using a small batch size can decrease performance") if cliargs['splitfiles']: logger.info("Bots will split filelists larger than %s and share with other bots", cliargs['splitfilesnum']) if cliargs['splitfilesnum'] < 5000: logger.warning("Using a small splitfiles num can decrease performance") if cliargs['chunkfiles']: logger.info("Sending chunks of %s files to worker bots for any large directories", cliargs['chunkfilesnum']) if cliargs['chunkfilesnum'] < 500: logger.warning("Using a small chunkfiles num can decrease performance") # set maxdepth level to 1 if reindex if cliargs['reindex']: level = 1 cliargs['maxdepth'] = 1 else: level = cliargs['maxdepth'] # set current depth num_sep = path.count(os.path.sep) # check for listenlwc socket cli flag to start socket server if cliargs['listentwc']: from diskover_socket_server import start_socket_server_twc starttime = start_socket_server_twc(rootdir_path, num_sep, level, batchsize, cliargs, logger, reindex_dict) return starttime starttime = time.time() logger.info("Starting crawl using %s treewalk threads (maxdepth %s)" % (cliargs['walkthreads'], cliargs['maxdepth'])) # start tree walking treewalk(path, num_sep, level, batchsize, cliargs, logger, reindex_dict) return starttime except KeyboardInterrupt: print("Ctrl-c keyboard interrupt, shutting down...") sys.exit(0) def hotdirs(): from diskover_bot_module import calc_hot_dirs """This is the calculate hot dirs function. """ logger.info('Getting diskover bots to calculate change percent ' 'for directories from %s to %s', cliargs['hotdirs'], cliargs['index']) # look in index for all directory docs and add to queue dirlist = index_get_docs(cliargs, logger, doctype='directory', hotdirs=True, index=cliargs['index']) dirbatch = [] if cliargs['adaptivebatch']: batchsize = ab_start else: batchsize = cliargs['batchsize'] if cliargs['verbose'] or cliargs['debug']: logger.info('Batch size: %s' % batchsize) for d in dirlist: dirbatch.append(d) if len(dirbatch) >= batchsize: q.enqueue(calc_hot_dirs, args=(dirbatch, cliargs,), result_ttl=config['redis_ttl']) del dirbatch[:] if cliargs['adaptivebatch']: batchsize = adaptive_batch(q, cliargs, batchsize) # add any remaining in batch to queue q.enqueue(calc_hot_dirs, args=(dirbatch, cliargs,), result_ttl=config['redis_ttl']) if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']: bar = progress_bar('Checking') bar.start() else: bar = None # update progress bar until all bots are idle and q queue is empty while worker_bots_busy([q]): if bar: try: bar.update(len(q)) except (ZeroDivisionError, ValueError): bar.update(0) time.sleep(1) if bar: bar.finish() def worker_bots_busy(queues): """This is the worker bots busy function. 
It returns True when bots are busy and queues have jobs, else returns False when bots are all idle and queues are empty. """ workers_busy = False workers = SimpleWorker.all(connection=redis_conn) for worker in workers: if worker._state == "busy": workers_busy = True break q_len = 0 running_jobs = 0 for qname in queues: q_len += len(qname) r = StartedJobRegistry(queue=qname) running_job_ids = r.get_job_ids() running_jobs += len(running_job_ids) if q_len == 0 and running_jobs == 0 and workers_busy == False: return False else: return True def wait_for_worker_bots(logger): """This is the wait for worker bots function. It loops waiting for worker bots to start. """ if cliargs['nowait']: return workers = SimpleWorker.all(connection=redis_conn) while len(workers) == 0: logger.info('Waiting for diskover worker bots to start...') time.sleep(2) workers = SimpleWorker.all(connection=redis_conn) logger.info('Found %s diskover RQ worker bots', len(workers)) def tune_es_for_crawl(defaults=False): """This is the tune es for crawl function. It optimizes ES for crawling based on config settings and after crawl is over sets back to defaults. """ if config['disable_replicas'] == 'true': replicas = 0 else: replicas = config['index_replicas'] default_settings = { "index": { "refresh_interval": "1s", "number_of_replicas": config['index_replicas'], "translog.flush_threshold_size": "512mb" } } tuned_settings = { "index": { "refresh_interval": config['index_refresh'], "number_of_replicas": replicas, "translog.flush_threshold_size": config['index_translog_size'] } } if not defaults: logger.info("Tuning ES index settings for crawl") es.indices.put_settings(index=cliargs['index'], body=tuned_settings, request_timeout=config['es_timeout']) else: logger.info("Setting ES index settings back to defaults") es.indices.put_settings(index=cliargs['index'], body=default_settings, request_timeout=config['es_timeout']) # set logging level for es to ERROR to not output any warnings about timeouts for index optimizing logging.getLogger('elasticsearch').setLevel(logging.ERROR) logger.info("Force merging ES index...") es.indices.forcemerge(index=cliargs['index'], request_timeout=config['es_timeout']) # check if we should optimize index if cliargs['optimizeindex']: logger.info('Optimizing ES index... this could take a while... (-O)') try: es.indices.forcemerge(index=cliargs['index'], max_num_segments=1, request_timeout=config['es_timeout']) except exceptions.ConnectionTimeout: logger.info("Optimizing timed out, will finish in background") pass def upload_stats(): """This is the upload stats function. It uploads num of files, dirs, tb crawled to diskoverspace.com. """ index = cliargs['index'] path = cliargs['rootdir'] token = config['auth_token'] es.indices.refresh(index) dir_count = es.count(index=index, doc_type='directory', body={'query': { 'match_all': {} }})['count'] file_count = es.count(index=index, doc_type='file', body={'query': { 'match_all': {} }})['count'] body = { 'query': { 'query_string': { 'query': 'filename: "%s" AND path_parent: "%s"' % ( os.path.basename(path), os.path.abspath(os.path.join(path, os.pardir))) } } } size = es.search(index=index, doc_type='directory', body=body)['hits']['hits'][0]['_source']['filesize'] size_tb = size/1024/1024/1024/1024 data = {'token': token, 'dirs': dir_count, 'files': file_count, 'size': size_tb} r = requests.post('https://diskoverspace.com/diskover/uploadstats.php', data=data) def post_crawl_tasks(): """This is the post crawl tasks function. 


def post_crawl_tasks():
    """This is the post crawl tasks function.
    It runs at the end of the crawl and does post tasks.
    """
    # add elapsed time crawl stat to es
    add_crawl_stats(es, cliargs['index'], rootdir_path,
                    (time.time() - starttime), "finished_crawl")
    # calculate directory sizes and items
    if cliargs['reindex'] or cliargs['reindexrecurs']:
        calc_path = rootdir_path
    else:
        calc_path = None
    calc_dir_sizes(cliargs, logger, path=calc_path)
    # add elapsed time dir calc stat to es
    add_crawl_stats(es, cliargs['index'], rootdir_path,
                    (time.time() - starttime), "finished_dircalc")
    if cliargs['reindex'] or cliargs['reindexrecurs']:
        # wait for worker bots to be idle and all queues are empty
        logger.info('Waiting for diskover worker bots to be done with any jobs in rq...')
        while worker_bots_busy([q, q_crawl, q_calc]):
            time.sleep(1)
    # set Elasticsearch index settings back to default
    tune_es_for_crawl(defaults=True)
    # upload stats to diskoverspace.com
    upload_stats()


def pre_crawl_tasks():
    # create Elasticsearch index
    index_create(cliargs['index'])
    # add crawl stat to index
    add_crawl_stats(es, cliargs['index'], rootdir_path, 0, "running")
    # optimize Elasticsearch index settings for crawling
    tune_es_for_crawl()
    # add disk space info to es index
    if not cliargs['reindex'] and not cliargs['reindexrecurs']:
        if cliargs['crawlapi']:
            from diskover_crawlapi import api_add_diskspace
            api_add_diskspace(es, cliargs['index'], rootdir_path, api_ses, logger)
        else:
            add_diskspace(cliargs['index'], logger, rootdir_path)


# load config file into config dictionary
config, configfile = load_config()

# set adaptive batch sizes from config
ab_start = config['adaptivebatch_startsize']
ab_max = config['adaptivebatch_maxsize']
ab_step = config['adaptivebatch_stepsize']

# load any available plugins
plugins = load_plugins()

import diskover_connections

# create Elasticsearch connection
diskover_connections.connect_to_elasticsearch()
from diskover_connections import es_conn as es
from diskover_connections import exceptions
from diskover_connections import helpers

# create Redis connection
diskover_connections.connect_to_redis()
from diskover_connections import redis_conn

# Redis queue names
listen = [config['redis_queue'], config['redis_queue_crawl'], config['redis_queue_calcdir']]

# set up Redis queues
q = Queue(listen[0], connection=redis_conn, default_timeout=config['redis_rq_timeout'])
q_crawl = Queue(listen[1], connection=redis_conn, default_timeout=config['redis_rq_timeout'])
q_calc = Queue(listen[2], connection=redis_conn, default_timeout=config['redis_rq_timeout'])

# queues for paths
q_paths = PyQueue()
q_paths_results = PyQueue()
q_paths_in_progress = PyQueue()
lock = Lock()
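
# Illustrative sketch (assumption, not executed here): the worker bots this
# dispatcher waits on are RQ workers listening on the same three queue names,
# roughly equivalent to:
#
#   from rq import SimpleWorker
#   SimpleWorker([q, q_crawl, q_calc], connection=redis_conn).work()
#
# This module only enqueues jobs onto q, q_crawl and q_calc and then polls
# worker_bots_busy() until those workers drain the queues.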


if __name__ == "__main__":
    # check fast c version of scandir is installed
    scandir_check()

    # parse cli arguments into cliargs dictionary
    cliargs = vars(parse_cli_args(config['index']))

    # cli args check
    if cliargs['splitfilesnum'] < 100:
        print('Error: --splitfilesnum cannot be < 100. See -h for defaults.')
        sys.exit(1)
    if cliargs['chunkfilesnum'] < 100:
        print('Error: --chunkfilesnum cannot be < 100. See -h for defaults.')
        sys.exit(1)
    if cliargs['splitfiles'] and cliargs['chunkfiles']:
        if cliargs['splitfilesnum'] <= cliargs['chunkfilesnum'] + 1:
            print('Error: --splitfilesnum cannot be <= --chunkfilesnum. See -h for defaults.')
            sys.exit(1)

    # set up logging
    logger = log_setup(cliargs)

    if not cliargs['quiet'] and not cliargs['gourcert'] and not cliargs['gourcemt']:
        # print random banner
        print_banner(version)

    logger.info("Using config file: %s" % configfile)

    # list plugins
    if cliargs['listplugins']:
        print("diskover plugins:")
        list_plugins()
        sys.exit(0)

    # run just dir calcs if cli arg
    if cliargs['dircalcsonly']:
        calc_dir_sizes(cliargs, logger)
        sys.exit(0)

    try:
        # check index name
        if cliargs['index'] == "diskover" or \
                cliargs['index'].split('-')[0] != "diskover":
            print('Please name your index: diskover-<string>')
            sys.exit(1)
    except IndexError:
        print('Please name your index: diskover-<string>')
        sys.exit(1)

    # check for listen socket cli flag to start socket server
    if cliargs['listen']:
        from diskover_socket_server import start_socket_server
        start_socket_server(cliargs, logger)
        sys.exit(0)

    # check for gource cli flags
    if cliargs['gourcert'] or cliargs['gourcemt']:
        try:
            from diskover_gource import gource
            gource(es, cliargs)
        except KeyboardInterrupt:
            print('\nCtrl-c keyboard interrupt received, exiting')
            sys.exit(0)

    # tag duplicate files if cli argument
    if cliargs['finddupes']:
        from diskover_dupes import dupes_finder
        wait_for_worker_bots(logger)
        # set up worker threads for duplicate file checker queue
        dupes_finder(es, q, cliargs, logger)
        logger.info('DONE checking for dupes! Sayonara!')
        sys.exit(0)

    # copy tags from index2 to index if cli argument
    if cliargs['copytags']:
        from diskover_bot_module import tag_copier
        wait_for_worker_bots(logger)
        logger.info('Copying tags from %s to %s', cliargs['copytags'], cliargs['index'])
        # look in index2 for all directory docs with tags and add to queue
        dirlist = index_get_docs(cliargs, logger, doctype='directory', copytags=True,
                                 index=cliargs['copytags'])
        for path in dirlist:
            q.enqueue(tag_copier, args=(path, cliargs,), result_ttl=config['redis_ttl'])
        # look in index2 for all file docs with tags and add to queue
        filelist = index_get_docs(cliargs, logger, doctype='file', copytags=True,
                                  index=cliargs['copytags'])
        for path in filelist:
            q.enqueue(tag_copier, args=(path, cliargs,), result_ttl=config['redis_ttl'])
        if len(dirlist) == 0 and len(filelist) == 0:
            logger.info('No tags to copy')
        else:
            logger.info('Worker bots copying tags in background')
        logger.info('Dispatcher is DONE! Sayonara!')
        sys.exit(0)

    # calculate directory change percent from index2 to index if cli argument
    if cliargs['hotdirs']:
        wait_for_worker_bots(logger)
        hotdirs()
        logger.info('DONE finding hotdirs! Sayonara!')
        sys.exit(0)

    # print plugins
    plugins_list = ""
    for i in get_plugins_info():
        plugins_list = plugins_list + i["name"] + " "
    if plugins:
        logger.info("Plugins loaded: %s", plugins_list)
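
    # Typical invocation (illustrative sketch; -d is confirmed by the error
    # message above, -a and -O match the log messages in this module, and the
    # -i/--index flag and example paths are assumptions about parse_cli_args):
    #
    #   $ python diskover.py -d /srv/data -i diskover-srvdata -a -O
    #
    # which crawls /srv/data into the diskover-srvdata index using adaptive
    # batches and force-optimizes the index after the crawl finishes.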

    # check if rootdir exists
    if cliargs['crawlapi']:
        if cliargs['rootdir'] == '.' or cliargs['rootdir'] == "":
            logger.error("Rootdir path missing, use -d /rootdir, exiting")
            sys.exit(1)
        from diskover_crawlapi import api_connection, api_stat, api_listdir
        logger.info('Connecting to file system storage api at %s... (--crawlapi)'
                    % config['api_url'])
        api_ses = api_connection()
        logger.info('Connected to storage api')
        # check using storage api
        try:
            api_stat(cliargs['rootdir'], api_ses)
        except ValueError as e:
            logger.error("Rootdir path not found or not a directory, exiting (%s)" % e)
            sys.exit(1)
    elif cliargs['storagent']:
        try:
            import diskover_agent
        except ImportError:
            logger.error("Missing diskover_agent.py module, exiting")
            sys.exit(1)
    else:
        # warn if not running as root (linux) or Administrator (windows)
        try:
            is_admin = os.geteuid() == 0
            user = "root"
        except AttributeError:
            # windows
            import ctypes
            is_admin = ctypes.windll.shell32.IsUserAnAdmin() != 0
            user = "Administrator"
        if not is_admin:
            logger.warning('Not running as %s, permissions might block crawling some files' % user)
        if not os.path.exists(cliargs['rootdir']) or not os.path.isdir(cliargs['rootdir']):
            logger.error("Rootdir path not found or not a directory, exiting")
            sys.exit(1)

    logger.debug('Excluded dirs: %s', config['excluded_dirs'])

    # set rootdir_path to absolute path
    rootdir_path = os.path.abspath(cliargs['rootdir'])
    # remove any trailing slash unless root /
    if rootdir_path != '/':
        rootdir_path = rootdir_path.rstrip(os.path.sep)

    # check exclude
    if dir_excluded(rootdir_path, config, cliargs):
        logger.info("Directory in exclude list, exiting")
        sys.exit(0)

    cliargs['rootdir'] = rootdir_path

    # convert to unicode if python2
    if not IS_PY3:
        rootdir_path = unicode(rootdir_path)

    # warn if indexing 0 Byte empty files
    if cliargs['minsize'] == 0:
        logger.warning('You are indexing 0 Byte empty files (-s 0)')

    # warn if indexing empty dirs
    if cliargs['indexemptydirs']:
        logger.warning('You are indexing empty directories (-e)')

    # check if we are reindexing and remove existing docs in Elasticsearch
    # before crawling and reindexing
    reindex_dict = {'file': [], 'directory': []}
    if cliargs['reindex']:
        reindex_dict = index_delete_path(rootdir_path, cliargs, logger, reindex_dict)
    elif cliargs['reindexrecurs']:
        reindex_dict = index_delete_path(rootdir_path, cliargs, logger, reindex_dict,
                                         recursive=True)

    pre_crawl_tasks()

    # start crawling
    starttime = crawl_tree(rootdir_path, cliargs, logger, reindex_dict)

    post_crawl_tasks()

    logger.info('All DONE! Sayonara!')
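
# End-to-end flow of a normal (non-reindex) run, for reference:
#   1. pre_crawl_tasks()  - index_create, "running" crawl stat, tune_es_for_crawl
#   2. crawl_tree()       - treewalk enqueues directory/file batches to the worker bots
#   3. post_crawl_tasks() - calc_dir_sizes, tune_es_for_crawl(defaults=True), upload_stats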