#!/usr/bin/env python3
"""
spidy Web Crawler
Built by rivermont and FalconWarriorr
"""
import time
import shutil
import requests
import urllib
import threading
import queue
import logging

from os import path, makedirs
from copy import copy
from lxml import etree
from lxml.html import iterlinks, resolve_base_href
from reppy.robots import Robots

try:
    from spidy import __version__
except ImportError:
    from __init__ import __version__

VERSION = __version__

# Time statements.
# This is done before anything else to enable timestamp logging at every step.


def get_time():
    return time.strftime('%H:%M:%S')


def get_full_time():
    return time.strftime('%H:%M:%S, %A %b %d, %Y')


START_TIME = int(time.time())
START_TIME_LONG = get_time()

# Get current working directory of spidy
WORKING_DIR = path.realpath('.')
PACKAGE_DIR = path.dirname(path.realpath(__file__))

# Open log file for logging
try:
    makedirs(WORKING_DIR + '/logs')  # Attempts to make the logs directory
    makedirs(WORKING_DIR + '/saved')  # Attempts to make the saved directory
except OSError:
    pass  # Assumes only OSError will complain if /logs or /saved already exists

LOG_FILE = open(path.join(WORKING_DIR, 'logs', 'spidy_log_{0}.txt'.format(START_TIME)),
                'w+', encoding='utf-8', errors='ignore')
LOG_FILE_NAME = path.join('logs', 'spidy_log_{0}.txt'.format(START_TIME))

# Error log location
ERR_LOG_FILE = path.join(WORKING_DIR, 'logs', 'spidy_error_log_{0}.txt'.format(START_TIME))
ERR_LOG_FILE_NAME = path.join('logs', 'spidy_error_log_{0}.txt'.format(START_TIME))

LOGGER = logging.getLogger('SPIDY')
LOGGER.setLevel(logging.DEBUG)

# Create a file handler for the error log
handler = logging.FileHandler(ERR_LOG_FILE)
# Minimum level logged: DEBUG
handler.setLevel(logging.DEBUG)

# Create a formatter and add it to the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# Add the handler to the logger
LOGGER.addHandler(handler)

log_mutex = threading.Lock()


def write_log(operation, message, package='spidy', status='INFO', worker=0):
    """
    Writes message to both the console and the log file.

    Operations:
        INIT, CRAWL, SAVE, LOG, ERROR

    Statuses:
        INFO, ERROR, INPUT

    Packages:
        spidy, reppy

    Worker 0 = Core
    """
    global LOG_FILE, log_mutex
    with log_mutex:
        now = get_time()
        message = '[{0}] [{1}] [WORKER #{2}] [{3}] [{4}]: {5}'\
            .format(now, package, str(worker), operation, status, message)
        print(message)
        if not LOG_FILE.closed:
            LOG_FILE.write('\n' + message)


write_log('INIT', 'Starting spidy Web Crawler version {0}'.format(VERSION))
write_log('INIT', 'Report any problems to GitHub at https://github.com/rivermont/spidy')


###########
# CLASSES #
###########

write_log('INIT', 'Creating classes...')


class HeaderError(Exception):
    """
    Raised when there's a problem deciphering returned HTTP headers.
    """
    pass


class SizeError(Exception):
    """
    Raised when a file is too large to download in an acceptable time.
    """
    pass
""" pass class Counter(object): """ Thread safe Counter """ def __init__(self, value=0): # RawValue because we don't need it to create a Lock: self.val = value self.lock = threading.Lock() def increment(self): with self.lock: self.val += 1 def decrement(self): with self.lock: self.val -= 1 def value(self): with self.lock: return self.val class ThreadSafeSet(list): """ Thread Safe set """ def __init__(self): self.lock = threading.Lock() self._set = set() def get(self): with self.lock: return self._set.pop() def put(self, o): with self.lock: self._set.add(o) def get_all(self): with self.lock: return self._set def clear(self): with self.lock: self._set.clear() class RobotsIndex(object): """ Thread Safe Robots Index """ def __init__(self, respect_robots, user_agent): self.respect_robots = respect_robots self.user_agent = user_agent self.lock = threading.Lock() self.index = {} def is_allowed(self, start_url): if self.respect_robots: return self._lookup(start_url) else: return True def size(self): return len(self.index) def _lookup(self, url): hostname = urllib.parse.urlparse(url).hostname if hostname not in self.index.keys(): with self.lock: # check again to be sure if hostname not in self.index.keys(): self._remember(url) return self.index[hostname].allowed(url) def _remember(self, url): urlparsed = urllib.parse.urlparse(url) robots_url = url.replace(urlparsed.path, '/robots.txt') write_log('ROBOTS', 'Reading robots.txt file at: {0}'.format(robots_url), package='reppy') robots = Robots.fetch(robots_url) checker = robots.agent(self.user_agent) self.index[urlparsed.hostname] = checker ############# # FUNCTIONS # ############# write_log('INIT', 'Creating functions...') def crawl(url, thread_id=0): global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS if not OVERRIDE_SIZE: try: # Attempt to get the size in bytes of the document length = int(requests.head(url, headers=HEADER).headers['Content-Length']) except KeyError: # Sometimes no Content-Length header is returned... length = 1 if length > 524288000: # If the page is larger than 500 MB raise SizeError # If the SizeError is raised it will be caught in the except block in the run section, # and the following code will not be run. page = requests.get(url, headers=HEADER) # Get page word_list = [] if SAVE_WORDS: word_list = make_words(page) for word in word_list: WORDS.put(word) try: # Pull out all links after resolving them using any <base> tags found in the document. links = [link for element, attribute, link, pos in iterlinks(resolve_base_href(page.content))] except etree.ParseError: # If the document is not HTML content this will return an empty list. 
        links = []

    links = list(set(links))

    if SAVE_PAGES:
        save_page(url, page)

    if SAVE_WORDS:
        # Announce which link was crawled
        write_log('CRAWL', 'Found {0} links and {1} words on {2}'.format(len(links), len(word_list), url),
                  worker=thread_id)
    else:
        # Announce which link was crawled
        write_log('CRAWL', 'Found {0} links on {1}'.format(len(links), url), worker=thread_id)

    return links


def crawl_worker(thread_id, robots_index):
    """
    Crawler worker thread method.
    """
    # Declare global variables
    global VERSION, START_TIME, START_TIME_LONG
    global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME
    global HEADER, WORKING_DIR, KILL_LIST
    global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT
    global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES
    global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT
    global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE
    global RESPECT_ROBOTS, RESTRICT, DOMAIN
    global WORDS, TODO, DONE, THREAD_RUNNING

    while THREAD_RUNNING:
        # Check if there are more urls to crawl
        if TODO.empty():
            # Increment the empty counter
            EMPTY_COUNTER.increment()
            # Check if other threads are producing links
            # by waiting until the queue is no longer empty
            while TODO.empty():
                # If all threads have hit the empty counter
                if EMPTY_COUNTER.val == THREAD_COUNT:
                    # Finish crawling
                    done_crawling()
                    return
                time.sleep(1)
            # Got a url in the queue; decrement the counter
            EMPTY_COUNTER.decrement()

        # Queue not empty
        url = None
        try:
            if NEW_ERROR_COUNT.val >= MAX_NEW_ERRORS or \
                    KNOWN_ERROR_COUNT.val >= MAX_KNOWN_ERRORS or \
                    HTTP_ERROR_COUNT.val >= MAX_HTTP_ERRORS or \
                    NEW_MIME_COUNT.val >= MAX_NEW_MIMES:  # If too many errors have occurred
                write_log('CRAWL', 'Too many errors have accumulated; stopping crawler.')
                done_crawling()
                break
            elif COUNTER.val >= SAVE_COUNT:  # If it's time for an autosave
                # Make sure only one thread saves files
                with save_mutex:
                    if COUNTER.val > 0:
                        try:
                            write_log('CRAWL', 'Queried {0} links.'.format(str(COUNTER.val)), worker=thread_id)
                            info_log()
                            write_log('SAVE', 'Saving files...')
                            save_files()
                            if ZIP_FILES:
                                zip_saved_files(time.time(), 'saved')
                        finally:
                            # Reset variables
                            COUNTER = Counter(0)
                            WORDS.clear()
            # Crawl the page
            else:
                try:
                    url = TODO.get(block=False)
                except queue.Empty:
                    continue
                else:
                    if check_link(url, robots_index):  # If the link is invalid
                        continue
                    links = crawl(url, thread_id)
                    for link in links:
                        # Skip empty links
                        if len(link) <= 0 or link == "/":
                            continue
                        # If link is relative, make it absolute
                        if link[0] == '/':
                            if url[-1] == '/':
                                link = url[:-1] + link
                            else:
                                link = url + link
                        TODO.put(link)
                    DONE.put(url)
                    COUNTER.increment()
                    TODO.task_done()

        # ERROR HANDLING
        except KeyboardInterrupt:  # If the user does ^C
            handle_keyboard_interrupt()

        except Exception as e:
            link = url
            write_log('CRAWL', 'An error was raised trying to process {0}'.format(link),
                      status='ERROR', worker=thread_id)
            err_mro = type(e).mro()

            if SizeError in err_mro:
                KNOWN_ERROR_COUNT.increment()
                write_log('ERROR', 'Document too large.', worker=thread_id)
                err_log(link, 'SizeError', e)

            elif OSError in err_mro:
                KNOWN_ERROR_COUNT.increment()
                write_log('ERROR', 'An OSError occurred.', worker=thread_id)
                err_log(link, 'OSError', e)

            elif str(e) == 'HTTP Error 403: Forbidden':
                write_log('ERROR', 'HTTP 403: Access Forbidden', worker=thread_id)

            elif etree.ParserError in err_mro:  # Error processing html/xml
                KNOWN_ERROR_COUNT.increment()
                write_log('ERROR', 'An XMLSyntaxError occurred. A web dev screwed up somewhere.',
                          worker=thread_id)
                err_log(link, 'XMLSyntaxError', e)

            elif requests.exceptions.SSLError in err_mro:  # Invalid SSL certificate
                KNOWN_ERROR_COUNT.increment()
                write_log('ERROR', 'An SSLError occurred. Site is using an invalid certificate.',
                          worker=thread_id)
                err_log(link, 'SSLError', e)

            elif requests.exceptions.ConnectionError in err_mro:  # Error connecting to page
                KNOWN_ERROR_COUNT.increment()
                write_log('ERROR', 'A ConnectionError occurred. '
                                   'There\'s something wrong with somebody\'s network.',
                          worker=thread_id)
                err_log(link, 'ConnectionError', e)

            elif requests.exceptions.TooManyRedirects in err_mro:  # Exceeded 30 redirects
                KNOWN_ERROR_COUNT.increment()
                write_log('ERROR', 'A TooManyRedirects error occurred. '
                                   'Page is probably part of a redirect loop.',
                          worker=thread_id)
                err_log(link, 'TooManyRedirects', e)

            elif requests.exceptions.ContentDecodingError in err_mro:
                # Received a response with content-encoding: gzip, but failed to decode it.
                KNOWN_ERROR_COUNT.increment()
                write_log('ERROR', 'A ContentDecodingError occurred. '
                                   'Probably just a zip bomb, nothing to worry about.',
                          worker=thread_id)
                err_log(link, 'ContentDecodingError', e)

            elif 'Unknown MIME type' in str(e):
                NEW_MIME_COUNT.increment()
                write_log('ERROR', 'Unknown MIME type: {0}'.format(str(e)[18:]), worker=thread_id)
                err_log(link, 'Unknown MIME', e)

            else:  # Any other error
                NEW_ERROR_COUNT.increment()
                write_log('ERROR', 'An unknown error happened. New debugging material!', worker=thread_id)
                err_log(link, 'Unknown', e)
                if RAISE_ERRORS:
                    done_crawling()
                    raise e
                else:
                    continue

            write_log('LOG', 'Saved error message and timestamp to error log file', worker=thread_id)

    write_log('CRAWL', 'Thread execution stopped.', worker=thread_id)


def check_link(item, robots_index=None):
    """
    Returns True if item is not a valid url.
    Returns False if item passes all inspections (is a valid url).
    """
    # Shortest possible url being 'http://a.b', and
    # links longer than 255 characters are usually too long for the filesystem to handle.
    if robots_index and not robots_index.is_allowed(item):
        return True
    if RESTRICT:
        if DOMAIN not in item:
            return True
    if len(item) < 10 or len(item) > 255:
        return True
    # Must be an http(s) link
    elif item[0:4] != 'http':
        return True
    elif item in copy(DONE.queue):
        return True
    return False


def check_word(word):
    """
    Returns True if word is not valid.
    Returns False if word passes all inspections (is valid).
    """
    # If word is longer than 16 characters (avg password length is ~8)
    if len(word) > 16:
        return True
    else:
        return False


def check_path(file_path):
    """
    Checks the path of a given filename to see whether it will cause errors when saving.
    Returns True if the path is valid.
    Returns False if the path is invalid.
    """
    if len(file_path) > 256:
        return False
    else:
        return True


def make_words(site):
    """
    Returns a list of all valid words on the page.
    """
    page = site.text  # Get page content
    word_list = page.split()  # Split content into a list of words, as separated by spaces
    del page
    word_list = list(set(word_list))  # Remove duplicates
    # Keep only valid words; building a new list avoids mutating the list while iterating over it
    word_list = [word for word in word_list if not check_word(word)]
    return word_list


def save_files():
    """
    Saves the TODO, DONE, and word lists into their respective files.
    Also logs the action to the console.
    """
""" global TODO, DONE with open(TODO_FILE, 'w', encoding='utf-8', errors='ignore') as todoList: for site in copy(TODO.queue): try: todoList.write(site + '\n') # Save TODO list except UnicodeError: continue write_log('SAVE', 'Saved TODO list to {0}'.format(TODO_FILE)) with open(DONE_FILE, 'w', encoding='utf-8', errors='ignore') as done_list: for site in copy(DONE.queue): try: done_list.write(site + '\n') # Save done list except UnicodeError: continue write_log('SAVE', 'Saved DONE list to {0}'.format(TODO_FILE)) if SAVE_WORDS: update_file(WORD_FILE, WORDS.get_all(), 'words') def make_file_path(url, ext): """ Makes a valid Windows file path for a given url. """ url = url.replace(ext, '') # Remove extension from path for char in """/\\ *""": # Remove illegal characters from path url = url.replace(char, '-') for char in """|:?&<>""": url = url.replace(char, '') url = url[:255] + ext # Truncate to valid file length return url def get_mime_type(page): """ Extracts the Content-Type header from the headers returned by page. """ try: doc_type = str(page.headers['content-type']) return doc_type except KeyError: # If no Content-Type was returned, return blank return '' def mime_lookup(value): """ Finds the correct file extension for a MIME type using the MIME_TYPES dictionary. If the MIME type is blank it defaults to .html, and if the MIME type is not in the dictionary it raises a HeaderError. """ value = value.lower() # Reduce to lowercase value = value.split(';')[0] # Remove possible encoding if value in MIME_TYPES: return MIME_TYPES[value] elif value == '': return '.html' else: raise HeaderError('Unknown MIME type: {0}'.format(value)) def save_page(url, page): """ Download content of url and save to the save folder. """ # Make file path ext = mime_lookup(get_mime_type(page)) cropped_url = make_file_path(url, ext) file_path = path.join(WORKING_DIR, 'saved', '{0}'.format(cropped_url)) # Save file with open(file_path, 'w', encoding='utf-8', errors='ignore') as file: if ext == '.html': file.write('''<!-- "{0}" --> <!-- Downloaded with the spidy Web Crawler --> <!-- https://github.com/rivermont/spidy --> '''.format(url)) file.write(page.text) def update_file(file, content, file_type): with open(file, 'r+', encoding='utf-8', errors='ignore') as open_file: # Open save file for reading and writing file_content = open_file.readlines() # Make list of all lines in file contents = [] for x in file_content: contents.append(x.strip()) for item in file_content: content.update(item) # Otherwise add item to content (set) del file_content for item in content: open_file.write('\n' + str(item)) # Write all words to file open_file.truncate() # Delete everything in file beyond what has been written (old stuff) write_log('SAVE', 'Saved {0} {1} to {2}'.format(len(content), file_type, file)) def info_log(): """ Logs important information to the console and log file. 
""" # Print to console write_log('LOG', 'Started at {0}'.format(START_TIME_LONG)) write_log('LOG', 'Log location: {0}'.format(LOG_FILE_NAME)) write_log('LOG', 'Error log location: {0}'.format(ERR_LOG_FILE_NAME)) write_log('LOG', '{0} links in TODO'.format(TODO.qsize())) write_log('LOG', '{0} links in DONE'.format(DONE.qsize())) write_log('LOG', 'TODO/DONE: {0}'.format(TODO.qsize() / DONE.qsize())) write_log('LOG', '{0}/{1} new errors caught.'.format(NEW_ERROR_COUNT.val, MAX_NEW_ERRORS)) write_log('LOG', '{0}/{1} HTTP errors encountered.'.format(HTTP_ERROR_COUNT.val, MAX_HTTP_ERRORS)) write_log('LOG', '{0}/{1} new MIMEs found.'.format(NEW_MIME_COUNT.val, MAX_NEW_MIMES)) write_log('LOG', '{0}/{1} known errors caught.'.format(KNOWN_ERROR_COUNT.val, MAX_KNOWN_ERRORS)) def log(message, level=logging.DEBUG): """ Logs a single message to the error log file. Prints message verbatim, so message must be formatted correctly in the function call. Parameters ---------- message : str Message to log level : lvl logging.[DEBUG, INFO, WARNING, ERROR, CRITICAL] """ LOGGER.log(level, message) def handle_invalid_input(type_='input'): """ Handles an invalid user input, usually from the input() function. """ LOG_FILE.write('\n[{0}] [spidy] [INPUT] [ERROR]: Please enter a valid {1}. (yes/no)'.format(get_time(), type_)) raise SyntaxError('[{0}] [spidy] [INPUT] [ERROR]: Please enter a valid {1}. (yes/no)'.format(get_time(), type_)) def err_log(url, error1, error2): """ Saves the triggering error to the log file. error1 is the trimmed error source. error2 is the extended text of the error. """ LOGGER.error("\nURL: {0}\nERROR: {1}\nEXT: {2}\n\n".format(url, error1, str(error2))) def zip_saved_files(out_file_name, directory): """ Creates a .zip file in the current directory containing all contents of dir, then empties. """ shutil.make_archive(str(out_file_name), 'zip', directory) # Zips files shutil.rmtree(directory) # Deletes folder makedirs(directory) # Creates empty folder of same name write_log('SAVE', 'Zipped documents to {0}.zip'.format(out_file_name)) ######## # INIT # ######## write_log('INIT', 'Creating variables...') # Sourced mainly from https://www.iana.org/assignments/media-types/media-types.xhtml # Added by hand after being discovered by the crawler to reduce lookup times. 
MIME_TYPES = {
    'application/atom+xml': '.atom',
    'application/epub+zip': '.epub',
    'application/font-woff': '.woff',
    'application/font-woff2': '.woff2',
    'application/force-download': '.bin',  # No idea what this is so saving as .bin
    'application/gzip': '.gz',
    'application/java-archive': '.jar',
    'application/javascript': '.js',
    'application/js': '.js',  # Should be application/javascript
    'application/json': '.json',
    'application/json+oembed': '.json',
    'application/ld+json': '.jsonld',
    'application/marcxml+xml': '.mrcx',
    'application/msword': '.doc',
    'application/n-triples': '.nt',
    'application/octet-stream': '.exe',  # Sometimes .bin
    'application/ogg': '.ogx',
    'application/opensearchdescription+xml': '.osdx',
    'application/pdf': '.pdf',
    'application/postscript': '.eps',  # Also .ps
    'application/rdf+xml': '.rdf',
    'application/rsd+xml': '.rsd',
    'application/rss+xml': '.rss',
    'application/txt': '.txt',
    'application/vnd.ms-cab-compressed': '.cab',
    'application/vnd.ms-excel': '.xls',
    'application/vnd.ms-fontobject': '.eot',
    'application/x-endnote-refer': '.enw',
    'application/x-www-form-urlencoded': '.png',
    'application/vnd.android.package-archive': '.apk',
    'application/vnd.oasis.opendocument.text': '.odt',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
    'application/vnd.oasis.opendocument.formula-template': '.otf',
    'application/vnd.php.serialized': '.php',
    'application/x-bibtex': '.bib',
    'application/x-font-ttf': '.ttf',
    'application/x-font-woff': '.woff',
    'application/x-gzip': '.gz',
    'application/x-javascript': '.js',
    'application/x-mobipocket-ebook': '.mobi',
    'application/x-mpegurl': '.m3u8',
    'application/x-msi': '.msi',
    'application/x-research-info-systems': '.ris',
    'application/x-rss+xml': '.rss',
    'application/x-shockwave-flash': '.swf',
    'application/x-tar': '.tar.gz',  # Tarballs aren't official IANA types
    'application/xhtml+xml': '.xhtml',
    'application/xml': '.xml',
    'application/zip': '.zip',
    'audio/mpeg': '.mp3',
    'audio/mp3': '.mp3',
    'audio/x-m4a': '.m4a',
    'binary/octet-stream': '.exe',  # Should be application/octet-stream
    'font/woff': '.woff',
    'font/woff2': '.woff2',
    'font/ttf': '.ttf',
    'font/otf': '.otf',
    'html': '.html',  # Incorrect
    'image/gif': '.gif',
    'image/jpeg': '.jpeg',
    'image/jpg': '.jpg',
    'image/pjpeg': '.jpg',
    'image/png': '.png',
    'image/ico': '.ico',
    'image/svg+xml': '.svg',
    'image/tiff': '.tif',
    'image/vnd.djvu': '.djvu',
    'image/vnd.microsoft.icon': '.ico',
    'image/webp': '.webp',
    'image/x-bitmap': '.xbm',
    'image/x-icon': '.ico',
    'image/x-ms-bmp': '.bmp',
    'text/calendar': '.ics',
    'text/css': '.css',
    'text/csv': '.csv',
    'text/directory': '.vcf',
    'text/html': '.html',
    'text/html,application/xhtml+xml,application/xml': '.html',  # Misunderstood 'Accept' header?
    'text/javascript': '.js',
    'text/n3': '.n3',
    'text/plain': '.txt',
    'text/turtle': '.ttl',
    'text/vnd.wap.wml': '.xml',  # or .wml
    'text/vtt': '.vtt',
    'text/x-c': '.c',
    'text/x-wiki': '.txt',  # Doesn't seem to have a filetype of its own
    'text/xml charset=utf-8': '.xml',  # Shouldn't have encoding
    'text/xml': '.xml',  # Incorrect
    'video/3gpp': '.3gp',
    'video/3gp': '.3gp',
    'video/mp4': '.mp4',
    'video/webm': '.webm',
    'video/mpeg': '.mpeg',
    'video/x-flv': '.flv',
    'vnd.ms-fontobject': '.eot'  # Incorrect
}

# User-Agent Header Strings
HEADERS = {
    'spidy': {
        'User-Agent': 'spidy Web Crawler (Mozilla/5.0; bot; +https://github.com/rivermont/spidy/)',
        'Accept-Language': 'en_US, en-US, en',
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    },
    'Chrome': {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept-Language': 'en_US, en-US, en',
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    },
    'Firefox': {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Accept-Language': 'en_US, en-US, en',
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    },
    'IE': {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Accept-Language': 'en_US, en-US, en',
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    },
    'Edge': {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
        'Accept-Language': 'en_US, en-US, en',
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
}

KILL_LIST = [
    # Pages that are known to cause problems with the crawler in some way
    'bhphotovideo.com/c/search',
    'scores.usaultimate.org/',
    'w3.org',
    'web.archive.org/web/'
]

# Links to start crawling if the TODO list is empty
START = ['https://en.wikipedia.org/wiki/Main_Page']

# Counter variables
COUNTER = Counter(0)
NEW_ERROR_COUNT = Counter(0)
KNOWN_ERROR_COUNT = Counter(0)
HTTP_ERROR_COUNT = Counter(0)
NEW_MIME_COUNT = Counter(0)
EMPTY_COUNTER = Counter(0)

# Empty set for word scraping
WORDS = ThreadSafeSet()
words_mutex = threading.Lock()

# Valid responses when getting arguments from the user
yes = ['y', 'yes', 'Y', 'Yes', 'True', 'true']
no = ['n', 'no', 'N', 'No', 'False', 'false']

# Initialize variables as empty that will be needed in the global scope
HEADER = {}
SAVE_COUNT, MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS = 0, 0, 0, 0
MAX_NEW_MIMES = 0
RESPECT_ROBOTS, RESTRICT, DOMAIN = False, False, ''
USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE = False, False, False, False, False
SAVE_PAGES, SAVE_WORDS = False, False
TODO_FILE, DONE_FILE, WORD_FILE = '', '', ''
TODO, DONE = queue.Queue(), queue.Queue()
THREAD_COUNT = 1
THREAD_LIST = []
save_mutex = threading.Lock()
FINISHED = False
THREAD_RUNNING = True


def init():
    """
    Sets all of the variables for spidy,
    and as a result can be used for effectively resetting the crawler.
""" # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME global HEADER, PACKAGE_DIR, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE global RESPECT_ROBOTS, RESTRICT, DOMAIN global WORDS, TODO, DONE, THREAD_COUNT # Getting Arguments if not path.exists(path.join(PACKAGE_DIR, 'config')): write_log('INIT', 'No config folder available.') USE_CONFIG = False else: write_log('INIT', 'Should spidy load settings from an available config file? (y/n):') input_ = input() if not bool(input_): USE_CONFIG = False elif input_ in yes: USE_CONFIG = True elif input_ in no: USE_CONFIG = False else: handle_invalid_input() if USE_CONFIG: try: write_log('INIT', 'Config file name:', status='INPUT') input_ = input() if input_[-4:] == '.cfg': file_path = path.join(PACKAGE_DIR, 'config', input_) else: file_path = path.join(PACKAGE_DIR, 'config', '{0}.cfg'.format(input_)) write_log('INIT', 'Loading configuration settings from {0}'.format(file_path)) with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: for line in file.readlines(): exec(line, globals()) except FileNotFoundError: write_log('INPUT', 'Config file not found.', status='ERROR') raise FileNotFoundError() except Exception: write_log('INPUT', 'Please name a valid .cfg file.', status='ERROR') raise Exception() else: write_log('INIT', 'Please enter the following arguments. Leave blank to use the default values.') write_log('INIT', 'How many parallel threads should be used for crawler? (Default: 1):', status='INPUT') input_ = input() if not bool(input_): # Use default value THREAD_COUNT = 1 elif input_.isdigit(): THREAD_COUNT = int(input_) else: # Invalid input handle_invalid_input() write_log('INIT', 'Should spidy load from existing save files? (y/n) (Default: Yes):', status='INPUT') input_ = input() if not bool(input_): # Use default value OVERWRITE = False elif input_ in yes: # Yes OVERWRITE = False elif input_ in no: # No OVERWRITE = True else: # Invalid input handle_invalid_input() write_log('INIT', 'Should spidy raise NEW errors and stop crawling? (y/n) (Default: No):', status='INPUT') input_ = input() if not bool(input_): RAISE_ERRORS = False elif input_ in yes: RAISE_ERRORS = True elif input_ in no: RAISE_ERRORS = False else: handle_invalid_input() write_log('INIT', 'Should spidy save the pages it scrapes to the saved folder? (y/n) (Default: Yes):', status='INPUT') input_ = input() if not bool(input_): SAVE_PAGES = True elif input_ in yes: SAVE_PAGES = True elif input_ in no: SAVE_PAGES = False else: handle_invalid_input() if SAVE_PAGES: write_log('INIT', 'Should spidy zip saved documents when autosaving? (y/n) (Default: No):', status='INPUT') input_ = input() if not bool(input_): ZIP_FILES = False elif input_ in yes: ZIP_FILES = True elif input_ in no: ZIP_FILES = False else: handle_invalid_input() else: ZIP_FILES = False write_log('INIT', 'Should spidy download documents larger than 500 MB? (y/n) (Default: No):', status='INPUT') input_ = input() if not bool(input_): OVERRIDE_SIZE = False elif input_ in yes: OVERRIDE_SIZE = True elif input_ in no: OVERRIDE_SIZE = False else: handle_invalid_input() write_log('INIT', 'Should spidy scrape words and save them? 
        input_ = input()
        if not bool(input_):
            SAVE_WORDS = True
        elif input_ in yes:
            SAVE_WORDS = True
        elif input_ in no:
            SAVE_WORDS = False
        else:
            handle_invalid_input()

        write_log('INIT', 'Should spidy restrict crawling to a specific domain only? (y/n) (Default: No):',
                  status='INPUT')
        input_ = input()
        if not bool(input_):
            RESTRICT = False
        elif input_ in yes:
            RESTRICT = True
        elif input_ in no:
            RESTRICT = False
        else:
            handle_invalid_input()

        if RESTRICT:
            write_log('INIT', 'What domain should crawling be limited to? Can be subdomains, http/https, etc.',
                      status='INPUT')
            input_ = input()
            try:
                DOMAIN = input_
            except KeyError:
                handle_invalid_input('string')

        write_log('INIT', 'Should spidy respect sites\' robots.txt? (y/n) (Default: Yes):', status='INPUT')
        input_ = input()
        if not bool(input_):
            RESPECT_ROBOTS = True
        elif input_ in yes:
            RESPECT_ROBOTS = True
        elif input_ in no:
            RESPECT_ROBOTS = False
        else:
            handle_invalid_input()

        write_log('INIT', 'What HTTP browser headers should spidy imitate?', status='INPUT')
        write_log('INIT', 'Choices: spidy (default), Chrome, Firefox, IE, Edge, Custom:', status='INPUT')
        input_ = input()
        if not bool(input_):
            HEADER = HEADERS['spidy']
        elif input_.lower() == 'custom':
            write_log('INIT', 'Valid HTTP headers:', status='INPUT')
            HEADER = input()
        else:
            try:
                HEADER = HEADERS[input_]
            except KeyError:
                handle_invalid_input('browser name')

        write_log('INIT', 'Location of the TODO save file (Default: crawler_todo.txt):', status='INPUT')
        input_ = input()
        if not bool(input_):
            TODO_FILE = 'crawler_todo.txt'
        else:
            TODO_FILE = input_

        write_log('INIT', 'Location of the DONE save file (Default: crawler_done.txt):', status='INPUT')
        input_ = input()
        if not bool(input_):
            DONE_FILE = 'crawler_done.txt'
        else:
            DONE_FILE = input_

        if SAVE_WORDS:
            write_log('INIT', 'Location of the words save file (Default: crawler_words.txt):', status='INPUT')
            input_ = input()
            if not bool(input_):
                WORD_FILE = 'crawler_words.txt'
            else:
                WORD_FILE = input_
        else:
            WORD_FILE = 'None'

        write_log('INIT', 'After how many queried links should the crawler autosave? (Default: 100):',
                  status='INPUT')
        input_ = input()
        if not bool(input_):
            SAVE_COUNT = 100
        elif not input_.isdigit():
            handle_invalid_input('integer')
        else:
            SAVE_COUNT = int(input_)

        if not RAISE_ERRORS:
            write_log('INIT', 'After how many new errors should spidy stop? (Default: 5):', status='INPUT')
            input_ = input()
            if not bool(input_):
                MAX_NEW_ERRORS = 5
            elif not input_.isdigit():
                handle_invalid_input('integer')
            else:
                MAX_NEW_ERRORS = int(input_)
        else:
            MAX_NEW_ERRORS = 1

        write_log('INIT', 'After how many known errors should spidy stop? (Default: 20):', status='INPUT')
        input_ = input()
        if not bool(input_):
            MAX_KNOWN_ERRORS = 20
        elif not input_.isdigit():
            handle_invalid_input('integer')
        else:
            MAX_KNOWN_ERRORS = int(input_)

        write_log('INIT', 'After how many HTTP errors should spidy stop? (Default: 50):', status='INPUT')
        input_ = input()
        if not bool(input_):
            MAX_HTTP_ERRORS = 50
        elif not input_.isdigit():
            handle_invalid_input('integer')
        else:
            MAX_HTTP_ERRORS = int(input_)

        write_log('INIT', 'After encountering how many new MIME types should spidy stop? (Default: 10):',
                  status='INPUT')
        input_ = input()
        if not bool(input_):
            MAX_NEW_MIMES = 10
        elif not input_.isdigit():
            handle_invalid_input('integer')
        else:
            MAX_NEW_MIMES = int(input_)

        # Remove INPUT variable from memory
        del input_

    if OVERWRITE:
        write_log('INIT', 'Creating save files...')
        for start in START:
            TODO.put(start)
        DONE = queue.Queue()
    else:
        write_log('INIT', 'Loading save files...')
        # Import saved TODO file data
        try:
            with open(TODO_FILE, 'r', encoding='utf-8', errors='ignore') as f:
                contents = f.readlines()
        except FileNotFoundError:  # If no TODO file is present
            contents = []
        for line in contents:
            TODO.put(line.strip())
        # Import saved DONE file data
        try:
            with open(DONE_FILE, 'r', encoding='utf-8', errors='ignore') as f:
                contents = f.readlines()
        except FileNotFoundError:  # If no DONE file is present
            contents = []
        for line in contents:
            DONE.put(line.strip())
        del contents

    # If the TODO list is empty, add the default starting pages
    if TODO.qsize() == 0:
        for start in START:
            TODO.put(start)


def spawn_threads(robots_index):
    """
    Spawn the crawler threads.
    """
    try:
        write_log('INIT', 'Spawning {0} worker threads...'.format(THREAD_COUNT))
        for i in range(THREAD_COUNT):
            t = threading.Thread(target=crawl_worker, args=(i + 1, robots_index))
            write_log('INIT', 'Starting crawl...', worker=i + 1)
            t.daemon = True
            t.start()
            THREAD_LIST.append(t)
        for t in THREAD_LIST:
            t.join()
    except KeyboardInterrupt:
        handle_keyboard_interrupt()


def kill_threads():
    """
    Terminates all running threads.
    """
    global THREAD_RUNNING
    write_log('CRAWL', 'Stopping all threads...')
    THREAD_RUNNING = False


def done_crawling(keyboard_interrupt=False):
    # Make sure only one thread calls this
    with save_mutex:
        global FINISHED
        if FINISHED:
            return
        kill_threads()
        FINISHED = True
        if keyboard_interrupt:
            write_log('CRAWL', 'User performed a KeyboardInterrupt, stopping crawler.', status='ERROR')
            LOGGER.log(logging.INFO, 'User performed a KeyboardInterrupt, stopping crawler.')
        else:
            write_log('CRAWL', 'I think you\'ve managed to download the entire internet. '
                               'I guess you\'ll want to save your files...')
        save_files()
        LOG_FILE.close()


def handle_keyboard_interrupt():
    kill_threads()
    done_crawling(True)


def main():
    """
    The main function of spidy.
    """
""" # Declare global variables global VERSION, START_TIME, START_TIME_LONG global LOG_FILE, LOG_FILE_NAME, ERR_LOG_FILE_NAME global HEADER, WORKING_DIR, KILL_LIST global COUNTER, NEW_ERROR_COUNT, KNOWN_ERROR_COUNT, HTTP_ERROR_COUNT, NEW_MIME_COUNT global MAX_NEW_ERRORS, MAX_KNOWN_ERRORS, MAX_HTTP_ERRORS, MAX_NEW_MIMES global USE_CONFIG, OVERWRITE, RAISE_ERRORS, ZIP_FILES, OVERRIDE_SIZE, SAVE_WORDS, SAVE_PAGES, SAVE_COUNT global TODO_FILE, DONE_FILE, ERR_LOG_FILE, WORD_FILE global RESPECT_ROBOTS, RESTRICT, DOMAIN global WORDS, TODO, DONE try: init() except Exception: raise SystemExit(1) # Create required saved/ folder try: makedirs('saved') except OSError: pass # Assumes only OSError wil complain saved/ already exists # Create required files with open(WORD_FILE, 'w', encoding='utf-8', errors='ignore'): pass write_log('INIT', 'Successfully started spidy Web Crawler version {0}...'.format(VERSION)) LOGGER.log(logging.INFO, 'Successfully started crawler.') write_log('INIT', 'Using headers: {0}'.format(HEADER)) robots_index = RobotsIndex(RESPECT_ROBOTS, HEADER['User-Agent']) # Spawn threads here spawn_threads(robots_index) if __name__ == '__main__': main() else: write_log('INIT', 'Successfully imported spidy Web Crawler version {0}.'.format(VERSION)) write_log('INIT', 'Call `crawler.main()` to start crawling, or refer to DOCS.md to see use of specific functions.')