# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import os
import ssl
import sys
from collections import defaultdict

IS_PY2 = sys.version_info < (3, 0)
if IS_PY2:
    # Python 2
    from urllib2 import Request, urlopen, HTTPError, URLError
else:
    # Python 3 (HTTPError and URLError live in urllib.error)
    from urllib.request import Request, urlopen
    from urllib.error import HTTPError, URLError
    unicode = str

from .threadpool import ThreadPool
from .colorprint import colorprint, OKGREEN, FAIL

MAX_THREADS_DEFAULT = 7

# Used to allow downloading files even if the https certificate doesn't match
if hasattr(ssl, "_create_unverified_context"):
    ssl_unverified_context = ssl._create_unverified_context()
else:
    # Not available on older Pythons (e.g. 2.6)
    ssl_unverified_context = None


def sanitize_url(url):
    """ Make sure this url works with urllib2/urllib (ascii, http, etc) """
    if url and not url.startswith("http"):
        url = u"http://%s" % url
    url = url.encode("ascii", "ignore").decode("ascii")
    return url


def get_status_code(url):
    """ Perform a HEAD request and return the status code """
    try:
        request = Request(sanitize_url(url))
        request.add_header("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; "
                           "Windows NT 6.1; Trident/5.0)")
        request.get_method = lambda: 'HEAD'
        if ssl_unverified_context is not None:
            response = urlopen(request, context=ssl_unverified_context)
        else:
            # The `context` kwarg doesn't exist on older Pythons
            response = urlopen(request)
        return response.getcode()
    except HTTPError as e:
        return e.code
    except URLError as e:
        return e.reason
    except Exception as e:
        print(e, url)
        return None


def check_refs(refs, verbose=True, max_threads=MAX_THREADS_DEFAULT):
    """ Check if urls exist """
    codes = defaultdict(list)

    def check_url(ref):
        url = ref.ref
        status_code = str(get_status_code(url))
        codes[status_code].append(ref)
        if verbose:
            if status_code == "200":
                colorprint(OKGREEN, "%s - %s" % (status_code, url))
            else:
                colorprint(FAIL, "%s - %s" % (status_code, url))

    # Start a threadpool and add the check-url tasks
    try:
        pool = ThreadPool(max_threads)
        pool.map(check_url, refs)
        pool.wait_completion()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)

    # Print summary
    print("\nSummary of link checker:")
    if "200" in codes:
        colorprint(OKGREEN, "%s working" % len(codes["200"]))
    for c in sorted(codes):
        if c != "200":
            colorprint(FAIL, "%s broken (reason: %s)" % (len(codes[c]), c))
            for ref in codes[c]:
                o = u" - %s" % ref.ref
                if ref.page > 0:
                    o += " (page %s)" % ref.page
                print(o)
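

# Example usage of check_refs: it expects objects exposing `.ref` (the URL)
# and `.page` (the page number it appeared on), as accessed in check_url
# above. A minimal sketch, using a hypothetical namedtuple as a stand-in
# for the package's real reference type:
#
#     from collections import namedtuple
#     Reference = namedtuple("Reference", ["ref", "page"])
#     check_refs([Reference("https://example.com", 1),
#                 Reference("example.org/missing", 2)])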


def download_urls(urls, output_directory, verbose=True,
                  max_threads=MAX_THREADS_DEFAULT):
    """ Download urls to a target directory """
    assert isinstance(urls, (list, tuple, set)), \
        "Urls must be some kind of list"
    assert len(urls), "Need urls"
    assert output_directory, "Need an output_directory"

    def vprint(s):
        if verbose:
            print(s)

    def download_url(url):
        try:
            fn = url.split("/")[-1]
            fn_download = os.path.join(output_directory, fn)
            with open(fn_download, "wb") as f:
                request = Request(sanitize_url(url))
                request.add_header("User-Agent", "Mozilla/5.0 (compatible; "
                                   "MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                if ssl_unverified_context is not None:
                    response = urlopen(request,
                                       context=ssl_unverified_context)
                else:
                    # The `context` kwarg doesn't exist on older Pythons
                    response = urlopen(request)
                status_code = response.getcode()
                if status_code == 200:
                    # Reuse the open response rather than issuing a second
                    # request (which would also have skipped the SSL context)
                    f.write(response.read())
                    colorprint(OKGREEN, "Downloaded '%s' to '%s'" %
                               (url, fn_download))
                else:
                    colorprint(FAIL, "Error downloading '%s' (%s)" %
                               (url, status_code))
        except HTTPError as e:
            colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.code))
        except URLError as e:
            colorprint(FAIL, "Error downloading '%s' (%s)" % (url, e.reason))
        except Exception as e:
            colorprint(FAIL, "Error downloading '%s' (%s)" % (url, str(e)))

    # Create the output directory if it doesn't exist yet
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        vprint("Created directory '%s'" % output_directory)

    # Start a threadpool and add the download tasks
    try:
        pool = ThreadPool(max_threads)
        pool.map(download_url, urls)
        pool.wait_completion()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
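

# Example usage of download_urls (a sketch; filenames are derived from the
# last URL path segment, and the directory is created if missing):
#
#     download_urls(["https://example.com/paper.pdf",
#                    "https://example.org/data.csv"], "downloads")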