# Python source code of homura

# -*- coding: utf-8 -*-
from __future__ import print_function, absolute_import
import datetime
import os
import six
import sys
import time
import pycurl
import shutil
from six.moves.urllib.parse import urlparse, unquote as _unquote
from humanize import naturalsize

# certifi is an optional dependency: when it cannot be imported the name is
# bound to None so the rest of the module can still reference it.
try:
    import certifi
except ImportError:
    certifi = None

# True when running under Python 3.
PY3 = sys.version_info[0] == 3
# Stream that progress output is written to.
STREAM = sys.stderr
# File name used when a URL's path has no base name.
DEFAULT_RESOURCE = 'index.html'

# Package version; also used to build the default user agent string.
__version__ = '0.1.5'


def eval_path(path):
    """Return ``path`` as an absolute path with a leading ``~`` expanded."""
    expanded = os.path.expanduser(path)
    return os.path.abspath(expanded)


def utf8_encode(s):
    """Return ``s`` UTF-8 encoded if it is text; otherwise return it as is."""
    if isinstance(s, six.text_type):
        return s.encode('utf-8')
    return s


def utf8_decode(s):
    """Return ``s`` UTF-8 decoded if it is bytes; otherwise return it as is."""
    if isinstance(s, six.binary_type):
        return s.decode('utf-8')
    return s


def unquote(s):
    """Percent-decode ``s`` with the urllib ``unquote`` function.

    On Python 2 a text (unicode) value is first encoded to UTF-8 bytes; on
    Python 3 the value is handed over unchanged.
    """
    needs_encoding = not PY3 and isinstance(s, six.text_type)
    value = s.encode('utf-8') if needs_encoding else s
    return _unquote(value)


def dict_to_list(d):
    """Render every key/value pair of ``d`` as a ``'key: value'`` string."""
    lines = []
    for key, value in d.items():
        lines.append('%s: %s' % (key, value))
    return lines


def is_temp_path(path):
    """Tell whether ``path`` leaves the final file name undecided.

    That is the case when no path was given at all, or when the given path
    (after ``eval_path``) is an existing directory.
    """
    if path is None:
        return True
    return os.path.isdir(eval_path(path))


def get_resource_name(url):
    """Return the unquoted base name of the path component of ``url``.

    ``DEFAULT_RESOURCE`` is used when the URL path has no base name.
    """
    # decode to unicode first so PY3's urlparse won't break
    parsed = urlparse(utf8_decode(url))
    name = os.path.basename(parsed.path) or DEFAULT_RESOURCE
    return utf8_decode(unquote(name))


class Homura(object):
    """Download a single URL to a local file through pycurl.

    One instance wraps one ``pycurl.Curl`` handle. ``start`` is the entry
    point; it can resume a partially downloaded file, retry after an
    interrupted transfer and rename the file after redirects.
    """
    # Layout of one progress line; filled from the dict built in `progress`.
    progress_template = \
        '%(percent)6d%% %(downloaded)12s %(speed)15s %(eta)18s ETA' + ' ' * 4
    # ETAs at or above this many seconds are displayed as 'n/a'.
    eta_limit = 2592000  # 30 days

    def __init__(self, url, path=None, headers=None, session=None,
                 show_progress=True, resume=True, auto_retry=True,
                 max_rst_retries=5, pass_through_opts=None, cainfo=None,
                 user_agent=None, auth=None):
        """
        :param str url: URL of the file to be downloaded
        :param str path: local path for the downloaded file; if None, it will
            be the URL base name; if it is an existing directory, the URL
            base name is appended to it
        :param dict headers: extra headers
        :param session: session used to download (if you want to work with
            requests library)
        :type session: `class:requests.Session`
        :param bool show_progress: whether to show download progress
        :param bool resume: whether to resume download (by
            filename)
        :param bool auto_retry: whether to retry automatically upon closed
            transfer until the file's download is finished
        :param int max_rst_retries: number of retries upon connection reset by
            peer (effective only when `auto_retry` is True)
        :param dict pass_through_opts: a dictionary of options passed to cURL
        :param str cainfo: optional path to a PEM file containing the CA
            certificate
        :param str user_agent: set a custom user agent string
        :param tuple auth: a tuple of username and password for authentication
        """
        self.url = url  # url is in unicode
        self.path = self._get_path(path, url)
        self.headers = headers
        self.session = session
        self.show_progress = show_progress
        self.resume = resume
        self.auto_retry = auto_retry
        self.max_rst_retries = max_rst_retries
        self.cainfo = cainfo
        self.start_time = None
        self.content_length = None
        self.downloaded = 0
        # An explicit `auth` argument takes precedence over the session's.
        self.auth = None
        if session:
            self.auth = session.auth
        if auth:
            self.auth = auth
        self._path = path  # Save given path
        self._pycurl = pycurl.Curl()
        self._cookie_header = self._get_cookie_header()
        self._last_time = 0.0
        self._rst_retries = 0
        self._pass_through_opts = pass_through_opts
        self._user_agent = user_agent or 'homura/' + __version__

    def _get_cookie_header(self):
        """Build the ``Cookie`` header value from the session's cookies.

        :return: ``'name=value'`` pairs joined by ``'; '``, or None when
            there is no session or the session holds no cookies
        """
        if self.session is None:
            return None
        cookies = dict(self.session.cookies)
        pairs = ['%s=%s' % (k, v) for k, v in cookies.items()]
        return '; '.join(pairs) or None

    def _get_path(self, path, url):
        """Work out the local file path for ``url``.

        :param path: path given by the user, or None
        :param url: URL whose resource name is used when ``path`` is None or
            an existing directory
        :return: the resource name, the evaluated ``path``, or the resource
            name joined onto the directory ``path``
        """
        if path is None:
            res = get_resource_name(url)
        else:
            # decode path to unicode so that os.path.join won't break
            res = eval_path(utf8_decode(path))
            if os.path.isdir(res):
                resource = get_resource_name(url)
                res = os.path.join(res, resource)
        return res

    def _get_pycurl_headers(self):
        """Return the request headers as a list of ``'Name: value'`` strings.

        :return: the header list, or None when there are no headers at all
        """
        # Work on a copy so that adding the Cookie entry never modifies the
        # dictionary the caller handed to the constructor.
        headers = dict(self.headers or {})
        if self._cookie_header is not None:
            headers['Cookie'] = self._cookie_header
        return dict_to_list(headers) or None

    def _fill_in_cainfo(self):
        """Fill in the path of the PEM file containing the CA certificate.

        The priority is: 1. user provided path, 2. path to the cacert.pem
        bundle provided by certifi (if installed), 3. let pycurl use the
        system path where libcurl's cacert bundle is assumed to be stored,
        as established at libcurl build time.
        """
        if self.cainfo:
            cainfo = self.cainfo
        else:
            try:
                cainfo = certifi.where()
            except AttributeError:
                # certifi is None when the package is not installed
                cainfo = None
        if cainfo:
            self._pycurl.setopt(pycurl.CAINFO, cainfo)

    def curl(self):
        """Sending a single cURL request to download"""
        c = self._pycurl
        # Resume download
        if os.path.exists(self.path) and self.resume:
            # Append to the existing file, starting at its current size.
            mode = 'ab'
            self.downloaded = os.path.getsize(self.path)
            c.setopt(pycurl.RESUME_FROM, self.downloaded)
        else:
            mode = 'wb'
        with open(self.path, mode) as f:
            c.setopt(c.URL, utf8_encode(self.url))
            if self.auth:
                c.setopt(c.USERPWD, '%s:%s' % self.auth)
            c.setopt(c.USERAGENT, self._user_agent)
            c.setopt(c.WRITEDATA, f)
            h = self._get_pycurl_headers()
            if h is not None:
                c.setopt(pycurl.HTTPHEADER, h)
            c.setopt(c.NOPROGRESS, 0)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(c.PROGRESSFUNCTION, self.progress)
            self._fill_in_cainfo()
            # User supplied options are applied last so they can override
            # anything set above.
            if self._pass_through_opts:
                for key, value in self._pass_through_opts.items():
                    c.setopt(key, value)
            c.perform()

    def start(self):
        """
        Start downloading, handling auto retry, download resume and path
        moving

        :raises pycurl.error: for any transfer error that is not retried
        """
        if not self.auto_retry:
            # Single attempt: errors propagate to the caller and neither the
            # path move nor the trailing newline below takes place.
            self.curl()
            return
        while not self.is_finished:
            try:
                self.curl()
            except pycurl.error as e:
                # transfer closed with n bytes remaining to read
                if e.args[0] == pycurl.E_PARTIAL_FILE:
                    pass
                # HTTP server doesn't seem to support byte ranges.
                # Cannot resume.
                elif e.args[0] == pycurl.E_HTTP_RANGE_ERROR:
                    break
                # Recv failure: Connection reset by peer
                elif e.args[0] == pycurl.E_RECV_ERROR:
                    if self._rst_retries >= self.max_rst_retries:
                        raise
                    self._rst_retries += 1
                else:
                    raise
        self._move_path()
        self._done()

    def progress(self, download_t, download_d, upload_t, upload_d):
        """Progress callback registered with pycurl in `curl`.

        Records the expected total size and, when `show_progress` is set,
        writes one formatted progress line to ``STREAM``.

        :param download_t: total number of bytes expected in this transfer
        :param download_d: number of bytes downloaded so far in this transfer
        :param upload_t: unused
        :param upload_d: unused
        """
        # Bytes already on disk before this transfer count towards the total.
        self.content_length = self.downloaded + int(download_t)
        if int(download_t) == 0:
            return
        if not self.show_progress:
            return
        if self.start_time is None:
            self.start_time = time.time()
        # NOTE(review): the +1 presumably keeps the divisor away from zero
        # right after the transfer starts -- confirm.
        duration = time.time() - self.start_time + 1
        speed = download_d / duration
        speed_s = naturalsize(speed, binary=True)
        speed_s += '/s'
        if speed == 0.0:
            eta = self.eta_limit
        else:
            eta = int((download_t - download_d) / speed)
        if eta < self.eta_limit:
            eta_s = str(datetime.timedelta(seconds=eta))
        else:
            eta_s = 'n/a'
        downloaded = self.downloaded + download_d
        downloaded_s = naturalsize(downloaded, binary=True)
        percent = int(downloaded / self.content_length * 100)
        params = {
            'downloaded': downloaded_s,
            'percent': percent,
            'speed': speed_s,
            'eta': eta_s,
        }
        if STREAM.isatty():
            # On a terminal, redraw the same line in place.
            p = (self.progress_template + '\r') % params
        else:
            # Otherwise emit separate lines, at most one every 0.5 seconds.
            current_time = time.time()
            if self._last_time == 0.0:
                self._last_time = current_time
            else:
                interval = current_time - self._last_time
                if interval < 0.5:
                    return
                self._last_time = current_time
            p = (self.progress_template + '\n') % params
        STREAM.write(p)
        STREAM.flush()

    @property
    def is_finished(self):
        """Whether the file on disk has the expected total size.

        :return: True when the file exists and its size equals
            `content_length`; False otherwise (including when the file does
            not exist yet)
        """
        if not os.path.exists(self.path):
            return False
        return self.content_length == os.path.getsize(self.path)

    def _done(self):
        """Terminate the progress output with a newline."""
        STREAM.write('\n')
        STREAM.flush()

    def _move_path(self):
        """
        Move the downloaded file to the authentic path (identified by
        effective URL)
        """
        # Only rename when the user did not pin an explicit file name.
        if is_temp_path(self._path) and self._pycurl is not None:
            eurl = self._pycurl.getinfo(pycurl.EFFECTIVE_URL)
            er = get_resource_name(eurl)
            r = get_resource_name(self.url)
            if er != r and os.path.exists(self.path):
                new_path = self._get_path(self._path, eurl)
                shutil.move(self.path, new_path)
                self.path = new_path


def download(url, path=None, headers=None, session=None, show_progress=True,
             resume=True, auto_retry=True, max_rst_retries=5,
             pass_through_opts=None, cainfo=None, user_agent=None, auth=None):
    """Build a `Homura` downloader from the arguments and start it."""
    Homura(
        url,
        path=path,
        headers=headers,
        session=session,
        show_progress=show_progress,
        resume=resume,
        auto_retry=auto_retry,
        max_rst_retries=max_rst_retries,
        pass_through_opts=pass_through_opts,
        cainfo=cainfo,
        user_agent=user_agent,
        auth=auth,
    ).start()