# Copyright (c) 2019 NASK. All rights reserved. import datetime import requests from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry from n6lib.datetime_helpers import parse_iso_datetime_to_utc class RequestPerformer(object): """ A simple yet flexible tool to download data with HTTP/HTTPS (able to deal with huge downloads in a memory-efficient manner). RequestPerformer is intended to be used in one of the following ways: (1) using its `fetch()` class method to perform a "one-shot" download (in this case the `stream` option is automatically forced to be `False`, so it cannot be passed in): content = RequestPerformer.fetch('GET', 'https://example.com') (2) using its *context manager* and *iterator* interfaces to download the content in a stream-like way (making it possible to deal with large amounts of data in a memory-efficient manner): with RequestPerformer('GET', 'https://example.com') as perf: for chunk in perf: # `chunk` is a byte string (str) my_temp_file.write(chunk) (3) using the *context manager* interface to initialize the download and use `requests.Response` and/or `requests.Session` objects directly (within its `with` block a `RequestPerformer` instance has the following public attributes: `request` set to `requests.Response` instance, and `session` set to `requests.Session` instance): with RequestPerformer('GET', 'https://example.com') as perf: print perf.response.content # download all data now! print perf.response.headers # get headers print perf.session.cookies # get cookies (as a CookieJar instance) # etc... The (2) and (3) ways can be combined, except that -- if the `stream` keyword argument is true (see below) -- you can **either** use the *iterator* interface [see above: (2)] **or** get directly the `content` attribute of the `response` attribute [see above: (3)], but should **not** do both. Apart from specifying the HTTP method and the URL, you can also pass in a lot of other arguments (see the constructor args/kwargs described below), especially various options related to the `requests` library's stuff (used under the hood). For example: with RequestPerformer('POST', 'https://www.example.com', data={'example': 'data'}, headers={'header-test: 'true'}, retries=3, allow_redirects=True) as perf: # (...) Required constructor args/kwargs: `method` (str or unicode): The HTTP method name (the `SUPPORTED_HTTP_METHODS` class attribute contains its allowed values). To be passed into `requests.Session.request()`. `url` (str or unicode): The URL to download from (must start with one of the strings the `SUPPORTED_URL_PREFIXES` class attribute contains). To be passed into `requests.Session.request()`. Optional constructor kwargs: `data` (str or file-like, or dict, or list of 2-tuples; default: `None`): The data to be optionally sent as the request body. To be passed into `requests.Session.request()`. `headers` (dict; default: `None`): The headers to be optionally attached to the request. To be passed into `requests.Session.request()`. `allow_redirects` (boolean; default: `False`): Set to `True` to allow redirects in case of GET/OPTIONS/POST/PUT/PATCH/DELETE/HEAD method. To be passed into `requests.Session.request()`. `timeout` (int or float, or 2-tuple of ints/floats; default: `(12.1, 25)`): The request timeout specification -- to be passed into `requests.Session.request()`. See also: http://docs.python-requests.org/en/stable/user/advanced/#timeouts. `retries` (int; default: `0`): Maximum number of attempts to retry on request failures (such as a connection problem or a server-side error whose HTTP status code suggests that a repeated request may succeed). If non-zero, it is, along with `backoff_factor`, passed into the `Retry` constructor (as the `total` argument -- see: `urllib3.util.retry.Retry`). If equal to 0 (the default), no retries are attempted (that is, an exception is raised on first failure). Note that the `retries` number does not include the first request, e.g., `retries=3` means that there can be up to 4 request attempts (the first one plus maximum of 3 retries). *Beware* that, depending on the server semantics, each attempt may change the server-side state/data (especially when `method` is POST or PATCH). `backoff_factor` (float; default: `0.1`): Backoff factor to apply between retry attempts. If the `retries` argument is non-zero, the `backoff_factor` argument is passed into the `Retry` constructor (see: `urllib3.util.retry.Retry`). For non-zero `backoff_factor`, delays between requests increase. According to `urllib3` docs, the delay before the first retry is `0s` (i.e., the first retry is immediate), and the formula for consecutive delays is: `backoff_factor * (2 ** (<retry number> - 1)) seconds` (but *never more* than `urllib3.util.retry.Retry.BACKOFF_MAX`, that is, 120 seconds). Therefore, for `backoff_factor=0.1` (default) the sequence of delays between consecutive requests is: 0s, 0.2s, 0.4s, 0.8s, 1.6s... `stream` (boolean; default: `True`): * If `True`: contents will be downloaded *in chunks* (see: the `chunk_size` argument described below) -- and *only* when: (a) the RequestPerformer instance is iterated over (downloading one chunk per iteration step), or (b) the `response.content` attribute (or a similar attribute, such as `response.text`...) is accessed the first time (then all contents will be downloaded immediately and set as the `response.content` attribute) -- however, note that if you do not need the *stream-like* (iteration-based) way of downloading the contents you probably just need to use the `stream=False` variant (see below); note: the (a) and (b) ways should *not* be combined. * If `False`: contents will be downloaded *all at once*, just when entering the `with` block; then the contents will be accessible as the `response.content` attribute (and also by iteration over the RequestPerformer instance). This argument will also be passed into `requests.Session.request()`. `chunk_size` (int; default: `2 ** 16`): When downloading data in the stream-like way -- the amount of data (in bytes) to be transferred per iteration. Relevant only if the `stream` argument is true, otherwise ignored. Note: it is not necessarily the length of each yielded data chunk (because of decoding...). `custom_session_attrs` (dict; default: `None`): Custom `requests.Session()` instance attribute values. (see: http://docs.python-requests.org/en/master/user/advanced/ and http://docs.python-requests.org/en/master/api/#sessionapi) Other arbitrary kwargs: To be passed into `requests.Session.request()`. Exceptions raised by the constructor and/or methods: * ValueError -- for: * unsupported `method`/`url` values; * `data` being a file-like object whose content's length cannot be determined without consuming it, *and*, at the same time, non-zero `retries` given. * Any exception that can be raised by the `requests` or `urrlib3` libraries. For more information about args/kwargs related to the `requests` or `urllib3` libraries mentioned above -- see: * http://docs.python-requests.org/en/stable/api/ * http://docs.python-requests.org/en/stable/user/advanced/ * https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html#urllib3.util.retry.Retry """ SUPPORTED_HTTP_METHODS = ('DELETE', 'GET', 'HEAD', 'OPTIONS', 'PATCH', 'POST', 'PUT', 'TRACE') SUPPORTED_URL_PREFIXES = ('http://', 'https://') _RETRY_KWARGS_BASE = { 'redirect': False, 'respect_retry_after_header': False, # ad `status_forcelist` -- see: https://tools.ietf.org/html/rfc7231#section-6.6 'status_forcelist': {500, 502, 503, 504}, 'method_whitelist': False, } # sequence of datetime.strptime() formats tried (consecutively) by # the get_dt_header() method to parse date+time response headers # (see also: https://tools.ietf.org/html/rfc7231#section-7.1.1.1) _HTTP_DATETIME_FORMATS = ( # *note:* these 3 formats require that the C locale is set # (`n6lib` already ensures that; see n6lib/__init__.py...) # the preferred format '%a, %d %b %Y %H:%M:%S GMT', # old RFC-850 format '%A, %d-%b-%y %H:%M:%S GMT', # old ANSI C's asctime() format # (note: using '%d' here is OK, because datetime.strptime() # is lenient about '%d' vs. numbers that are *not* zero-padded, # as well as about extra spaces *between* input string elements). '%a %b %d %H:%M:%S %Y', # (apart from trying the above 3 formats, the get_dt_header() # method tries -- as the last attempt -- ISO 8601 parsing) ) def __init__(self, method, url, data=None, headers=None, allow_redirects=False, timeout=(12.1, 25), retries=0, backoff_factor=0.1, stream=True, chunk_size=(2 ** 16), custom_session_attrs=None, **extra_request_kwargs): method = self._get_valid_method(method=method) url = self._get_valid_url(url=url) self.session = None # to be set in __enter__() self.response = None # to be set in __enter__() self._actual_iterator = None # to be set in __enter__() self._custom_session_attrs = custom_session_attrs self._request_kwargs = dict(extra_request_kwargs, method=method, url=url, data=data, headers=headers, timeout=timeout, allow_redirects=allow_redirects, stream=stream) self._retry_conf = self._get_retry_conf(retries=retries, backoff_factor=backoff_factor) self._chunk_size = chunk_size if stream else None @classmethod def fetch(cls, *args, **kwargs): """ Download all content at once. Args/kwargs: The same as constructor args/kwargs, except that `stream` is forbidden (internally it is forcibly set to `False`). Returns: The downloaded content (str). Raises: * ValueError: for unsupported `method`/`url` values. * Any exception that can be raised by the `requests` or `urrlib3` libraries. """ with RequestPerformer(*args, stream=False, **kwargs) as perf: return perf.response.content def __enter__(self): self.session = requests.Session() try: self._set_custom_session_attrs() self._set_up_retries() self.response = self.session.request(**self._request_kwargs) self.response.raise_for_status() self._actual_iterator = self.response.iter_content(chunk_size=self._chunk_size) except: self.session.close() raise return self def __exit__(self, exc_type, exc_val, exc_tb): self.session.close() def __iter__(self): return self def next(self): return next(self._actual_iterator) def get_dt_header(self, header_key): """ A helper method to retrieve a response header as a date+time. Args/kwargs: `header_key`: The name of the HTTP response header. Returns: `None` or UTC date+time as a `datetime.datetime` instance (a naive one, i.e., without explicit timezone information). Example usage: with RequestPerformer('GET', 'http://example.com/FOO') as perf: foo_last_modified = perf.get_dt_header('Last-Modified') if foo_last_modified is None: print 'I have no idea when FOO was modified.` else: print 'FOO modification date+time:', foo_last_modified.isoformat() """ raw_value = (self.response.headers.get(header_key) or '').strip() if raw_value: for dt_format in self._HTTP_DATETIME_FORMATS: try: return datetime.datetime.strptime(raw_value, dt_format) except ValueError: pass try: return parse_iso_datetime_to_utc(raw_value) except ValueError: pass return None def _get_valid_method(self, method): method = method.upper() if method not in self.SUPPORTED_HTTP_METHODS: raise ValueError('HTTP method {!r} not supported'.format(method)) return method def _get_valid_url(self, url): url = self._get_url_with_lowercased_proto(url) if not url.startswith(self.SUPPORTED_URL_PREFIXES): raise ValueError('URL prefix {!r} not supported'.format(url)) return url @staticmethod def _get_url_with_lowercased_proto(url): url_parts = url.split(':', 1) url_parts[0] = url_parts[0].lower() url = ':'.join(url_parts) return url def _get_retry_conf(self, retries, backoff_factor): if retries: retry_conf = Retry(backoff_factor=backoff_factor, total=retries, **self._RETRY_KWARGS_BASE) return retry_conf return None def _set_custom_session_attrs(self): if self._custom_session_attrs: for name, value in self._custom_session_attrs.items(): setattr(self.session, name, value) def _set_up_retries(self): if self._retry_conf is not None: for http_prefix in self.SUPPORTED_URL_PREFIXES: self.session.mount(http_prefix, _HTTPAdapterForRetries(max_retries=self._retry_conf)) class _HTTPAdapterForRetries(HTTPAdapter): def send(self, request, *args, **kwargs): content_length_is_unknown = (request.body is not None and 'Content-Length' not in request.headers) if content_length_is_unknown: # it seems that requests's HTTPAdapter does not perform # retries in such a case, even though they were requested # (see the source code of HTTPAdapter.send() in conjunction # with urllib3.connectionpool.HTTPConnectionPool.urlopen() # and urllib3.util.retry.Retry.increment()...) -- so here # we raise an exception to prevent such a silent omission # [we analyzed this for requests==2.21.0 and urllib3==1.24.1] raise ValueError('non-zero `retries` has been specified and, ' 'at the same time, Content-Length of the request ' 'could not be determined (suggested solutions: ' 'specify `data` whose length is discoverable, ' 'or specify `retries=0`)') return super(_HTTPAdapterForRetries, self).send(request, *args, **kwargs)