# encoding: utf-8 # # Copyright (c) 2014 Dean Jackson <deanishe@deanishe.net> # # MIT Licence. See http://opensource.org/licenses/MIT # # Created on 2014-02-15 # """Lightweight HTTP library with a requests-like interface.""" import codecs import json import mimetypes import os import random import re import socket import string import unicodedata import urllib import urllib2 import urlparse import zlib USER_AGENT = u'Alfred-Workflow/1.36 (+http://www.deanishe.net/alfred-workflow)' # Valid characters for multipart form data boundaries BOUNDARY_CHARS = string.digits + string.ascii_letters # HTTP response codes RESPONSES = { 100: 'Continue', 101: 'Switching Protocols', 200: 'OK', 201: 'Created', 202: 'Accepted', 203: 'Non-Authoritative Information', 204: 'No Content', 205: 'Reset Content', 206: 'Partial Content', 300: 'Multiple Choices', 301: 'Moved Permanently', 302: 'Found', 303: 'See Other', 304: 'Not Modified', 305: 'Use Proxy', 307: 'Temporary Redirect', 400: 'Bad Request', 401: 'Unauthorized', 402: 'Payment Required', 403: 'Forbidden', 404: 'Not Found', 405: 'Method Not Allowed', 406: 'Not Acceptable', 407: 'Proxy Authentication Required', 408: 'Request Timeout', 409: 'Conflict', 410: 'Gone', 411: 'Length Required', 412: 'Precondition Failed', 413: 'Request Entity Too Large', 414: 'Request-URI Too Long', 415: 'Unsupported Media Type', 416: 'Requested Range Not Satisfiable', 417: 'Expectation Failed', 500: 'Internal Server Error', 501: 'Not Implemented', 502: 'Bad Gateway', 503: 'Service Unavailable', 504: 'Gateway Timeout', 505: 'HTTP Version Not Supported' } def str_dict(dic): """Convert keys and values in ``dic`` into UTF-8-encoded :class:`str`. :param dic: Mapping of Unicode strings :type dic: dict :returns: Dictionary containing only UTF-8 strings :rtype: dict """ if isinstance(dic, CaseInsensitiveDictionary): dic2 = CaseInsensitiveDictionary() else: dic2 = {} for k, v in dic.items(): if isinstance(k, unicode): k = k.encode('utf-8') if isinstance(v, unicode): v = v.encode('utf-8') dic2[k] = v return dic2 class NoRedirectHandler(urllib2.HTTPRedirectHandler): """Prevent redirections.""" def redirect_request(self, *args): """Ignore redirect.""" return None # Adapted from https://gist.github.com/babakness/3901174 class CaseInsensitiveDictionary(dict): """Dictionary with caseless key search. Enables case insensitive searching while preserving case sensitivity when keys are listed, ie, via keys() or items() methods. Works by storing a lowercase version of the key as the new key and stores the original key-value pair as the key's value (values become dictionaries). """ def __init__(self, initval=None): """Create new case-insensitive dictionary.""" if isinstance(initval, dict): for key, value in initval.iteritems(): self.__setitem__(key, value) elif isinstance(initval, list): for (key, value) in initval: self.__setitem__(key, value) def __contains__(self, key): return dict.__contains__(self, key.lower()) def __getitem__(self, key): return dict.__getitem__(self, key.lower())['val'] def __setitem__(self, key, value): return dict.__setitem__(self, key.lower(), {'key': key, 'val': value}) def get(self, key, default=None): """Return value for case-insensitive key or default.""" try: v = dict.__getitem__(self, key.lower()) except KeyError: return default else: return v['val'] def update(self, other): """Update values from other ``dict``.""" for k, v in other.items(): self[k] = v def items(self): """Return ``(key, value)`` pairs.""" return [(v['key'], v['val']) for v in dict.itervalues(self)] def keys(self): """Return original keys.""" return [v['key'] for v in dict.itervalues(self)] def values(self): """Return all values.""" return [v['val'] for v in dict.itervalues(self)] def iteritems(self): """Iterate over ``(key, value)`` pairs.""" for v in dict.itervalues(self): yield v['key'], v['val'] def iterkeys(self): """Iterate over original keys.""" for v in dict.itervalues(self): yield v['key'] def itervalues(self): """Interate over values.""" for v in dict.itervalues(self): yield v['val'] class Response(object): """ Returned by :func:`request` / :func:`get` / :func:`post` functions. Simplified version of the ``Response`` object in the ``requests`` library. >>> r = request('http://www.google.com') >>> r.status_code 200 >>> r.encoding ISO-8859-1 >>> r.content # bytes <html> ... >>> r.text # unicode, decoded according to charset in HTTP header/meta tag u'<html> ...' >>> r.json() # content parsed as JSON """ def __init__(self, request, stream=False): """Call `request` with :mod:`urllib2` and process results. :param request: :class:`urllib2.Request` instance :param stream: Whether to stream response or retrieve it all at once :type stream: bool """ self.request = request self._stream = stream self.url = None self.raw = None self._encoding = None self.error = None self.status_code = None self.reason = None self.headers = CaseInsensitiveDictionary() self._content = None self._content_loaded = False self._gzipped = False # Execute query try: self.raw = urllib2.urlopen(request) except urllib2.HTTPError as err: self.error = err try: self.url = err.geturl() # sometimes (e.g. when authentication fails) # urllib can't get a URL from an HTTPError # This behaviour changes across Python versions, # so no test cover (it isn't important). except AttributeError: # pragma: no cover pass self.status_code = err.code else: self.status_code = self.raw.getcode() self.url = self.raw.geturl() self.reason = RESPONSES.get(self.status_code) # Parse additional info if request succeeded if not self.error: headers = self.raw.info() self.transfer_encoding = headers.getencoding() self.mimetype = headers.gettype() for key in headers.keys(): self.headers[key.lower()] = headers.get(key) # Is content gzipped? # Transfer-Encoding appears to not be used in the wild # (contrary to the HTTP standard), but no harm in testing # for it if 'gzip' in headers.get('content-encoding', '') or \ 'gzip' in headers.get('transfer-encoding', ''): self._gzipped = True @property def stream(self): """Whether response is streamed. Returns: bool: `True` if response is streamed. """ return self._stream @stream.setter def stream(self, value): if self._content_loaded: raise RuntimeError("`content` has already been read from " "this Response.") self._stream = value def json(self): """Decode response contents as JSON. :returns: object decoded from JSON :rtype: list, dict or unicode """ return json.loads(self.content, self.encoding or 'utf-8') @property def encoding(self): """Text encoding of document or ``None``. :returns: Text encoding if found. :rtype: str or ``None`` """ if not self._encoding: self._encoding = self._get_encoding() return self._encoding @property def content(self): """Raw content of response (i.e. bytes). :returns: Body of HTTP response :rtype: str """ if not self._content: # Decompress gzipped content if self._gzipped: decoder = zlib.decompressobj(16 + zlib.MAX_WBITS) self._content = decoder.decompress(self.raw.read()) else: self._content = self.raw.read() self._content_loaded = True return self._content @property def text(self): """Unicode-decoded content of response body. If no encoding can be determined from HTTP headers or the content itself, the encoded response body will be returned instead. :returns: Body of HTTP response :rtype: unicode or str """ if self.encoding: return unicodedata.normalize('NFC', unicode(self.content, self.encoding)) return self.content def iter_content(self, chunk_size=4096, decode_unicode=False): """Iterate over response data. .. versionadded:: 1.6 :param chunk_size: Number of bytes to read into memory :type chunk_size: int :param decode_unicode: Decode to Unicode using detected encoding :type decode_unicode: bool :returns: iterator """ if not self.stream: raise RuntimeError("You cannot call `iter_content` on a " "Response unless you passed `stream=True`" " to `get()`/`post()`/`request()`.") if self._content_loaded: raise RuntimeError( "`content` has already been read from this Response.") def decode_stream(iterator, r): dec = codecs.getincrementaldecoder(r.encoding)(errors='replace') for chunk in iterator: data = dec.decode(chunk) if data: yield data data = dec.decode(b'', final=True) if data: # pragma: no cover yield data def generate(): if self._gzipped: decoder = zlib.decompressobj(16 + zlib.MAX_WBITS) while True: chunk = self.raw.read(chunk_size) if not chunk: break if self._gzipped: chunk = decoder.decompress(chunk) yield chunk chunks = generate() if decode_unicode and self.encoding: chunks = decode_stream(chunks, self) return chunks def save_to_path(self, filepath): """Save retrieved data to file at ``filepath``. .. versionadded: 1.9.6 :param filepath: Path to save retrieved data. """ filepath = os.path.abspath(filepath) dirname = os.path.dirname(filepath) if not os.path.exists(dirname): os.makedirs(dirname) self.stream = True with open(filepath, 'wb') as fileobj: for data in self.iter_content(): fileobj.write(data) def raise_for_status(self): """Raise stored error if one occurred. error will be instance of :class:`urllib2.HTTPError` """ if self.error is not None: raise self.error return def _get_encoding(self): """Get encoding from HTTP headers or content. :returns: encoding or `None` :rtype: unicode or ``None`` """ headers = self.raw.info() encoding = None if headers.getparam('charset'): encoding = headers.getparam('charset') # HTTP Content-Type header for param in headers.getplist(): if param.startswith('charset='): encoding = param[8:] break if not self.stream: # Try sniffing response content # Encoding declared in document should override HTTP headers if self.mimetype == 'text/html': # sniff HTML headers m = re.search(r"""<meta.+charset=["']{0,1}(.+?)["'].*>""", self.content) if m: encoding = m.group(1) elif ((self.mimetype.startswith('application/') or self.mimetype.startswith('text/')) and 'xml' in self.mimetype): m = re.search(r"""<?xml.+encoding=["'](.+?)["'][^>]*\?>""", self.content) if m: encoding = m.group(1) # Format defaults if self.mimetype == 'application/json' and not encoding: # The default encoding for JSON encoding = 'utf-8' elif self.mimetype == 'application/xml' and not encoding: # The default for 'application/xml' encoding = 'utf-8' if encoding: encoding = encoding.lower() return encoding def request(method, url, params=None, data=None, headers=None, cookies=None, files=None, auth=None, timeout=60, allow_redirects=False, stream=False): """Initiate an HTTP(S) request. Returns :class:`Response` object. :param method: 'GET' or 'POST' :type method: unicode :param url: URL to open :type url: unicode :param params: mapping of URL parameters :type params: dict :param data: mapping of form data ``{'field_name': 'value'}`` or :class:`str` :type data: dict or str :param headers: HTTP headers :type headers: dict :param cookies: cookies to send to server :type cookies: dict :param files: files to upload (see below). :type files: dict :param auth: username, password :type auth: tuple :param timeout: connection timeout limit in seconds :type timeout: int :param allow_redirects: follow redirections :type allow_redirects: bool :param stream: Stream content instead of fetching it all at once. :type stream: bool :returns: Response object :rtype: :class:`Response` The ``files`` argument is a dictionary:: {'fieldname' : { 'filename': 'blah.txt', 'content': '<binary data>', 'mimetype': 'text/plain'} } * ``fieldname`` is the name of the field in the HTML form. * ``mimetype`` is optional. If not provided, :mod:`mimetypes` will be used to guess the mimetype, or ``application/octet-stream`` will be used. """ # TODO: cookies socket.setdefaulttimeout(timeout) # Default handlers openers = [] if not allow_redirects: openers.append(NoRedirectHandler()) if auth is not None: # Add authorisation handler username, password = auth password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() password_manager.add_password(None, url, username, password) auth_manager = urllib2.HTTPBasicAuthHandler(password_manager) openers.append(auth_manager) # Install our custom chain of openers opener = urllib2.build_opener(*openers) urllib2.install_opener(opener) if not headers: headers = CaseInsensitiveDictionary() else: headers = CaseInsensitiveDictionary(headers) if 'user-agent' not in headers: headers['user-agent'] = USER_AGENT # Accept gzip-encoded content encodings = [s.strip() for s in headers.get('accept-encoding', '').split(',')] if 'gzip' not in encodings: encodings.append('gzip') headers['accept-encoding'] = ', '.join(encodings) # Force POST by providing an empty data string if method == 'POST' and not data: data = '' if files: if not data: data = {} new_headers, data = encode_multipart_formdata(data, files) headers.update(new_headers) elif data and isinstance(data, dict): data = urllib.urlencode(str_dict(data)) # Make sure everything is encoded text headers = str_dict(headers) if isinstance(url, unicode): url = url.encode('utf-8') if params: # GET args (POST args are handled in encode_multipart_formdata) scheme, netloc, path, query, fragment = urlparse.urlsplit(url) if query: # Combine query string and `params` url_params = urlparse.parse_qs(query) # `params` take precedence over URL query string url_params.update(params) params = url_params query = urllib.urlencode(str_dict(params), doseq=True) url = urlparse.urlunsplit((scheme, netloc, path, query, fragment)) req = urllib2.Request(url, data, headers) return Response(req, stream) def get(url, params=None, headers=None, cookies=None, auth=None, timeout=60, allow_redirects=True, stream=False): """Initiate a GET request. Arguments as for :func:`request`. :returns: :class:`Response` instance """ return request('GET', url, params, headers=headers, cookies=cookies, auth=auth, timeout=timeout, allow_redirects=allow_redirects, stream=stream) def post(url, params=None, data=None, headers=None, cookies=None, files=None, auth=None, timeout=60, allow_redirects=False, stream=False): """Initiate a POST request. Arguments as for :func:`request`. :returns: :class:`Response` instance """ return request('POST', url, params, data, headers, cookies, files, auth, timeout, allow_redirects, stream) def encode_multipart_formdata(fields, files): """Encode form data (``fields``) and ``files`` for POST request. :param fields: mapping of ``{name : value}`` pairs for normal form fields. :type fields: dict :param files: dictionary of fieldnames/files elements for file data. See below for details. :type files: dict of :class:`dict` :returns: ``(headers, body)`` ``headers`` is a :class:`dict` of HTTP headers :rtype: 2-tuple ``(dict, str)`` The ``files`` argument is a dictionary:: {'fieldname' : { 'filename': 'blah.txt', 'content': '<binary data>', 'mimetype': 'text/plain'} } - ``fieldname`` is the name of the field in the HTML form. - ``mimetype`` is optional. If not provided, :mod:`mimetypes` will be used to guess the mimetype, or ``application/octet-stream`` will be used. """ def get_content_type(filename): """Return or guess mimetype of ``filename``. :param filename: filename of file :type filename: unicode/str :returns: mime-type, e.g. ``text/html`` :rtype: str """ return mimetypes.guess_type(filename)[0] or 'application/octet-stream' boundary = '-----' + ''.join(random.choice(BOUNDARY_CHARS) for i in range(30)) CRLF = '\r\n' output = [] # Normal form fields for (name, value) in fields.items(): if isinstance(name, unicode): name = name.encode('utf-8') if isinstance(value, unicode): value = value.encode('utf-8') output.append('--' + boundary) output.append('Content-Disposition: form-data; name="%s"' % name) output.append('') output.append(value) # Files to upload for name, d in files.items(): filename = d[u'filename'] content = d[u'content'] if u'mimetype' in d: mimetype = d[u'mimetype'] else: mimetype = get_content_type(filename) if isinstance(name, unicode): name = name.encode('utf-8') if isinstance(filename, unicode): filename = filename.encode('utf-8') if isinstance(mimetype, unicode): mimetype = mimetype.encode('utf-8') output.append('--' + boundary) output.append('Content-Disposition: form-data; ' 'name="%s"; filename="%s"' % (name, filename)) output.append('Content-Type: %s' % mimetype) output.append('') output.append(content) output.append('--' + boundary + '--') output.append('') body = CRLF.join(output) headers = { 'Content-Type': 'multipart/form-data; boundary=%s' % boundary, 'Content-Length': str(len(body)), } return (headers, body)