#!/usr/bin/env python #-*- coding: utf8 -*- # Author: lilydjwg (https://github.com/lilydjwg) # Source: https://github.com/lilydjwg/winterpy/blob/master/pylib/mytornado/fetchtitle.py # 由 cold (https://github.com/coldnight) 添加 Python2 支持 # vim:fileencoding=utf-8 import re import socket try: from urllib.parse import urlsplit, urljoin py3 = True except ImportError: from urlparse import urlsplit, urljoin # py2 py3 = False from functools import partial from collections import namedtuple import struct import json import logging import encodings.idna try: # Python 3.3 from html.entities import html5 as _entities def _extract_entity_name(m): return m.group()[1:] except ImportError: try: from html.entities import entitydefs as _entities except ImportError: from htmlentitydefs import entitydefs as _entities # py2 def _extract_entity_name(m): return m.group()[1:-1] import tornado.ioloop import tornado.iostream # try to import C parser then fallback in pure python parser. try: from http_parser.parser import HttpParser except ImportError: try: from http_parser.pyparser import HttpParser except ImportError: from HTMLParser import HttpParser # py2 UserAgent = 'FetchTitle/1.2 (wh_linux@126.com)' class SingletonFactory: def __init__(self, name): self.name = name def __repr__(self): return '<%s>' % self.name MediaType = namedtuple('MediaType', 'type size dimension') defaultMediaType = MediaType('application/octet-stream', None, None) ConnectionClosed = SingletonFactory('ConnectionClosed') TooManyRedirection = SingletonFactory('TooManyRedirection') Timeout = SingletonFactory('Timeout') logger = logging.getLogger(__name__) def _sharp2uni(code): '''&#...; ==> unicode''' s = code[1:].rstrip(';') if s.startswith('x'): return chr(int('0'+s, 16)) else: return chr(int(s)) def _mapEntity(m): name = _extract_entity_name(m) if name.startswith('#'): return _sharp2uni(name) try: return _entities[name] except KeyError: return '&' + name def replaceEntities(s): return re.sub(r'&[^;]+;', _mapEntity, s) class ContentFinder: buf = b'' def __init__(self, mediatype): self._mt = mediatype @classmethod def match_type(cls, mediatype): ctype = mediatype.type.split(';', 1)[0] if hasattr(cls, '_mime') and cls._mime == ctype: return cls(mediatype) if hasattr(cls, '_match_type') and cls._match_type(ctype): return cls(mediatype) return False class TitleFinder(ContentFinder): found = False title_begin = re.compile(b'<title[^>]*>', re.IGNORECASE) title_end = re.compile(b'</title\s*>', re.IGNORECASE) pos = 0 default_charset = 'UTF-8' meta_charset = re.compile(br'<meta\s+http-equiv="?content-type"?\s+content="?[^;]+;\s*charset=([^">]+)"?\s*/?>|<meta\s+charset="?([^">/"]+)"?\s*/?>', re.IGNORECASE) charset = None @staticmethod def _match_type(ctype): return ctype.find('html') != -1 def __init__(self, mediatype): ctype = mediatype.type pos = ctype.find('charset=') if pos > 0: self.charset = ctype[pos+8:] if self.charset.lower() == 'gb2312': # Windows misleadingly uses gb2312 when it's gbk or gb18030 self.charset = 'gb18030' def __call__(self, data): if data is not None: self.buf += data self.pos += len(data) if len(self.buf) < 100: return buf = self.buf if self.charset is None: m = self.meta_charset.search(buf) if m: self.charset = (m.group(1) or m.group(2)).decode('latin1') if not self.found: m = self.title_begin.search(buf) if m: buf = self.buf = buf[m.end():] self.found = True if self.found: m = self.title_end.search(buf) if m: raw_title = buf[:m.start()].strip() logger.debug('title found at %d', self.pos - len(buf) + m.start()) elif len(buf) > 200: # when title goes too long raw_title = buf[:200] + b'...' logger.warn('title too long, starting at %d', self.pos - len(buf)) else: raw_title = False if raw_title: return self.decode_title(raw_title) if not self.found: self.buf = buf[-100:] def decode_title(self, raw_title): try: title = replaceEntities(raw_title.decode(self.get_charset(), errors='replace')) return title except (UnicodeDecodeError, LookupError): return raw_title def get_charset(self): return self.charset or self.default_charset class PNGFinder(ContentFinder): _mime = 'image/png' def __call__(self, data): if data is None: return self._mt self.buf += data if len(self.buf) < 24: # can't decide yet return if self.buf[:16] != b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR': logging.warn('Bad PNG signature and header: %r', self.buf[:16]) return self._mt._replace(dimension='Bad PNG') else: s = struct.unpack('!II', self.buf[16:24]) return self._mt._replace(dimension=s) class JPEGFinder(ContentFinder): _mime = 'image/jpeg' isfirst = True def __call__(self, data): if data is None: return self._mt # http://www.64lines.com/jpeg-width-height if data: self.buf += data if self.isfirst is True: # finding header if len(self.buf) < 5: return if self.buf[:3] != b'\xff\xd8\xff': logging.warn('Bad JPEG signature: %r', self.buf[:3]) return self._mt._replace(dimension='Bad JPEG') else: if not py3: self.blocklen = ord(self.buf[4]) * 256 + ord(self.buf[5]) + 2 else: self.blocklen = self.buf[4] * 256 + self.buf[5] + 2 self.buf = self.buf[2:] self.isfirst = False if self.isfirst is False: # receiving a block. 4 is for next block size if len(self.buf) < self.blocklen + 4: return buf = self.buf if ord(buf[0]) != 0xff: logging.warn('Bad JPEG: %r', self.buf[:self.blocklen]) return self._mt._replace(dimension='Bad JPEG') if (py3 and buf[1] == 0xc0 or buf[1] == 0xc2) or\ (ord(buf[1]) == 0xc0 or (buf[1]) == 0xc2): if not py3: s = ord(buf[7]) * 256 + ord(buf[8]), ord(buf[5]) * 256 + ord(buf[6]) else: s = buf[7] * 256 + buf[8], buf[5] * 256 + buf[6] return self._mt._replace(dimension=s) else: # not Start Of Frame, retry with next block self.buf = buf = buf[self.blocklen:] if not py3: self.blocklen = ord(buf[2]) * 256 + ord(buf[3]) + 2 else: self.blocklen = buf[2] * 256 + buf[3] + 2 return self(b'') class GIFFinder(ContentFinder): _mime = 'image/gif' def __call__(self, data): if data is None: return self._mt self.buf += data if len(self.buf) < 10: # can't decide yet return if self.buf[:3] != b'GIF': logging.warn('Bad GIF signature: %r', self.buf[:3]) return self._mt._replace(dimension='Bad GIF') else: s = struct.unpack('<HH', self.buf[6:10]) return self._mt._replace(dimension=s) class TitleFetcher: status_code = 0 followed_times = 0 # 301, 302 finder = None addr = None stream = None max_follows = 10 timeout = 15 _finished = False _cookie = None _connected = False _redirected_stream = None _content_finders = (TitleFinder, PNGFinder, JPEGFinder, GIFFinder) _url_finders = () def __init__(self, url, callback, timeout=None, max_follows=None, io_loop=None, content_finders=None, url_finders=None, referrer=None, run_at_init=True, ): ''' url: the (full) url to fetch callback: called with title or MediaType or an instance of SingletonFactory timeout: total time including redirection before giving up max_follows: max redirections may raise: <UnicodeError: label empty or too long> in host preparation ''' self._callback = callback self.referrer = referrer if max_follows is not None: self.max_follows = max_follows if timeout is not None: self.timeout = timeout if hasattr(tornado.ioloop, 'current'): default_io_loop = tornado.ioloop.IOLoop.current else: default_io_loop = tornado.ioloop.IOLoop.instance self.io_loop = io_loop or default_io_loop() if content_finders is not None: self._content_finders = content_finders if url_finders is not None: self._url_finders = url_finders self.origurl = url self.url_visited = [] if run_at_init: self.run() def run(self): if self.url_visited: raise Exception("can't run again") else: self.start_time = self.io_loop.time() self._timeout = self.io_loop.add_timeout( self.timeout + self.start_time, self.on_timeout, ) try: self.new_url(self.origurl) finally: self.io_loop.remove_timeout(self._timeout) def on_timeout(self): self.run_callback(Timeout) def parse_url(self, url): '''parse `url`, set self.host and return address and stream class''' self.url = u = urlsplit(url) self.host = u.netloc if u.scheme == 'http': addr = u.hostname, u.port or 80 stream = tornado.iostream.IOStream elif u.scheme == 'https': addr = u.hostname, u.port or 443 stream = tornado.iostream.SSLIOStream else: raise ValueError('bad url: %r' % url) return addr, stream def new_connection(self, addr, StreamClass): '''set self.addr, self.stream and connect to host''' s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.addr = addr self.stream = StreamClass(s) logger.debug('%s: connecting to %s...', self.origurl, addr) self.stream.set_close_callback(self.before_connected) self.stream.connect(addr, self.send_request) def new_url(self, url): self.url_visited.append(url) self.fullurl = url for finder in self._url_finders: f = finder.match_url(url, self) if f: self.finder = f f() return addr, StreamClass = self.parse_url(url) if addr != self.addr: if self.stream: self.stream.close() self.new_connection(addr, StreamClass) else: logger.debug('%s: try to reuse existing connection to %s', self.origurl, self.addr) try: self.send_request(nocallback=True) except tornado.iostream.StreamClosedError: logger.debug('%s: server at %s doesn\'t like keep-alive, will reconnect.', self.origurl, self.addr) # The close callback should have already run self.stream.close() self.new_connection(addr, StreamClass) def run_callback(self, arg): self.io_loop.remove_timeout(self._timeout) self._finished = True if self.stream: self.stream.close() self._callback(arg, self) def send_request(self, nocallback=False): self._connected = True req = ['GET %s HTTP/1.1', 'Host: %s', # t.co will return 200 and use js/meta to redirect using the following :-( # 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0', 'User-Agent: %s' % UserAgent, 'Accept: text/html,application/xhtml+xml;q=0.9,*/*;q=0.7', 'Accept-Language: zh-cn,zh;q=0.7,en;q=0.3', 'Accept-Charset: utf-8,gb18030;q=0.7,*;q=0.7', 'Accept-Encoding: gzip, deflate', 'Connection: keep-alive', ] if self.referrer is not None: req.append('Referer: ' + self.referrer.replace('%', '%%')) path = self.url.path or '/' if self.url.query: path += '?' + self.url.query req = '\r\n'.join(req) % ( path, self._prepare_host(self.host), ) if self._cookie: req += '\r\n' + self._cookie req += '\r\n\r\n' self.stream.write(req.encode()) self.headers_done = False self.parser = HttpParser(decompress=True) if not nocallback: self.stream.read_until_close( # self.addr and self.stream may have been changed when close callback is run partial(self.on_data, close=True, addr=self.addr, stream=self.stream), streaming_callback=self.on_data, ) def _prepare_host(self, host): if not py3: host = host.decode("utf-8") host = encodings.idna.nameprep(host) return b'.'.join(encodings.idna.ToASCII(x) for x in host.split('.')).decode('ascii') def on_data(self, data, close=False, addr=None, stream=None): if close: logger.debug('%s: connection to %s closed.', self.origurl, addr) if (close and stream and self._redirected_stream is stream) or self._finished: # The connection is closing, and we are being redirected or we're done. self._redirected_stream = None return recved = len(data) logger.debug('%s: received data: %d bytes', self.origurl, recved) p = self.parser nparsed = p.execute(data, recved) if close: # feed EOF p.execute(b'', 0) if not self.headers_done and p.is_headers_complete(): if not self.on_headers_done(): return if p.is_partial_body(): chunk = p.recv_body() if self.finder is None: # redirected but has body received return t = self.feed_finder(chunk) if t is not None: self.run_callback(t) return if p.is_message_complete(): if self.finder is None: # redirected but has body received return t = self.feed_finder(None) # if title not found, t is None self.run_callback(t) elif close: self.run_callback(self.stream.error or ConnectionClosed) def before_connected(self): '''check if something wrong before connected''' if not self._connected and not self._finished: self.run_callback(self.stream.error) def process_cookie(self): setcookie = self.headers.get('Set-Cookie', None) if not setcookie: return cookies = [c.rsplit(None, 1)[-1] for c in setcookie.split('; expires')[:-1]] self._cookie = 'Cookie: ' + '; '.join(cookies) def on_headers_done(self): '''returns True if should proceed, None if should stop for current chunk''' self.headers_done = True self.headers = self.parser.get_headers() self.status_code = self.parser.get_status_code() if self.status_code in (301, 302): self.process_cookie() # or we may be redirecting to a loop logger.debug('%s: redirect to %s', self.origurl, self.headers['Location']) self.followed_times += 1 if self.followed_times > self.max_follows: self.run_callback(TooManyRedirection) else: newurl = urljoin(self.fullurl, self.headers['Location']) self._redirected_stream = self.stream self.new_url(newurl) return try: l = int(self.headers.get('Content-Length', None)) except (ValueError, TypeError): l = None ctype = self.headers.get('Content-Type', 'text/html') mt = defaultMediaType._replace(type=ctype, size=l) for finder in self._content_finders: f = finder.match_type(mt) if f: self.finder = f break else: self.run_callback(mt) return return True def feed_finder(self, chunk): '''feed data to TitleFinder, return the title if found''' t = self.finder(chunk) if t is not None: return t class URLFinder: def __init__(self, url, fetcher, match=None): self.fullurl = url self.match = match self.fetcher = fetcher @classmethod def match_url(cls, url, fetcher): if hasattr(cls, '_url_pat'): m = cls._url_pat.match(url) if m is not None: return cls(url, fetcher, m) if hasattr(cls, '_match_url') and cls._match_url(url, fetcher): return cls(url, fetcher) def done(self, info): self.fetcher.run_callback(info) class GithubFinder(URLFinder): _url_pat = re.compile(r'https://github\.com/(?!blog/)(?P<repo_path>[^/]+/[^/]+)/?$') _api_pat = 'https://api.github.com/repos/{repo_path}' httpclient = None def __call__(self): if self.httpclient is None: from tornado.httpclient import AsyncHTTPClient httpclient = AsyncHTTPClient() else: httpclient = self.httpclient m = self.match httpclient.fetch(self._api_pat.format(**m.groupdict()), self.parse_info, headers={ 'User-Agent': UserAgent, }) def parse_info(self, res): repoinfo = json.loads(res.body.decode('utf-8')) self.response = res self.done(repoinfo) class GithubUserFinder(GithubFinder): _url_pat = re.compile(r'https://github\.com/(?!blog(?:$|/))(?P<user>[^/]+)/?$') _api_pat = 'https://api.github.com/users/{user}' def main(urls): class BatchFetcher: n = 0 def __call__(self, title, fetcher): if isinstance(title, bytes): try: title = title.decode('gb18030') except UnicodeDecodeError: pass url = ' <- '.join(reversed(fetcher.url_visited)) logger.info('done: [%d] %s <- %s' % (fetcher.status_code, title, url)) self.n -= 1 if not self.n: tornado.ioloop.IOLoop.instance().stop() def add(self, url): TitleFetcher(url, self, url_finders=(GithubFinder,)) self.n += 1 from tornado.log import enable_pretty_logging enable_pretty_logging() f = BatchFetcher() for u in urls: f.add(u) tornado.ioloop.IOLoop.instance().start() def test(): urls = ( 'http://lilydjwg.is-programmer.com/', 'http://www.baidu.com', 'https://zh.wikipedia.org', # redirection 'http://redis.io/', 'http://kernel.org', 'http://lilydjwg.is-programmer.com/2012/10/27/streaming-gzip-decompression-in-python.36130.html', # maybe timeout 'http://img.vim-cn.com/22/cd42b4c776c588b6e69051a22e42dabf28f436', # image with length 'https://github.com/m13253/titlebot/blob/master/titlebot.py_', # 404 'http://lilydjwg.is-programmer.com/admin', # redirection 'http://twitter.com', # timeout 'http://www.wordpress.com', # reset 'https://www.wordpress.com', # timeout 'http://jquery-api-zh-cn.googlecode.com/svn/trunk/xml/jqueryapi.xml', # xml 'http://lilydjwg.is-programmer.com/user_files/lilydjwg/config/avatar.png', # PNG 'http://img01.taobaocdn.com/bao/uploaded/i1/110928240/T2okG7XaRbXXXXXXXX_!!110928240.jpg', # JPEG with Start Of Frame as the second block 'http://file3.u148.net/2013/1/images/1357536246993.jpg', # JPEG that failed previous code 'http://gouwu.hao123.com/', # HTML5 GBK encoding 'https://github.com/lilydjwg/winterpy', # github url finder 'http://github.com/lilydjwg/winterpy', # github url finder with redirect 'http://导航.中国/', # Punycode. This should not be redirected 'http://t.cn/zTOgr1n', # multiple redirections 'http://www.galago-project.org/specs/notification/0.9/x408.html', # </TITLE\n> 'http://x.co/dreamz', # redirection caused false ConnectionClosed error 'http://m8y.org/tmp/zipbomb/zipbomb_light_nonzero.html', # very long title ) main(urls) if __name__ == "__main__": import sys try: if len(sys.argv) == 1: sys.exit('no urls given.') elif sys.argv[1] == 'test': test() else: main(sys.argv[1:]) except KeyboardInterrupt: print('Interrupted.')