#!/usr/bin/env python """ CleanScraper.py This module defines a series of functions to fetch html content from from any publically-available URL, optionally clean it of any ads, navigation bars, and other irrelevant boilerplate, and convert it into a pdf document framed with wide margins, in an easy-to-read format. Edit the associated settings.py file to match your environment, or use a local_settings.py file which is not checked into this repo. """ from readability.readability import Document import pycurl from cStringIO import StringIO from string import Template from datetime import datetime import subprocess import codecs import sys import os from settings import ENCODING, UA, OUTPUT_FOLDER, \ PANDOC_PATH, EPUB_XML, ISBN_XML, EPUB_CSS, EPUB_COVER, EPUB_LANG, \ WKHTMLTOX_PATH, PDF_PAGE_SIZE, HTML_FRAME EPUB_CONVERT_CMD = Template("""$pandoc_path/pandoc -f html -t epub --epub-metadata="$metadata_xml" -o $folder/$article_id.epub """) PDF_CONVERT_CMD = Template("$wkhtmltox_path/wkhtmltopdf --page-size $page_size $folder/$article_id.html $folder/$article_id.pdf") def get_url (url, user_agent=UA, referrer=None): """Make a GET request of the url using pycurl and return the data (which is None if unsuccessful)""" data = None databuffer = StringIO() curl = pycurl.Curl() curl.setopt(pycurl.URL, url) curl.setopt(pycurl.FOLLOWLOCATION, 1) curl.setopt(pycurl.CONNECTTIMEOUT, 5) curl.setopt(pycurl.TIMEOUT, 8) curl.setopt(pycurl.WRITEFUNCTION, databuffer.write) curl.setopt(pycurl.COOKIEFILE, '') if user_agent: curl.setopt(pycurl.USERAGENT, user_agent) if referrer is not None: curl.setopt(pycurl.REFERER, referrer) try: curl.perform() data = databuffer.getvalue() except Exception: pass curl.close() return data def to_unicode (s, enc=ENCODING): """Convert the given string to unicode, using the requested encoding, (unless it's already in unicode), then return it""" if isinstance(s, basestring): if not isinstance(s, unicode): s = unicode(s, enc) return s def datestring (display_format="%a, %d %b %Y %H:%M:%S", datetime_object=None): """Convert the datetime.date object (defaults to now, in utc) into a string, in the given display format""" if datetime_object is None: datetime_object = datetime.utcnow() return datetime.strftime(datetime_object, display_format) def generate_epub_xml (epub_title, lang, isbn): """Return the contents of an xml file to be used in the '--epub-metadata' parameter in the epub conversion command""" id_string = '' if isbn is not None and len(isbn) > 0: id_string = ISBN_XML.substitute(isbn_number=isbn) return EPUB_XML.substitute(title=epub_title, lang=lang, date=datestring(display_format="%Y-%m-%dT%H:%M:%S"), # iso 8601 format identifier=id_string) def write_file (folder, filename, contents): """Attempt to write the contents to the filename in the tmp_folder as a utf-8 file, and return a boolean on success/fail""" result = False try: f = codecs.open(os.path.join(folder, filename), 'w', ENCODING) f.write(contents) f.close() result = True except (OSError, IOError): print "Sorry, could not save contents in", os.path.join(folder, filename) return result def generate_epub (file_folder, filename, epub_title, html_file, css_file, cover_image, lang, isbn): """Use pandoc to convert the html file and associated metadata into an epub file""" metadata = generate_epub_xml(epub_title, lang, isbn) if write_file(file_folder, 'metadata.xml', metadata): pandoc_cmd = EPUB_CONVERT_CMD.substitute(pandoc_path=PANDOC_PATH, folder=file_folder, metadata_xml=os.path.join(file_folder, 'metadata.xml'), article_id=filename) if cover_image is not None: pandoc_cmd += '--epub-cover-image="' + cover_image +'" ' pandoc_cmd += '-s --smart --parse-raw ' pandoc_cmd += os.path.join(file_folder, html_file) proc = subprocess.Popen(pandoc_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout_value, stderr_value = proc.communicate() print u'\n'.join(filter(None, [pandoc_cmd, stdout_value, stderr_value])), '\n' def generate_pdf (tmp_folder, filename, pdf_page_size): """Use wkhtmltopdf to convert the html file at tmp_folder/filename into a pdf file, and return the stdout and stderr results""" shell_cmd = PDF_CONVERT_CMD.substitute(wkhtmltox_path=WKHTMLTOX_PATH, folder=tmp_folder, article_id=filename, page_size=pdf_page_size) proc = subprocess.Popen(shell_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout_value, stderr_value = proc.communicate() print u'\n'.join(filter(None, [shell_cmd, stdout_value, stderr_value])) def scrape (url, pdf_filename, pdf_page_size=PDF_PAGE_SIZE, folder=OUTPUT_FOLDER, clean_it=True, css_file=EPUB_CSS, lang=EPUB_LANG, cover_image=EPUB_COVER, isbn=None): """Fetch the html content at url and convert it to a pdf file, cleaned by readability and framed in an easy-to-read format if clean_it is True""" raw_html = get_url(url) if raw_html is None: print "Sorry, could not read ", url else: filename_prefix, file_ext = os.path.splitext(pdf_filename) if clean_it: # use readability to get rid of crap title = Document(raw_html).short_title() content = Document(raw_html).summary(html_partial=True) # write the cleaned contents to an html frame for pdf conversion frame = HTML_FRAME.substitute(content=to_unicode(content), url=url, title=title) # unlike pdf, epub is controlled by css, so save the cleaned html alone epub_source = write_file(folder, os.extsep.join([filename_prefix+'_epub', 'html']), to_unicode(content)) pdf_source = write_file(folder, os.extsep.join([filename_prefix, 'html']), frame) else: title = filename_prefix # no readability cleaning requested, so use the fetched html as-is epub_source = write_file(folder, os.extsep.join([filename_prefix+'_epub', 'html']), to_unicode(raw_html)) pdf_source = write_file(folder, os.extsep.join([filename_prefix, 'html']), to_unicode(raw_html)) if epub_source: generate_epub (folder, filename_prefix, title, os.path.join(folder, os.extsep.join([filename_prefix+'_epub', 'html'])), css_file, cover_image, lang, isbn) if pdf_source: generate_pdf (folder, filename_prefix, pdf_page_size) if __name__ == "__main__": """Create a command-line main() entry point""" if len(sys.argv) < 3: # Define the usage print sys.argv[0], '[URL]', '[PDF & EPUB filename]', '--noclean (optional: leave the source html found at the URL as-is)' else: # Do the deed clean_html = True try: if sys.argv[3] == '--noclean': clean_html = False except IndexError: pass scrape(sys.argv[1], sys.argv[2], clean_it=clean_html)