# Python source code of the html provider

# -*- coding: utf-8 -*-

"""Provider for HTML documents

This provider is a little bit special, in that it isn't simply pulling an 
academic paper from a site, but instead aims to pull a HTML article.

Author: G.J.J. van den Burg
License: See LICENSE file.
Copyright: 2020, G.J.J. van den Burg

"""

import html2text
import markdown
import readability
import titlecase
import unidecode
import urllib
import weasyprint
import weasyprint.fonts

from ._base import Provider
from ._info import Informer

from ..utils import (
    clean_string,
    get_page_with_retry,
    get_content_type_with_retry,
)
from ..log import Logger

# Module-level logger instance used by this provider.
logger = Logger()

# Stylesheet handed to weasyprint.CSS when the article is rendered to PDF
# (see HTML.retrieve_pdf). It imports the EB Garamond, Noto Serif and
# Inconsolata font families from Google Fonts, sets a 702px x 936px page with
# 1in margins, and caps images at 70% of the width and 300px of height.
CSS = """
@import url('https://fonts.googleapis.com/css?family=EB+Garamond|Noto+Serif|Inconsolata&display=swap');
@page { size: 702px 936px; margin: 1in; }
a { color: black; }
img { display: block; margin: 0 auto; text-align: center; max-width: 70%; max-height: 300px; }
p, li { font-size: 10pt; font-family: 'EB Garamond'; hyphens: auto; text-align: justify; }
h1,h2,h3 { font-family: 'Noto Serif'; }
h1 { font-size: 26px; }
h2 { font-size: 18px; }
h3 { font-size: 14px; }
blockquote { font-style: italic; }
pre { font-family: 'Inconsolata'; padding-left: 2.5%; background: #efefef; }
code { font-family: 'Inconsolata'; font-size: .7rem; background: #efefef; }
"""


def url_fetcher(url):
    """Fetch ``url`` for WeasyPrint after forcing an ``https`` scheme.

    Two kinds of URL are rewritten before being handed to
    ``weasyprint.default_url_fetcher``:

    * protocol-relative URLs (``//host/path``) get ``https:`` prepended;
    * ``file:///...`` URLs have the ``file:///`` prefix swapped for
      ``https://``.

    Any other URL is passed through unchanged. The return value is whatever
    ``weasyprint.default_url_fetcher`` returns.
    """
    file_prefix = "file:///"
    if url.startswith("//"):
        target = "https:{}".format(url)
    elif url.startswith(file_prefix):
        # Drop the whole "file:///" prefix and put "https://" in its place.
        target = "https://" + url[len(file_prefix) :]
    else:
        target = url
    return weasyprint.default_url_fetcher(target)


class ImgProcessor(markdown.treeprocessors.Treeprocessor):
    """Markdown tree processor that resolves image sources against a base URL."""

    def __init__(self, base_url, *args, **kwargs):
        """Store ``base_url``; remaining arguments go to the parent class."""
        super().__init__(*args, **kwargs)
        self._base_url = base_url

    def run(self, root):
        """Make every ``img`` ``src`` under ``root`` absolute.

        Each source is joined with the base URL and any trailing ``/``
        characters are then stripped from the result. The tree is modified in
        place and nothing is returned.
        """
        for image in root.iter("img"):
            absolute_src = urllib.parse.urljoin(
                self._base_url, image.attrib["src"]
            )
            image.attrib["src"] = absolute_src.rstrip("/")


class HTMLInformer(Informer):
    """Informer that derives the PDF filename from the page title."""

    def __init__(self):
        super().__init__()

    def get_filename(self, abs_url):
        """Build a ``.pdf`` filename from the title of the page at ``abs_url``.

        The page is downloaded, its title is extracted with readability, and
        the title is then cleaned, title-cased, stripped of spaces (replaced
        by underscores) and transliterated with unidecode. The resulting name
        is logged and returned.
        """
        page_source = get_page_with_retry(abs_url, return_text=True)
        raw_title = readability.Document(page_source).title()

        # Clean, title-case, swap spaces for underscores, then clean again.
        cased = titlecase.titlecase(clean_string(raw_title))
        underscored = clean_string(cased.replace(" ", "_"))

        name = unidecode.unidecode(underscored.strip("_") + ".pdf")
        logger.info("Created filename: %s" % name)
        return name


class HTML(Provider):
    """Provider that turns an HTML article into a PDF file."""

    def __init__(self, *args, **kwargs):
        """Initialise the base provider and attach an ``HTMLInformer``."""
        super().__init__(*args, **kwargs)
        self.informer = HTMLInformer()

    def get_abs_pdf_urls(self, url):
        """Return ``(url, url)``; the given URL is used as-is for both slots."""
        return url, url

    def retrieve_pdf(self, pdf_url, filename):
        """Turn the HTML article at ``pdf_url`` into a clean PDF file.

        Parameters
        ----------
        pdf_url : str
            URL of the HTML page to convert. It is also used as the base URL
            for resolving relative image sources.
        filename : str
            Target passed to WeasyPrint's ``write_pdf``.
        """
        # Steps
        # 1. Pull the HTML page using requests
        # 2. Extract the article part of the page using readability
        # 3. Convert the article HTML to markdown using html2text
        # 4. Convert the markdown back to HTML (this is done to sanitize HTML)
        # 5. Convert the HTML to PDF, pulling in images where needed
        # 6. Save the PDF to the specified filename.
        request_text = get_page_with_retry(pdf_url, return_text=True)
        doc = readability.Document(request_text)
        title = doc.title()
        raw_html = doc.summary(html_partial=True)

        h2t = html2text.HTML2Text()
        h2t.wrap_links = False
        text = h2t.handle(raw_html)

        # Add the title back to the document
        article = "# {title}\n\n{text}".format(title=title, text=text)

        # Convert to html, fixing relative image urls.
        md = markdown.Markdown()
        md.treeprocessors.register(ImgProcessor(pdf_url), "img", 10)
        html_article = md.convert(article)

        # NOTE(review): self.debug is presumably set by the Provider base
        # class -- confirm.
        if self.debug:
            # Write with an explicit encoding so that articles containing
            # non-ASCII characters do not fail on non-UTF-8 locales.
            with open("./paper.html", "w", encoding="utf-8") as fp:
                fp.write(html_article)

        font_config = weasyprint.fonts.FontConfiguration()
        html = weasyprint.HTML(string=html_article, url_fetcher=url_fetcher)
        css = weasyprint.CSS(string=CSS, font_config=font_config)

        html.write_pdf(filename, stylesheets=[css], font_config=font_config)

    @staticmethod
    def validate(src):
        """Return whether ``src`` is a URL that serves an HTML document.

        ``src`` must parse to a URL with a non-empty scheme, network location
        and path, and its content type (as reported by
        ``get_content_type_with_retry``) must start with ``text/html``.

        Declared as a static method so it can be called on both the class and
        on instances; it takes no ``self``.
        """
        # first check if it is a valid url
        parsed = urllib.parse.urlparse(src)
        if not all([parsed.scheme, parsed.netloc, parsed.path]):
            return False
        # next, get the header and check the content type
        ct = get_content_type_with_retry(src)
        if ct is None:
            return False
        return ct.startswith("text/html")