Python html5lib.parse() Examples

The following are 30 code examples of html5lib.parse(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module html5lib, or try the search function.
Example #1
Source File: answer_dist.py    From edx-analytics-pipeline with GNU Affero General Public License v3.0 6 votes vote down vote up
def get_text_from_html(markup):
    """
    Convert HTML markup to plain text.

    Strips excess whitespace and ensures whitespace exists between
    elements (e.g. table elements).
    """
    try:
        tree = html5lib.parse(markup)
        words = []
        for fragment in get_text_from_element(tree):
            words.extend(fragment.split())
        return u' '.join(words)
    except Exception as exception:  # pylint: disable=broad-except
        # TODO: find out what exceptions might actually occur here, if any.
        # This may be unnecessarily paranoid, given html5lib's fallback behavior.
        log.error("Unparseable answer value markup: '%s' return exception %s", markup, exception)
        return markup.strip()
Example #2
Source File: test_performance.py    From normandy with Mozilla Public License 2.0 6 votes vote down vote up
def test_static_cache_headers(conf, requests_session):
    """All scripts included from self-repair must carry long-lived cache headers."""
    ONE_YEAR = 31_536_000
    page_res = requests_session.get(conf.getoption("server") + "/en-US/repair")
    page_res.raise_for_status()
    dom = html5lib.parse(page_res.content, treebuilder="dom")
    for script in dom.getElementsByTagName("script"):
        url = urljoin(conf.getoption("server"), script.getAttribute("src"))
        script_res = requests_session.get(url)
        script_res.raise_for_status()
        cache_control = parse_cache_control(script_res.headers["cache-control"])
        assert cache_control["public"], f"Cache-control: public for {url}"
        assert cache_control["max-age"] >= ONE_YEAR, f"Cache-control: max-age > 1 year for {url}"
        assert cache_control["immutable"], f"Cache-control: immutable for {url}"
Example #3
Source File: legacy_repository.py    From poetry with MIT License 6 votes vote down vote up
def __init__(self, url, content, headers):
        """
        Hold a parsed simple-index page.

        :param url: base URL of the page; a trailing slash is appended if missing.
        :param content: raw page body passed to html5lib.
        :param headers: HTTP response headers, used only to detect the charset.
        """
        if not url.endswith("/"):
            url += "/"

        self._url = url
        encoding = None
        if headers and "Content-Type" in headers:
            # `cgi.parse_header` is deprecated and removed in Python 3.13
            # (PEP 594); use the email API to extract the charset parameter.
            import email.message

            message = email.message.Message()
            message["Content-Type"] = headers["Content-Type"]
            encoding = message.get_param("charset")

        self._content = content

        if encoding is None:
            # No declared charset: let html5lib sniff the encoding itself.
            self._parsed = html5lib.parse(content, namespaceHTMLElements=False)
        else:
            self._parsed = html5lib.parse(
                content, transport_encoding=encoding, namespaceHTMLElements=False
            )
Example #4
Source File: legacy_repository.py    From poetry with MIT License 6 votes vote down vote up
def link_version(self, link):  # type: (Link) -> Union[Version, None]
        """Extract the version from a file link, or None if it cannot be parsed."""
        wheel_match = wheel_file_re.match(link.filename)
        if wheel_match:
            raw_version = wheel_match.group("ver")
        else:
            stem, _ext = link.splitext()
            sdist_match = self.VERSION_REGEX.match(stem)
            if not sdist_match:
                return

            raw_version = sdist_match.group(2)

        try:
            return Version.parse(raw_version)
        except ValueError:
            return
Example #5
Source File: py2index.py    From resolvelib with ISC License 6 votes vote down vote up
def collect_best_dist_urls(self, name: str) -> Dict[str, str]:
        """For each version of `name`, pick the URL of the best-ranked wheel."""
        candidates: DistListMapping = collections.defaultdict(list)
        for index_url in self.index_urls:
            response = requests.get(f"{index_url}/{name}")
            response.raise_for_status()
            page = html5lib.parse(response.content, namespaceHTMLElements=False)
            for anchor in page.findall(".//a"):
                href = anchor.attrib["href"]
                filename = urllib.parse.urlsplit(href).path.rsplit("/", 1)[-1]
                stem, extension = filename.rsplit(".", 1)
                # Only wheels are considered.
                if extension != "whl":
                    continue
                requires_python = anchor.attrib.get("data-requires-python")
                _, version, tag = _parse_wheel_name(stem)
                rank = self.matcher.rank(tag, requires_python)
                # rank is None when the wheel is incompatible.
                if rank is None:
                    continue
                candidates[version].append((rank, href))
        # min() picks the lowest rank (best match) per version.
        best = {version: min(entries)[1] for version, entries in candidates.items()}
        logger.info("%d URLs found for %s", len(best), name)
        return best
Example #6
Source File: py2index.py    From resolvelib with ISC License 6 votes vote down vote up
def compatible_with(
        cls,
        python_version: PythonVersion,
        impl: Optional[str],
        plats: Optional[List[str]],
    ) -> WheelMatcher:
        """Build a matcher for wheels usable by the given interpreter."""
        required_python = packaging.version.parse(
            ".".join(str(part) for part in python_version)
        )
        # TODO: Add ABI customization.
        candidate_tags = itertools.chain(
            packaging.tags.compatible_tags(python_version, impl, plats),
            packaging.tags.cpython_tags(python_version, None, plats),
        )
        # Earlier tags rank better (lower number).
        tag_ranks = {tag: rank for rank, tag in enumerate(candidate_tags)}
        return cls(required_python, tag_ranks)
Example #7
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_namespace_html_elements_1_dom():
    # With namespacing disabled, DOM nodes carry no namespace URI.
    tree = parse("<html></html>", treebuilder="dom", namespaceHTMLElements=False)
    assert tree.childNodes[0].namespaceURI is None
Example #8
Source File: _html.py    From yalih with Apache License 2.0 5 votes vote down vote up
def content_parser(data,
                   url=None,
                   response_info=None,
                   transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING,
                   is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`HTTPMessage`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    if not is_html:
        return
    # Prefer the fast html5_parser package; fall back to pure-Python html5lib.
    try:
        from html5_parser import parse
    except Exception:
        from html5lib import parse
        kwargs = {'namespaceHTMLElements': False}
        if transport_encoding and isinstance(data, bytes):
            kwargs['transport_encoding'] = transport_encoding
        return parse(data, **kwargs)
    return parse(data, transport_encoding=transport_encoding)
Example #9
Source File: utils.py    From open-ledger with MIT License 5 votes vote down vote up
def select_nodes(resp, selector):
    """Given a response from the app, return just the HTML fragment defined by `selector`."""
    document = html5lib.parse(resp.content.decode('utf-8'),
                              treebuilder='lxml',
                              namespaceHTMLElements=False)
    return document.getroot().cssselect(selector)
Example #10
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_assertDoctypeCloneable():
    # A parsed doctype node must survive a deep clone.
    document = parse('<!DOCTYPE HTML>', treebuilder="dom")
    clone = document.cloneNode(True)
    assert clone is not None
Example #11
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_line_counter():
    # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
    result = parse("<pre>\nx\n&gt;\n</pre>")
    assert result is not None
Example #12
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_namespace_html_elements_0_dom():
    # With namespacing enabled, DOM nodes live in the XHTML namespace.
    tree = parse("<html></html>", treebuilder="dom", namespaceHTMLElements=True)
    assert tree.childNodes[0].namespaceURI == namespaces["html"]
Example #13
Source File: py2index.py    From resolvelib with ISC License 5 votes vote down vote up
def iter_package_entries(self, name: str) -> Iterator[PackageEntry]:
        """Yield a PackageEntry per released version, with its dependency list."""
        for version, url in self.collect_best_dist_urls(name).items():
            remote_file = cast(IO[bytes], HttpFile(url, self.session))
            with zipfile.ZipFile(remote_file) as wheel:
                with _open_metadata(wheel, name) as metadata_file:
                    # headersonly: the RFC 822 headers of METADATA are all we need.
                    metadata = email.parser.BytesParser().parse(
                        cast(BinaryIO, metadata_file), headersonly=True
                    )
            requirements: List[str] = metadata.get_all("Requires-Dist", [])
            yield PackageEntry(version, requirements)
Example #14
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_namespace_html_elements_0_etree():
    # etree tags are qualified with the XHTML namespace when enabled.
    tree = parse("<html></html>", treebuilder="etree", namespaceHTMLElements=True)
    assert tree.tag == "{%s}html" % (namespaces["html"],)
Example #15
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_unicode_file():
    # Parsing from a text stream (rather than bytes) must work.
    stream = io.StringIO("a")
    assert parse(stream) is not None
Example #16
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_duplicate_attribute():
    # This is here because we impl it in parser and not tokenizer
    document = parse('<p class=a class=b>')
    paragraph = document[1][0]
    # The first occurrence of a duplicated attribute wins.
    assert paragraph.get("class") == "a"
Example #17
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_debug_log():
    """With debug=True the parser records every dispatch step in `parser.log`.

    Each logged tuple appears to be (tokenizer state, phase receiving the
    token, phase that processed it, handler method, abbreviated token) —
    TODO confirm field order against html5lib's debug logging.
    """
    parser = HTMLParser(debug=True)
    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

    # NOTE(review): entries whose two phase fields differ (e.g. the second
    # 'script' line) look like a token being re-routed to another phase.
    expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
                ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]

    if PY2:
        # Python 2 logs byte strings, so encode the expectations to match.
        for i, log in enumerate(expected):
            log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
            expected[i] = tuple(log)

    assert parser.log == expected
Example #18
Source File: test_treeadapters.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_to_sax():
    """Walking a parsed tree through to_sax must replay the document as SAX
    events, including elements implied by the parser (html, head, body)."""
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en">
        <title>Directory Listing</title>
        <a href="/"><b/></p>
    """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    # Expected SAX event stream, in document order.
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
            'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n        '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n    '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
Example #19
Source File: ng_parser.py    From gd.py with MIT License 5 votes vote down vote up
def html_parse(text: str) -> Element:
    """Parse HTML text into an element tree, via lxml when available."""
    # Collapse spaces inside matched class attributes before parsing.
    text = re.sub(re_class, lambda match: match.group(0).replace(" ", ""), text)

    if use_lxml:
        return html.fromstring(text)
    return parse(text, "etree", False)
Example #20
Source File: test_treeadapters.py    From MARA_Framework with GNU Lesser General Public License v3.0 5 votes vote down vote up
def test_to_sax():
    """Walking a parsed tree through to_sax must replay the document as SAX
    events, including elements implied by the parser (html, head, body)."""
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en">
        <title>Directory Listing</title>
        <a href="/"><b/></p>
    """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    # Expected SAX event stream, in document order.
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
            'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n        '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS',  ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n    '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
Example #21
Source File: _html.py    From Vaile with GNU General Public License v3.0 5 votes vote down vote up
def content_parser(data,
                   url=None,
                   response_info=None,
                   transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING,
                   is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`HTTPMessage`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    if not is_html:
        return
    # Prefer the fast html5_parser package; fall back to pure-Python html5lib.
    try:
        from html5_parser import parse
    except Exception:
        from html5lib import parse
        kwargs = {'namespaceHTMLElements': False}
        if transport_encoding and isinstance(data, bytes):
            kwargs['transport_encoding'] = transport_encoding
        return parse(data, **kwargs)
    return parse(data, transport_encoding=transport_encoding)
Example #22
Source File: test_integration_sandbox.py    From django-payfast with MIT License 5 votes vote down vote up
def parse_payfast_page(response):  # type: (requests.Response) -> Dict[str, str]
    """
    Scrape some data from a PayFast payment page response.
    """
    assert 'text/html; charset=UTF-8' == response.headers['Content-Type']
    page = html5lib.parse(response.text)  # type: ElementTree

    def _fields():  # type: () -> Iterable[Tuple[str, str]]
        # The session info:
        tracker = find_id(page, 'session-tracker')
        for attr in ['type', 'id']:
            attr_value = tracker.attrib['data-{}'.format(attr)]
            if attr_value:
                yield ('session_{}'.format(attr), attr_value)

        # The payment summary on the left.
        left_column = find_id(page, 'left-column')
        yield ('payment_summary', text_collapsed(left_column))

        right_column = find_id(page, 'right-column')
        content_box = find_id(right_column, 'contentBox')

        # The error notice, if any:
        notice = find_id_maybe(content_box, 'notice')
        if notice is not None:
            yield ('notice', text_lines(notice))

        # The wallet payment completion option, if present:
        wallet_tab = find_id_maybe(content_box, 'waTab')
        if wallet_tab is not None:
            yield ('payment_method', wallet_tab.attrib['data-methodkey'])
            pay_button = find_id(wallet_tab, 'pay-with-wallet')
            yield ('pay_button', pay_button.attrib['value'])

    return dict(_fields())
Example #23
Source File: pypi_wheel_provider.py    From resolvelib with ISC License 5 votes vote down vote up
def get_metadata_for_wheel(url):
    """Download a wheel and return its parsed METADATA headers."""
    payload = requests.get(url).content
    with ZipFile(BytesIO(payload)) as archive:
        for member in archive.namelist():
            if member.endswith(".dist-info/METADATA"):
                # headersonly: only the RFC 822 header block is needed.
                return BytesParser().parse(archive.open(member), headersonly=True)

    # If we didn't find the metadata, return an empty dict
    return EmailMessage()
Example #24
Source File: pypi_wheel_provider.py    From resolvelib with ISC License 5 votes vote down vote up
def get_project_from_pypi(project, extras):
    """Return candidates created from the project name and extras."""
    index_url = "https://pypi.org/simple/{}".format(project)
    payload = requests.get(index_url).content
    page = html5lib.parse(payload, namespaceHTMLElements=False)
    for anchor in page.findall(".//a"):
        link = anchor.attrib["href"]
        py_req = anchor.attrib.get("data-requires-python")
        # Skip items that need a different Python version
        if py_req and PYTHON_VERSION not in SpecifierSet(py_req):
            continue

        filename = urlparse(link).path.rpartition("/")[-1]
        # We only handle wheels
        if not filename.endswith(".whl"):
            continue

        # TODO: Handle compatibility tags?

        # Very primitive wheel filename parsing
        name, version = filename[:-4].split("-")[:2]
        try:
            version = Version(version)
        except InvalidVersion:
            # Ignore files with invalid versions
            continue

        yield Candidate(name, version, url=link, extras=extras)
Example #25
Source File: test_html.py    From crocoite with MIT License 5 votes vote down vote up
def test_strip_attribute ():
    # Attributes b, c, d are stripped everywhere; unlisted ones survive.
    tree = html5lib.parse ('<a b=1 c="yes" d></a><br b=2 c="no" d keep=1>')
    filtered = StripAttributeFilter (getTreeWalker ('etree')(tree), ['b', 'c', 'd'])
    assert HTMLSerializer ().render (filtered) == '<a></a><br keep=1>'
Example #26
Source File: test_html.py    From crocoite with MIT License 5 votes vote down vote up
def test_strip_tag ():
    # Stripped tags disappear along with their entire subtree.
    tree = html5lib.parse ('<a>barbaz<b>foobar</b>.</a><b>foobar</b>.<b attr=1><c></c>')
    filtered = StripTagFilter (getTreeWalker ('etree')(tree), ['b', 'c'])
    assert HTMLSerializer ().render (filtered) == '<a>barbaz.</a>.'
Example #27
Source File: parsers.py    From riko with MIT License 5 votes vote down vote up
def xml2etree(f, xml=True, html5=False):
    """Parse a file-like object into an ElementTree-style tree.

    :param f: the file-like object (or source) to parse
    :param xml: parse as XML when true
    :param html5: prefer the html5 parser when it is available

    NOTE(review): the bare ``elif html5parser`` branch seems to use
    html5parser's availability as a proxy for lxml being importable —
    confirm against this module's import fallbacks.
    """
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree
Example #28
Source File: parsers.py    From riko with MIT License 5 votes vote down vote up
def parse_rss(url=None, **kwargs):
    """Fetch and parse an RSS feed.

    :param url: the feed URL (decoded before fetching)
    :param kwargs: extra options forwarded to ``fetch``
    :returns: the structure produced by ``rssparser.parse``
    """
    try:
        f = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        # Fetch failed: let the parser try the URL itself as a fallback.
        parsed = rssparser.parse(url)
    else:
        # Presumably speedparser needs the full raw content while the
        # fallback parser accepts a stream — TODO confirm.
        content = f.read() if speedparser else f

        try:
            parsed = rssparser.parse(content)
        finally:
            # Always release the fetched resource, even if parsing fails.
            f.close()

    return parsed
Example #29
Source File: test_performance.py    From normandy with Mozilla Public License 2.0 5 votes vote down vote up
def test_cache_headers(self, conf, requests_session, path, only_readonly):
        """Responses must be publicly cacheable with a positive max-age."""
        if path.startswith("/api/"):
            pytest.xfail("caching temporarily hidden on api by nginx")
        response = requests_session.get(conf.getoption("server") + path)
        response.raise_for_status()
        cache_control = response.headers.get("cache-control")
        assert cache_control is not None

        # parse cache-control header.
        directives = [directive.strip() for directive in cache_control.split(",")]
        max_age_directive = [d for d in directives if d.startswith("max-age=")][0]
        max_age_seconds = int(max_age_directive.split("=")[1])
        assert "public" in directives
        assert max_age_seconds > 0
Example #30
Source File: test_integration_sandbox.py    From django-payfast with MIT License 5 votes vote down vote up
def do_checkout(
        checkout_data,  # type: Dict[str, str]
        sign_checkout,  # type: bool
):  # type: (...) -> Dict[str, str]
    """
    Common test helper: do a checkout, and assert results.

    This takes unsigned checkout data, and will add a signature if `sign_checkout` is true.

    Return the checkout page's parse.
    """
    # Expected values for result assertions:
    try:
        expected_amount = '{:.2f}'.format(decimal.Decimal(checkout_data['amount']))
    except decimal.InvalidOperation:
        # We may be testing a value that isn't Decimal-parseable;
        # in that case, just expect it unmodified.
        expected_amount = checkout_data['amount']

    # PayFast strips the item name for display.
    expected_item_name = checkout_data['item_name'].strip()
    # Outer strip handles item names that render empty.
    expected_payment_summary = '{} Payment total R {} ZAR'.format(
        expected_item_name, expected_amount
    ).strip()

    if sign_checkout:
        assert 'signature' not in checkout_data, checkout_data
        checkout_data['signature'] = api.checkout_signature(checkout_data)

    response = post_sandbox_checkout(checkout_data)
    parsed = parse_payfast_page(response)
    expected = {
        'session_type': 'p-sb',
        'session_id': parsed.get('session_id', 'MISSING'),
        'payment_summary': expected_payment_summary,
        'payment_method': '1',
        'pay_button': 'Complete Payment',
    }
    assert expected == parsed

    return parsed