Python html5lib.parse() Examples
The following are 30 code examples of html5lib.parse().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module html5lib, or try the search function.
Example #1
Source File: answer_dist.py From edx-analytics-pipeline with GNU Affero General Public License v3.0 | 6 votes |
def get_text_from_html(markup):
    """
    Convert HTML markup to plain text.

    Strips excess whitespace and guarantees whitespace between adjacent
    elements (e.g. table cells), so their text does not run together.
    """
    try:
        tree = html5lib.parse(markup)
        words = []
        for fragment in get_text_from_element(tree):
            words.extend(fragment.split())
        return u' '.join(words)
    except Exception as exception:  # pylint: disable=broad-except
        # TODO: find out what exceptions might actually occur here, if any.
        # This may be unnecessarily paranoid, given html5lib's fallback behavior.
        log.error("Unparseable answer value markup: '%s' return exception %s", markup, exception)
        return markup.strip()
Example #2
Source File: test_performance.py From normandy with Mozilla Public License 2.0 | 6 votes |
def test_static_cache_headers(conf, requests_session):
    """Test that all scripts included from self-repair have long lived cache headers"""
    one_year = 31_536_000  # seconds; the minimum acceptable max-age
    server = conf.getoption("server")
    response = requests_session.get(server + "/en-US/repair")
    response.raise_for_status()
    dom = html5lib.parse(response.content, treebuilder="dom")
    for tag in dom.getElementsByTagName("script"):
        url = urljoin(server, tag.getAttribute("src"))
        asset = requests_session.get(url)
        asset.raise_for_status()
        cache_control = parse_cache_control(asset.headers["cache-control"])
        assert cache_control["public"], f"Cache-control: public for {url}"
        assert cache_control["max-age"] >= one_year, f"Cache-control: max-age > 1 year for {url}"
        assert cache_control["immutable"], f"Cache-control: immutable for {url}"
Example #3
Source File: legacy_repository.py From poetry with MIT License | 6 votes |
def __init__(self, url, content, headers): if not url.endswith("/"): url += "/" self._url = url encoding = None if headers and "Content-Type" in headers: content_type, params = cgi.parse_header(headers["Content-Type"]) if "charset" in params: encoding = params["charset"] self._content = content if encoding is None: self._parsed = html5lib.parse(content, namespaceHTMLElements=False) else: self._parsed = html5lib.parse( content, transport_encoding=encoding, namespaceHTMLElements=False )
Example #4
Source File: legacy_repository.py From poetry with MIT License | 6 votes |
def link_version(self, link): # type: (Link) -> Union[Version, None] m = wheel_file_re.match(link.filename) if m: version = m.group("ver") else: info, ext = link.splitext() match = self.VERSION_REGEX.match(info) if not match: return version = match.group(2) try: version = Version.parse(version) except ValueError: return return version
Example #5
Source File: py2index.py From resolvelib with ISC License | 6 votes |
def collect_best_dist_urls(self, name: str) -> Dict[str, str]: all_dists: DistListMapping = collections.defaultdict(list) for index_url in self.index_urls: res = requests.get(f"{index_url}/{name}") res.raise_for_status() doc = html5lib.parse(res.content, namespaceHTMLElements=False) for el in doc.findall(".//a"): url = el.attrib["href"] filename = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1] wheel_name, ext = filename.rsplit(".", 1) if ext != "whl": continue requires_python = el.attrib.get("data-requires-python") _, version, tag = _parse_wheel_name(wheel_name) rank = self.matcher.rank(tag, requires_python) if rank is None: continue all_dists[version].append((rank, url)) urls = {version: min(dists)[1] for version, dists in all_dists.items()} logger.info("%d URLs found for %s", len(urls), name) return urls
Example #6
Source File: py2index.py From resolvelib with ISC License | 6 votes |
def compatible_with( cls, python_version: PythonVersion, impl: Optional[str], plats: Optional[List[str]], ) -> WheelMatcher: required_python = packaging.version.parse( ".".join(str(v) for v in python_version) ) # TODO: Add ABI customization. tag_it = itertools.chain( packaging.tags.compatible_tags(python_version, impl, plats), packaging.tags.cpython_tags(python_version, None, plats), ) tags = {t: i for i, t in enumerate(tag_it)} return cls(required_python, tags)
Example #7
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_namespace_html_elements_1_dom():
    # With namespaceHTMLElements=False the DOM builder must leave elements
    # un-namespaced (namespaceURI is None) instead of using the XHTML namespace.
    doc = parse("<html></html>", treebuilder="dom", namespaceHTMLElements=False)
    assert doc.childNodes[0].namespaceURI is None
Example #8
Source File: _html.py From yalih with Apache License 2.0 | 5 votes |
def content_parser(data, url=None, response_info=None, transport_encoding=None, default_encoding=DEFAULT_ENCODING, is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document (contains all HTTP headers as :class:`HTTPMessage`)
    :param transport_encoding: The character encoding for the document being parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    # Non-HTML payloads are not parsed; the function returns None.
    if not is_html:
        return
    # Prefer the C-accelerated html5_parser when installed; otherwise fall
    # back to the pure-Python html5lib parser.
    try:
        from html5_parser import parse
    except Exception:
        from html5lib import parse
        # html5lib: keep elements un-namespaced, and pass the HTTP-declared
        # encoding only when the input is still undecoded bytes.
        kw = {'namespaceHTMLElements': False}
        if transport_encoding and isinstance(data, bytes):
            kw['transport_encoding'] = transport_encoding
        return parse(data, **kw)
    else:
        # html5_parser accepts the transport encoding directly.
        # NOTE(review): url, response_info and default_encoding are unused in
        # this body — presumably kept for interface compatibility; confirm.
        return parse(data, transport_encoding=transport_encoding)
Example #9
Source File: utils.py From open-ledger with MIT License | 5 votes |
def select_nodes(resp, selector):
    """Give a response from the app, return just the HTML fragment defined by `selector`"""
    markup = resp.content.decode('utf-8')
    tree = html5lib.parse(markup, treebuilder='lxml', namespaceHTMLElements=False)
    return tree.getroot().cssselect(selector)
Example #10
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_assertDoctypeCloneable():
    # A parsed DOM document containing a doctype node must support deep
    # cloning via cloneNode(True).
    doc = parse('<!DOCTYPE HTML>', treebuilder="dom")
    assert doc.cloneNode(True) is not None
Example #11
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_line_counter():
    # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
    # Regression test: newlines inside a <pre> element must not break the
    # tokenizer's line counting; parsing just needs to succeed.
    assert parse("<pre>\nx\n>\n</pre>") is not None
Example #12
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_namespace_html_elements_0_dom():
    # With namespaceHTMLElements=True, DOM elements are created in the XHTML
    # namespace.
    doc = parse("<html></html>", treebuilder="dom", namespaceHTMLElements=True)
    assert doc.childNodes[0].namespaceURI == namespaces["html"]
Example #13
Source File: py2index.py From resolvelib with ISC License | 5 votes |
def iter_package_entries(self, name: str) -> Iterator[PackageEntry]: for version, url in self.collect_best_dist_urls(name).items(): http_file = cast(IO[bytes], HttpFile(url, self.session)) with zipfile.ZipFile(http_file) as zf: with _open_metadata(zf, name) as f: parser = email.parser.BytesParser() data = parser.parse(cast(BinaryIO, f), headersonly=True) dependencies: List[str] = data.get_all("Requires-Dist", []) yield PackageEntry(version, dependencies)
Example #14
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_namespace_html_elements_0_etree():
    # With namespaceHTMLElements=True, etree tags carry the namespace in
    # Clark notation: "{namespace}localname".
    doc = parse("<html></html>", treebuilder="etree", namespaceHTMLElements=True)
    assert doc.tag == "{%s}html" % (namespaces["html"],)
Example #15
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_unicode_file():
    # The parser must accept a text (unicode) file-like object, not only bytes.
    assert parse(io.StringIO("a")) is not None
Example #16
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_duplicate_attribute():
    # This is here because we impl it in parser and not tokenizer
    # When an attribute is repeated on a start tag, the first occurrence wins.
    doc = parse('<p class=a class=b>')
    el = doc[1][0]  # presumably <body>'s first child, the <p> — verify tree shape
    assert el.get("class") == "a"
Example #17
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_debug_log():
    # With debug=True the parser records every step it takes as a tuple of
    # (tokenizer state, phase before, phase after, handler name, token info);
    # assert the full trace for a small, fixed document.
    parser = HTMLParser(debug=True)
    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")
    expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
                ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
                ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
                ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]
    if PY2:
        # On Python 2 the logged strings are byte strings; encode the expected
        # values to match.
        for i, log in enumerate(expected):
            log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
            expected[i] = tuple(log)
    assert parser.log == expected
Example #18
Source File: test_treeadapters.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_to_sax():
    # Walk a parsed etree tree and replay it into a tracing SAX handler; the
    # recorded event stream must match the expected structure (note the parser
    # inserts the implied <head>/<body> and recovers from the stray </p>).
    # NOTE(review): whitespace inside the markup literal and the
    # ('characters', ...) expectations may have been collapsed by the page
    # this example was scraped from — verify against the original test file.
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en"> <title>Directory Listing</title> <a href="/"><b/></p> """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
Example #19
Source File: ng_parser.py From gd.py with MIT License | 5 votes |
def html_parse(text: str) -> Element:
    """Parse HTML text into an element tree, preferring lxml when available."""
    def _strip_spaces(match):
        # Remove spaces inside each matched class attribute.
        return match.group(0).replace(" ", "")

    cleaned = re.sub(re_class, _strip_spaces, text)
    if use_lxml:
        return html.fromstring(cleaned)
    return parse(cleaned, "etree", False)
Example #20
Source File: test_treeadapters.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def test_to_sax():
    # Replay a parsed etree tree through the SAX adapter and check the exact
    # event sequence; the parser adds the implied <head>/<body> elements and
    # recovers from the mismatched </p> in the input.
    # NOTE(review): whitespace inside the markup literal and the
    # ('characters', ...) expectations may have been collapsed by the page
    # this example was scraped from — verify against the original test file.
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en"> <title>Directory Listing</title> <a href="/"><b/></p> """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
Example #21
Source File: _html.py From Vaile with GNU General Public License v3.0 | 5 votes |
def content_parser(data, url=None, response_info=None, transport_encoding=None, default_encoding=DEFAULT_ENCODING, is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document (contains all HTTP headers as :class:`HTTPMessage`)
    :param transport_encoding: The character encoding for the document being parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    # Non-HTML payloads are not parsed; the function returns None.
    if not is_html:
        return
    # Prefer the C-accelerated html5_parser when installed; otherwise fall
    # back to the pure-Python html5lib parser.
    try:
        from html5_parser import parse
    except Exception:
        from html5lib import parse
        # html5lib: keep elements un-namespaced, and pass the HTTP-declared
        # encoding only when the input is still undecoded bytes.
        kw = {'namespaceHTMLElements': False}
        if transport_encoding and isinstance(data, bytes):
            kw['transport_encoding'] = transport_encoding
        return parse(data, **kw)
    else:
        # html5_parser accepts the transport encoding directly.
        # NOTE(review): url, response_info and default_encoding are unused in
        # this body — presumably kept for interface compatibility; confirm.
        return parse(data, transport_encoding=transport_encoding)
Example #22
Source File: test_integration_sandbox.py From django-payfast with MIT License | 5 votes |
def parse_payfast_page(response):
    # type: (requests.Response) -> Dict[str, str]
    """
    Scrape some data from a PayFast payment page response.
    """
    # Only UTF-8 HTML pages are expected here.
    assert 'text/html; charset=UTF-8' == response.headers['Content-Type']
    html = response.text
    doc = html5lib.parse(html)  # type: ElementTree

    def _parse():
        # type: () -> Iterable[Tuple[str, str]]
        # Yields (key, value) pairs scraped from known page elements; pairs
        # for optional elements are emitted only when the element is present.
        # The session info:
        session_tracker = find_id(doc, 'session-tracker')
        for name in ['type', 'id']:
            value = session_tracker.attrib['data-{}'.format(name)]
            if value:
                yield ('session_{}'.format(name), value)
        # The payment summary on the left.
        left = find_id(doc, 'left-column')
        yield ('payment_summary', text_collapsed(left))
        right = find_id(doc, 'right-column')
        content_box = find_id(right, 'contentBox')
        # The error notice, if any:
        notice = find_id_maybe(content_box, 'notice')
        if notice is not None:
            yield ('notice', text_lines(notice))
        # The wallet payment completion option, if present:
        wa_tab = find_id_maybe(content_box, 'waTab')
        if wa_tab is not None:
            yield ('payment_method', (wa_tab.attrib['data-methodkey']))
            pay_button = find_id(wa_tab, 'pay-with-wallet')
            yield ('pay_button', pay_button.attrib['value'])

    return dict(_parse())
Example #23
Source File: pypi_wheel_provider.py From resolvelib with ISC License | 5 votes |
def get_metadata_for_wheel(url):
    """Download a wheel and parse the headers of its .dist-info/METADATA file."""
    payload = requests.get(url).content
    with ZipFile(BytesIO(payload)) as archive:
        for member in archive.namelist():
            if not member.endswith(".dist-info/METADATA"):
                continue
            return BytesParser().parse(archive.open(member), headersonly=True)
    # If we didn't find the metadata, return an empty dict
    return EmailMessage()
Example #24
Source File: pypi_wheel_provider.py From resolvelib with ISC License | 5 votes |
def get_project_from_pypi(project, extras):
    """Return candidates created from the project name and extras."""
    index_url = "https://pypi.org/simple/{}".format(project)
    data = requests.get(index_url).content
    doc = html5lib.parse(data, namespaceHTMLElements=False)
    for anchor in doc.findall(".//a"):
        link = anchor.attrib["href"]
        py_req = anchor.attrib.get("data-requires-python")
        # Skip items that need a different Python version
        if py_req and PYTHON_VERSION not in SpecifierSet(py_req):
            continue
        filename = urlparse(link).path.rpartition("/")[-1]
        # We only handle wheels
        if not filename.endswith(".whl"):
            continue
        # TODO: Handle compatibility tags?
        # Very primitive wheel filename parsing
        name, version = filename[:-4].split("-")[:2]
        try:
            version = Version(version)
        except InvalidVersion:
            # Ignore files with invalid versions
            continue
        yield Candidate(name, version, url=link, extras=extras)
Example #25
Source File: test_html.py From crocoite with MIT License | 5 votes |
def test_strip_attribute ():
    # StripAttributeFilter must drop the listed attributes ('b', 'c', 'd')
    # from every element while leaving other attributes ('keep') untouched.
    d = html5lib.parse ('<a b=1 c="yes" d></a><br b=2 c="no" d keep=1>')
    stream = StripAttributeFilter (getTreeWalker ('etree')(d), ['b', 'c', 'd'])
    serializer = HTMLSerializer ()
    assert serializer.render (stream) == '<a></a><br keep=1>'
Example #26
Source File: test_html.py From crocoite with MIT License | 5 votes |
def test_strip_tag ():
    # StripTagFilter must remove the listed elements ('b', 'c') together with
    # their entire subtrees, keeping the surrounding text intact.
    d = html5lib.parse ('<a>barbaz<b>foobar</b>.</a><b>foobar</b>.<b attr=1><c></c>')
    stream = StripTagFilter (getTreeWalker ('etree')(d), ['b', 'c'])
    serializer = HTMLSerializer ()
    assert serializer.render (stream) == '<a>barbaz.</a>.'
Example #27
Source File: parsers.py From riko with MIT License | 5 votes |
def xml2etree(f, xml=True, html5=False):
    """Parse a file-like object into an element tree, selecting the parser by flags."""
    if xml:
        return etree.parse(f)
    if html5 and html5parser:
        return html5parser.parse(f)
    if html5parser:
        return html.parse(f)
    # html5lib's parser returns an Element, so we must convert it into an
    # ElementTree
    return ElementTree(html.parse(f))
Example #28
Source File: parsers.py From riko with MIT License | 5 votes |
def parse_rss(url=None, **kwargs):
    """Fetch a feed URL and parse it; fall back to parsing the raw URL on fetch failure."""
    try:
        stream = fetch(decode(url), **kwargs)
    except (ValueError, URLError):
        # Couldn't fetch: let the parser handle the URL directly.
        return rssparser.parse(url)
    # speedparser wants the raw bytes; feedparser can take the file object.
    payload = stream.read() if speedparser else stream
    try:
        return rssparser.parse(payload)
    finally:
        stream.close()
Example #29
Source File: test_performance.py From normandy with Mozilla Public License 2.0 | 5 votes |
def test_cache_headers(self, conf, requests_session, path, only_readonly): if path.startswith("/api/"): pytest.xfail("caching temporarily hidden on api by nginx") r = requests_session.get(conf.getoption("server") + path) r.raise_for_status() cache_control = r.headers.get("cache-control") assert cache_control is not None # parse cache-control header. parts = [part.strip() for part in cache_control.split(",")] max_age = [part for part in parts if part.startswith("max-age=")][0] max_age_seconds = int(max_age.split("=")[1]) assert "public" in parts assert max_age_seconds > 0
Example #30
Source File: test_integration_sandbox.py From django-payfast with MIT License | 5 votes |
def do_checkout(
    checkout_data,  # type: Dict[str, str]
    sign_checkout,  # type: bool
):
    # type: (...) -> Dict[str, str]
    """
    Common test helper: do a checkout, and assert results.

    This takes unsigned checkout data, and will add a signature
    if `sign_checkout` is true.

    Return the checkout page's parse.
    """
    # Expected values for result assertions:
    try:
        amount_expected = '{:.2f}'.format(decimal.Decimal(checkout_data['amount']))
    except decimal.InvalidOperation:
        # We may be testing a value that isn't Decimal-parseable;
        # in that case, just expect it unmodified.
        amount_expected = checkout_data['amount']
    item_name_expected = checkout_data['item_name'].strip()  # PayFast strips this for display.
    summary_expected = '{} Payment total R {} ZAR'.format(
        item_name_expected, amount_expected
    ).strip()  # Strip to handle item names that render empty.

    if sign_checkout:
        assert 'signature' not in checkout_data, checkout_data
        checkout_data['signature'] = api.checkout_signature(checkout_data)

    page = parse_payfast_page(post_sandbox_checkout(checkout_data))
    assert {
        'session_type': 'p-sb',
        'session_id': page.get('session_id', 'MISSING'),
        'payment_summary': summary_expected,
        'payment_method': '1',
        'pay_button': 'Complete Payment',
    } == page
    return page