Python html5lib.HTMLParser() Examples

The following are 30 code examples of html5lib.HTMLParser(), collected from open-source projects. Each example notes the source file and project it was taken from, so you can look up the full context in the original repository. You may also want to check out the other available functions and classes of the html5lib module.
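Before diving into the examples, here is a minimal sketch of the API they all build on: html5lib.HTMLParser() constructs a parser, parse() builds a full document tree, and parseFragment() parses a snippet without wrapping it in html/body elements (this sketch assumes the default etree tree builder):

import html5lib

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree"))
document = parser.parse("<p>Hello <b>world</b>")         # full document tree
fragment = parser.parseFragment("<p>Hello <b>world</b>") # fragment only, no <html>/<body> wrapper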
Example #1
Source File: test_alphabeticalattributes.py    From bazarr with GNU General Public License v3.0
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    parser = html5lib.HTMLParser()
    dom = parser.parseFragment('<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    walker = html5lib.getTreeWalker('etree')
    ser = HTMLSerializer(
        alphabetical_attributes=True,
        quote_attr_values='always'
    )

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    assert (
        ser.render(walker(dom)) ==
        '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    ) 
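For comparison, a shorter route to the same walk-and-serialize pipeline is html5lib's serialize() convenience wrapper, which bundles getTreeWalker and HTMLSerializer; a sketch using the same serializer options as the test above:

import html5lib

dom = html5lib.parseFragment('<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
print(html5lib.serialize(dom, tree='etree',
                         alphabetical_attributes=True,
                         quote_attr_values='always'))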
Example #2
Source File: _html5lib.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(self, namespaceHTMLElements, soup=None,
             store_line_numbers=True, **kwargs):
    if soup:
        self.soup = soup
    else:
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup(
            "", "html.parser", store_line_numbers=store_line_numbers,
            **kwargs
        )
    super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    # This will be set later to an html5lib.html5parser.HTMLParser
    # object, which we can use to track the current line number.
    self.parser = None
    self.store_line_numbers = store_line_numbers
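This __init__ belongs to Beautiful Soup's html5lib tree builder; in normal use it is constructed indirectly when you ask Beautiful Soup for the "html5lib" parser. A minimal sketch of that entry point (assumes both bs4 and html5lib are installed):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>Unclosed paragraph", "html5lib")
print(soup.p.get_text())  # html5lib repairs the markup into a complete document first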
Example #3
Source File: _html5lib.py    From Tautulli with GNU General Public License v3.0
def __init__(self, namespaceHTMLElements, soup=None,
             store_line_numbers=True, **kwargs):
    if soup:
        self.soup = soup
    else:
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup(
            "", "html.parser", store_line_numbers=store_line_numbers,
            **kwargs
        )
    super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    # This will be set later to an html5lib.html5parser.HTMLParser
    # object, which we can use to track the current line number.
    self.parser = None
    self.store_line_numbers = store_line_numbers
Example #4
Source File: __init__.py    From Tautulli with GNU General Public License v3.0
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment and return it

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=s)

    return _render(parser.parseFragment(text)) 
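A hedged usage sketch of the clean() helper above, relying on bleach's module-level defaults (the exact escaping of disallowed markup depends on the bleach version in use):

import bleach

safe = bleach.clean('an <script>evil()</script> <b>example</b>', tags=['b'])
# disallowed tags are escaped by default; pass strip=True to drop them instead
print(safe)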
Example #5
Source File: _html5lib.py    From bazarr with GNU General Public License v3.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, unicode):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, basestring):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
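The encoding branch above is easier to see in isolation. A sketch, assuming a recent html5lib in which parse() accepts override_encoding for byte input and charEncoding holds an Encoding object rather than a plain string:

import html5lib

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree"))
doc = parser.parse(b"<p>caf\xe9</p>", override_encoding="latin-1")
detected = parser.tokenizer.stream.charEncoding[0]
print(getattr(detected, "name", detected))  # label of the encoding the tokenizer actually used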
Example #6
Source File: diagnose.py    From bazarr with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #7
Source File: _html5lib.py    From bazarr with GNU General Public License v3.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
Example #8
Source File: diagnose.py    From bazarr with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #9
Source File: _html5lib.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    self.underlying_builder.parser = parser
    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
    self.underlying_builder.parser = None
Example #10
Source File: diagnose.py    From Tautulli with GNU General Public License v3.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 
Example #11
Source File: diagnose.py    From MIA-Dictionary-Addon with GNU General Public License v3.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 
Example #12
Source File: _html5lib.py    From moviegrabber with GNU General Public License v3.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #13
Source File: diagnose.py    From moviegrabber with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #14
Source File: diagnose.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 
Example #15
Source File: __init__.py    From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to use to transform content to be used in
    non-web-page contexts.

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=s)

    return _render(parser.parseFragment(text)) 
Example #16
Source File: diagnose.py    From Tautulli with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #17
Source File: diagnose.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #18
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0
def test_maintain_duplicate_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {'name': 'html',
             'selfClosing': False,
             'selfClosingAcknowledged': False,
             'type': tokenTypes["StartTag"],
             'data': attrs + [('a', len(attrs))]}
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs] 
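The user-visible consequence of the normalizeToken() behaviour tested above is that when an attribute is repeated on a tag, the first value wins. A small sketch via the public parsing API (assumes the default etree tree):

import html5lib

frag = html5lib.parseFragment('<p id="first" id="second">x</p>',
                              namespaceHTMLElements=False)
p = frag[0]          # the <p> element
print(p.get("id"))   # expected: 'first'; the duplicate is discarded as a parse error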
Example #19
Source File: diagnose.py    From MIA-Dictionary-Addon with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #20
Source File: _html5lib.py    From MIA-Dictionary-Addon with GNU General Public License v3.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
Example #21
Source File: diagnose.py    From FastWordQuery with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #22
Source File: _html5lib.py    From Crunchyroll-XML-Decoder with GNU General Public License v2.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #23
Source File: diagnose.py    From POC-EXP with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #24
Source File: _html5lib.py    From POC-EXP with GNU General Public License v3.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #25
Source File: _html5lib.py    From python-for-android with Apache License 2.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #26
Source File: diagnose.py    From plugin.git.browser with GNU General Public License v3.0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 
Example #27
Source File: diagnose.py    From plugin.git.browser with GNU General Public License v3.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #28
Source File: _html5lib.py    From plugin.git.browser with GNU General Public License v3.0
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
Example #29
Source File: diagnose.py    From ru with GNU General Public License v2.0
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data) 
Example #30
Source File: _html5lib.py    From ServerlessCrawler-VancouverRealState with MIT License
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, unicode):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, basestring):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding