Python HTMLParser.HTMLParseError() Examples

The following are 30 code examples of HTMLParser.HTMLParseError(), collected from open-source projects. Each example notes the project and source file it comes from, so you can refer back to the original code for full context. You may also want to check out all available functions/classes of the module HTMLParser, or try the search function.
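HTMLParser.HTMLParseError is the exception the Python 2 standard-library HTMLParser module raises when it hits markup it cannot parse; the instance carries msg, lineno, and offset attributes describing the failure. The equivalent class in Python 3's html.parser was deprecated in 3.3 and removed in 3.5, which is why many of the examples below either catch it defensively or wrap it in a project-specific error. A minimal Python 2 sketch of catching it; the malformed declaration used as input is purely illustrative:

import HTMLParser

parser = HTMLParser.HTMLParser()
try:
    parser.feed('<!spacer type="block" height="25">')  # not a valid declaration
    parser.close()
except HTMLParser.HTMLParseError as e:
    print "parse failed: %s (line %s, offset %s)" % (e.msg, e.lineno, e.offset)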
Example #1
Source File: _htmlparser.py    From svg-animation-tools with MIT License
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
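The comment above refers to Beautiful Soup monkey-patching methods backported from 3.2.3 onto the standard-library parser class when it detects an affected interpreter. A rough sketch of that version-gated pattern only; the method name and placeholder body below are invented, not bs4's actual patch:

import sys

# Only interpreters in the affected range get the backported code.
if (3, 2, 0) <= sys.version_info[:3] < (3, 2, 3):
    from html.parser import HTMLParser as _StdHTMLParser  # stdlib name on Python 3

    def _backported_method(self, *args, **kwargs):
        # Placeholder for a method body copied verbatim from the 3.2.3 sources.
        raise NotImplementedError

    _StdHTMLParser._backported_method = _backported_method  # hypothetical attribute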
Example #2
Source File: BeautifulSoup.py    From python-for-android with Apache License 2.0
def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1:
                 k = len(self.rawdata)
             data = self.rawdata[i+9:k]
             j = k+3
             self._toStringSubclass(data, CData)
        else:
            try:
                j = HTMLParser.parse_declaration(self, i)
            except HTMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j 
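The CDATA branch above is what lets BeautifulSoup 3 surface <![CDATA[...]]> sections as CData nodes instead of dropping them. A small usage sketch, assuming the classic BeautifulSoup 3 package on Python 2:

from BeautifulSoup import BeautifulSoup, CData  # BeautifulSoup 3 (Python 2)

soup = BeautifulSoup('<p><![CDATA[x < y && y < z]]></p>')
node = soup.p.contents[0]
print isinstance(node, CData)   # expected: True on BS3 releases that define CData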
Example #3
Source File: _htmlparser.py    From CrisisMappingToolkit with Apache License 2.0
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
Example #4
Source File: _htmlparser.py    From stopstalk-deployment with MIT License
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
Example #5
Source File: BeautifulSoup.py    From dirigible-spreadsheet with MIT License
def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
             k = self.rawdata.find(']]>', i)
             if k == -1:
                 k = len(self.rawdata)
             data = self.rawdata[i+9:k]
             j = k+3
             self._toStringSubclass(data, CData)
        else:
            try:
                j = HTMLParser.parse_declaration(self, i)
            except HTMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j 
Example #6
Source File: _http.py    From BruteXSS with GNU General Public License v3.0
def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response 
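The handler above folds header fields declared in the HTML <head> (for example <meta http-equiv="..." content="...">) into the real HTTP response headers, and skips the document entirely if the head cannot be parsed. A minimal sketch of that kind of head scan; the class below is illustrative, not mechanize's actual HeadParser:

import HTMLParser

class MetaHttpEquivParser(HTMLParser.HTMLParser):
    """Collects (header, value) pairs from <meta http-equiv=...> tags."""
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.http_equiv = []

    def handle_starttag(self, tag, attrs):
        if tag == "meta":
            attrs = dict(attrs)
            if "http-equiv" in attrs and "content" in attrs:
                self.http_equiv.append((attrs["http-equiv"], attrs["content"]))

p = MetaHttpEquivParser()
p.feed('<html><head><meta http-equiv="Refresh" content="5; url=/next"></head></html>')
print p.http_equiv   # [('Refresh', '5; url=/next')]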
Example #7
Source File: _htmlparser.py    From svg-animation-tools with MIT License
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
Example #8
Source File: _http.py    From pelisalacarta-ce with GNU General Public License v3.0
def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response 
Example #9
Source File: _htmlparser.py    From weeman with GNU General Public License v3.0
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
Example #10
Source File: test_htmlparser.py    From CTFCrackTools with GNU General Public License v3.0
def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse) 
Example #11
Source File: _form.py    From pelisalacarta-ce with GNU General Public License v3.0
def feed(self, data):
        try:
            HTMLParser.HTMLParser.feed(self, data)
        except HTMLParser.HTMLParseError, exc:
            raise ParseError(exc) 
Example #12
Source File: _htmlparser.py    From FastWordQuery with GNU General Public License v3.0
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e 
Example #13
Source File: clientform.py    From POC-EXP with GNU General Public License v3.0
def feed(self, data):
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError, exc:
                raise ParseError(exc) 
Example #14
Source File: test_htmlparser.py    From CTFCrackTools-V2 with GNU General Public License v3.0
def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse) 
Example #15
Source File: _htmlparser.py    From bazarr with GNU General Public License v3.0
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e 
Example #16
Source File: clientform.py    From EasY_HaCk with Apache License 2.0
def feed(self, data):
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError, exc:
                raise ParseError(exc) 
Example #17
Source File: html.py    From sync-engine with GNU Affero General Public License v3.0
def strip_tags(html):
    s = HTMLTagStripper()
    try:
        s.feed(html)
    except HTMLParseError:
        get_logger().error('error stripping tags', raw_html=html)
    return s.get_data()

# https://djangosnippets.org/snippets/19/ 
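HTMLTagStripper is defined elsewhere in sync-engine's html module; below is a minimal stand-in with the same feed()/get_data() interface, assuming all it has to do is collect text nodes (the real class in sync-engine does more than this):

from HTMLParser import HTMLParser

class HTMLTagStripper(HTMLParser):
    """Minimal stand-in: keeps only the text content of the markup it is fed."""
    def __init__(self):
        HTMLParser.__init__(self)
        self._chunks = []

    def handle_data(self, data):
        self._chunks.append(data)

    def get_data(self):
        return ''.join(self._chunks)

s = HTMLTagStripper()
s.feed('<p>Hello <b>world</b></p>')
print s.get_data()   # Hello world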
Example #18
Source File: _form.py    From BruteXSS with GNU General Public License v3.0
def feed(self, data):
        try:
            HTMLParser.HTMLParser.feed(self, data)
        except HTMLParser.HTMLParseError, exc:
            raise ParseError(exc) 
Example #19
Source File: _htmlparser.py    From ServerlessCrawler-VancouverRealState with MIT License
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e 
Example #20
Source File: test_htmlparser.py    From medicare-demo with Apache License 2.0
def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse) 
Example #21
Source File: test_htmlparser.py    From gcblue with BSD 3-Clause "New" or "Revised" License
def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse) 
Example #22
Source File: clientform.py    From NoobSec-Toolkit with GNU General Public License v2.0
def feed(self, data):
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError, exc:
                raise ParseError(exc) 
Example #23
Source File: clientform.py    From NoobSec-Toolkit with GNU General Public License v2.0
def feed(self, data):
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError, exc:
                raise ParseError(exc) 
Example #24
Source File: clientform.py    From NoobSec-Toolkit with GNU General Public License v2.0
def feed(self, data):
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError, exc:
                raise ParseError(exc) 
Example #25
Source File: clientform.py    From NoobSec-Toolkit with GNU General Public License v2.0
def feed(self, data):
            try:
                HTMLParser.HTMLParser.feed(self, data)
            except HTMLParser.HTMLParseError, exc:
                raise ParseError(exc) 
Example #26
Source File: test_htmlparser.py    From oss-ftp with MIT License
def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse) 
Example #27
Source File: test_htmlparser.py    From BinderFilter with MIT License
def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse) 
Example #28
Source File: htmlutil.py    From closure-linter with Apache License 2.0
def StripTags(str):
  """Returns the string with HTML tags stripped.

  Args:
    str: An html string.

  Returns:
    The html string with all tags stripped. If there was a parse error, returns
    the text successfully parsed so far.
  """
  # Brute force approach to stripping as much HTML as possible. If there is a
  # parsing error, don't strip text before parse error position, and continue
  # trying from there.
  final_text = ''
  finished = False
  while not finished:
    try:
      strip = _HtmlStripper()
      strip.feed(str)
      strip.close()
      str = strip.get_output()
      final_text += str
      finished = True
    except HTMLParser.HTMLParseError, e:
      final_text += str[:e.offset]
      str = str[e.offset + 1:]
  return final_text
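_HtmlStripper is defined elsewhere in closure-linter's htmlutil.py; below is a minimal stand-in with the feed()/close()/get_output() interface the loop relies on, assuming it only needs to accumulate text nodes (the real class handles more cases):

import HTMLParser

class _HtmlStripper(HTMLParser.HTMLParser):
    """Minimal stand-in: accumulates text as it is parsed."""
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._out = []

    def handle_data(self, data):
        self._out.append(data)

    def get_output(self):
        return ''.join(self._out)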
Example #29
Source File: test_htmlparser.py    From ironpython2 with Apache License 2.0
def _parse_error(self, source):
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse) 
Example #30
Source File: _htmlparser.py    From ServerlessCrawler-VancouverRealState with MIT License
def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e