Python lxml.etree.ParseError() Examples

The following are 16 code examples of lxml.etree.ParseError(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
Example #1
Source File: romeo.py    From dissemin with GNU Affero General Public License v3.0 6 votes vote down vote up
def perform_romeo_query(self, search_terms):
        """
        Send a query to ``self.base_url`` and parse the XML that comes back.

        ``search_terms`` is copied before use, so the caller's mapping is not
        modified; when ``self.api_key`` is truthy it is added to the copy
        under the ``'ak'`` key.

        Returns the result of ``ET.parse`` on the response body, parsed with
        an ISO-8859-1 parser.

        Raises MetadataSourceException when the HTTP request fails or when the
        response body is not valid XML.
        """
        params = search_terms.copy()
        if self.api_key:
            params['ak'] = self.api_key

        # Step 1: fetch the response (20 second timeout)
        try:
            response = requests.get(self.base_url, params=params, timeout=20)
        except requests.exceptions.RequestException as request_error:
            message = ('Error while querying RoMEO.\n' +
                       'URL was: '+self.base_url+'\n' +
                       'Parameters were: '+str(params)+'\n' +
                       'Error is: '+str(request_error))
            raise MetadataSourceException(message)

        # Step 2: parse the raw bytes of the response
        try:
            latin1_parser = ET.XMLParser(encoding='ISO-8859-1')
            parsed = ET.parse(BytesIO(response.content), latin1_parser)
        except ET.ParseError as parse_error:
            message = ('RoMEO returned an invalid XML response.\n' +
                       'URL was: '+self.base_url+'\n' +
                       'Parameters were: '+str(params)+'\n' +
                       'Error is: '+str(parse_error))
            raise MetadataSourceException(message)

        return parsed
Example #2
Source File: recipe-578492.py    From code with MIT License 6 votes vote down vote up
def __init__(self, *a, **k):
        """
        Initialise the object from a file path, an Element or an ElementTree.

        Keyword arguments:
            ffpath: path of the XML file (defaults to None).
            root:   root element to use (defaults to None).
            tree:   when present, its ``getroot()`` result is used as the
                    root and the ``root`` keyword is not consulted.

        Positional arguments may be strings (taken as ``ffpath``), Elements or
        ElementTrees (taken as the root). A positional argument only fills a
        slot that is still None, so keyword values and earlier positional
        values take precedence.

        If a path is known but no root is, the file at ``ffpath`` is parsed.

        Raises:
            ValueError: a positional argument is not a string, ElementTree or
                Element.
            IOError, et.ParseError: re-raised unchanged when parsing
                ``ffpath`` fails.
        """
        # Importing names from *a and **k or using defaults
        self.ffpath = k.get('ffpath')
        self.root = k['tree'].getroot() if 'tree' in k else k.get('root')

        if a:
            etype   = type(et.Element("a"))
            ettype  = type(et.ElementTree())
            for s in a:
                if isinstance(s, (etype, ettype)):
                    # Identity test on purpose: "== None" would go through the
                    # operand's own __eq__, "is None" never does.
                    if self.root is None:
                        self.root = s.getroot() if isinstance(s, ettype) else s
                elif isinstance(s, str):
                    if self.ffpath is None:
                        self.ffpath = s
                else:
                    raise ValueError("XML\'s initializer only accepts string, ElementTree or Element")

        if self.ffpath is not None and self.root is None:
            try:
                self.root = et.parse(self.ffpath).getroot()
            except (IOError, et.ParseError):
                # TODO Populate tree and save it
                raise
Example #3
Source File: test_base.py    From CumulusCI with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_transform__xml_parse_error(self):
        """_transform() raises etree.ParseError when Test.app holds non-XML text."""
        options = {"managed": False, "api_version": "47.0", "api_names": "Test"}
        task = create_task(ConcreteMetadataSingleEntityTransformTask, options)
        task.entity = "CustomApplication"

        with tempfile.TemporaryDirectory() as tmpdir:
            task._create_directories(tmpdir)

            # Write unparseable content to applications/Test.app under the
            # task's retrieve_dir.
            applications_dir = task.retrieve_dir / "applications"
            applications_dir.mkdir()
            broken_app = applications_dir / "Test.app"
            broken_app.write_text(">>>>>NOT XML<<<<<")

            with pytest.raises(etree.ParseError):
                task._transform()
Example #4
Source File: html.py    From ingestors with MIT License 6 votes vote down vote up
def extract_html_content(self, html_body, fix_html=True):
        """Parse an HTML string and emit its body and extracted text.

        Returns None immediately when ``html_body`` is None. ``fix_html`` is
        accepted but not used by this method.

        Raises ProcessingException when the HTML cannot be parsed, including
        after the retry with the XML encoding declaration removed.
        """
        if html_body is None:
            return

        try:
            doc = html.fromstring(html_body)
        except ValueError:
            # Ship around encoding declarations.
            # https://stackoverflow.com/questions/3402520
            try:
                html_body = self.RE_XML_ENCODING.sub('', html_body, count=1)
                doc = html.fromstring(html_body)
            except (ParserError, ParseError, ValueError):
                raise ProcessingException("HTML could not be parsed.")
        except (ParserError, ParseError):
            raise ProcessingException("HTML could not be parsed.")

        self.extract_html_header(doc)
        self.cleaner(doc)
        extracted_text = self.extract_html_text(doc)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, extracted_text)
Example #5
Source File: xml.py    From ingestors with MIT License 6 votes vote down vote up
def ingest(self, file_path):
        """Parse an XML file, render it through ``self.XSLT`` and emit the result.

        The size is taken from ``self.result.size`` when that is truthy,
        otherwise from the file on disk.

        Raises ProcessingException when the size exceeds ``self.MAX_SIZE`` or
        when the file cannot be parsed as XML.
        """
        size = self.result.size or os.path.getsize(file_path)
        if size > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

        try:
            document = etree.parse(file_path)
        except (ParserError, ParseError):
            raise ProcessingException("XML could not be parsed.")

        # Text is extracted from the parsed root before the XSLT is applied.
        text = self.extract_html_text(document.getroot())
        rendered = etree.XSLT(self.XSLT)(document)
        html_body = html.tostring(rendered, encoding=str, pretty_print=True)

        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text)
Example #6
Source File: translate.py    From odoo13-x64 with GNU General Public License v3.0 6 votes vote down vote up
def xml_translate(callback, value):
    """ Translate the text found in the XML string `value` by passing it
        through `callback`.

        A falsy `value` is returned unchanged. If an ``etree.ParseError`` is
        raised while processing `value` as XML, the value is wrapped in a
        ``<div>``, parsed with the HTML parser instead, and the wrapper tags
        are stripped from the serialized result.
    """
    if not value:
        return value

    try:
        return serialize_xml(
            translate_xml_node(parse_xml(value), callback, parse_xml, serialize_xml)
        )
    except etree.ParseError:
        # fallback for translated terms: use an HTML parser and wrap the term
        wrapped_root = parse_html(u"<div>%s</div>" % value)
        translated = translate_xml_node(wrapped_root, callback, parse_xml, serialize_xml)
        serialized = serialize_xml(translated)
        # drop the leading "<div>" and the trailing "</div>"
        return serialized[len("<div>"):-len("</div>")]
Example #7
Source File: connections.py    From python-gvm with GNU General Public License v3.0 5 votes vote down vote up
def _feed_xml(self, data):
        """Pass ``data`` to ``self._parser.feed``.

        Raises GvmError, carrying the offending data in its message and the
        original exception as second argument, when the parser raises
        ``etree.ParseError``.
        """
        try:
            self._parser.feed(data)
        except etree.ParseError as parse_error:
            message = (
                "Cannot parse XML response. Response data "
                "read {0}"
            ).format(data)
            raise GvmError(message, parse_error)
Example #8
Source File: Feed.py    From python-in-practice with GNU General Public License v3.0 5 votes vote down vote up
def read(feed, limit, timeout=10):
    """Download ``feed.url`` and parse it with ``_parse(data, limit)``.

    Returns a 2-tuple:
        (True, body)  - parsing produced content; ``body`` is that content
                        prefixed with an ``<h2>`` heading built from the
                        escaped ``feed.title``.
        (True, None)  - parsing produced nothing.
        (False, text) - one of the handled errors occurred; ``text`` names the
                        URL and the error.
    """
    try:
        with urllib.request.urlopen(feed.url, None, timeout) as response:
            payload = response.read()
        items = _parse(payload, limit)
        if not items:
            return True, None
        heading = "<h2>{}</h2>\n".format(escape(feed.title))
        return True, [heading] + items
    except (ValueError, urllib.error.HTTPError, urllib.error.URLError,
            etree.ParseError, socket.timeout) as err:
        return False, "Error: {}: {}".format(feed.url, err)
Example #9
Source File: crawler.py    From spidy with GNU General Public License v3.0 5 votes vote down vote up
def crawl(url, thread_id=0):
    """Fetch ``url`` and return the de-duplicated list of links found in it.

    Unless OVERRIDE_SIZE is set, a HEAD request is made first and SizeError is
    raised when the reported Content-Length exceeds 524288000 bytes. When
    SAVE_WORDS is set the words of the page are put on WORDS; when SAVE_PAGES
    is set the page is saved. A CRAWL log entry is written for ``thread_id``.
    """
    global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS
    if not OVERRIDE_SIZE:
        try:
            # Ask the server how many bytes the document has
            size_in_bytes = int(requests.head(url, headers=HEADER).headers['Content-Length'])
        except KeyError:
            # No Content-Length header came back; fall back to a tiny size
            size_in_bytes = 1
        if size_in_bytes > 524288000:  # more than 500 MB
            # Not handled here: the SizeError propagates to the caller and
            # nothing below is executed.
            raise SizeError

    page = requests.get(url, headers=HEADER)  # download the page itself

    word_list = []
    if SAVE_WORDS:
        word_list = make_words(page)
        for entry in word_list:
            WORDS.put(entry)

    try:
        # Resolve links against any <base> tag in the document, then collect them.
        resolved = resolve_base_href(page.content)
        links = [link for _element, _attribute, link, _pos in iterlinks(resolved)]
    except etree.ParseError:
        # Content that cannot be parsed yields no links.
        links = []
    links = list(set(links))

    if SAVE_PAGES:
        save_page(url, page)

    # Announce which link was crawled
    if SAVE_WORDS:
        summary = 'Found {0} links and {1} words on {2}'.format(len(links), len(word_list), url)
    else:
        summary = 'Found {0} links on {1}'.format(len(links), url)
    write_log('CRAWL', summary,
              worker=thread_id)
    return links
Example #10
Source File: http.py    From memorious with MIT License 5 votes vote down vote up
def html(self):
        """Return the parsed HTML of this response, computing it at most once.

        The result is stored in ``self._html``. It stays None when the content
        type is listed in NON_HTML, when ``self.raw`` is None or empty, or
        when parsing fails. If parsing ``self.text`` raises a ValueError that
        mentions an encoding declaration, the document is parsed from
        ``self.file_path`` instead.
        """
        if hasattr(self, '_html'):
            return self._html

        self._html = None
        if self.content_type in NON_HTML:
            return
        if self.raw is None or len(self.raw) == 0:
            return

        try:
            self._html = html.fromstring(self.text)
        except ValueError as value_error:
            # Only the "encoding declaration" complaint triggers the retry
            # from the file on disk; other ValueErrors leave _html as None.
            if 'encoding declaration' in str(value_error):
                self._html = html.parse(self.file_path.as_posix())
        except (etree.ParserError, etree.ParseError):
            pass
        return self._html
Example #11
Source File: http.py    From memorious with MIT License 5 votes vote down vote up
def json(self):
        """Return the JSON content of ``self.file_path``, loading it at most once.

        The decoded value is stored in ``self._json`` and reused afterwards.

        Raises ParseError when ``self.file_path`` is None.
        """
        if hasattr(self, '_json'):
            return self._json

        if self.file_path is None:
            raise ParseError("Cannot parse failed download.")
        with open(self.file_path, 'r') as fh:
            self._json = json.load(fh)
        return self._json
Example #12
Source File: confluence.py    From confluence-publisher with MIT License 5 votes vote down vote up
def feed(self, data):
        """Feed ``data`` to the wrapped parser and return its result.

        ``self._init_parser()`` is called first. If the parser raises
        ``etree.ParseError`` about an undefined entity and the entity name can
        be found in the error text, the name is registered in
        ``self.known_entity`` (mapped to itself) and the data is fed again.
        Any other ParseError is re-raised.
        """
        self._init_parser()
        try:
            return self._original_parser.feed(data)
        except etree.ParseError as _err:
            error_text = str(_err)
            if 'undefined entity' not in error_text:
                raise _err

            # fix unknown entity
            log.warning('WARNING {}'.format(error_text))
            found = re.search(r'&\w+;', error_text)
            if found is None:
                raise _err

            # strip the leading "&" and the trailing ";"
            entity_name = found.group()[1:-1]
            self.known_entity[entity_name] = entity_name
            return self.feed(data)
Example #13
Source File: wsman.py    From pypsrp with MIT License 4 votes vote down vote up
def invoke(self, action, resource_uri, resource, option_set=None,
               selector_set=None, timeout=None):
        """
        Build a WSMan envelope, send it through the transport and return the
        parsed response.

        :param action: passed to ``_create_header`` together with the other
            header-related arguments
        :param resource_uri: passed to ``_create_header``
        :param resource: an element appended to the s:Body of the envelope, or
            None to leave the body empty
        :param option_set: passed to ``_create_header``
        :param selector_set: passed to ``_create_header``
        :param timeout: passed to ``_create_header``
        :return: the element parsed from the response XML
        :raises WinRMError: the wsa:RelatesTo value of the response differs
            from the wsa:MessageID that was sent
        :raises WinRMTransportError: the transport failed and its response
            text could not be parsed as XML; otherwise the object returned by
            ``_parse_wsman_fault`` is raised instead
        """
        soap_ns = NAMESPACES['s']
        envelope = ET.Element("{%s}Envelope" % soap_ns)

        header = self._create_header(action, resource_uri, option_set,
                                     selector_set, timeout)
        envelope.append(header)

        body = ET.SubElement(envelope, "{%s}Body" % soap_ns)
        if resource is not None:
            body.append(resource)

        # Remember the id we send so the reply can be matched against it.
        message_id = header.find("wsa:MessageID", namespaces=NAMESPACES).text
        request_xml = ET.tostring(envelope, encoding='utf-8', method='xml')

        try:
            response = self.transport.send(request_xml)
        except WinRMTransportError as err:
            try:
                # if the error response is XML, raise what the fault parser
                # builds from it
                raise self._parse_wsman_fault(err.response_text)
            except ET.ParseError:
                # the error response is not XML, so fall back to the
                # transport error itself
                log.error("Failed to parse WSManFault message on WinRM error"
                          " response, raising original WinRMTransportError")
                raise err

        response_xml = ET.fromstring(response)
        relates_to = response_xml.find("s:Header/wsa:RelatesTo",
                                       namespaces=NAMESPACES).text

        if message_id == relates_to:
            return response_xml
        raise WinRMError("Received related id does not match related "
                         "expected message id: Sent: %s, Received: %s"
                         % (message_id, relates_to))
Example #14
Source File: xmlformat.py    From ksconf with Apache License 2.0 4 votes vote down vote up
def run(self, args):
        """Format each XML file in place and return an exit code.

        File names come from ``args.xml``, or from ``_stdin_iter()`` when the
        only entry is "-". Missing files are skipped, parse errors are
        recorded and processing continues, and any other exception stops the
        loop. A summary is written to stdout before returning.

        The result is EXIT_CODE_INTERNAL_ERROR after an unexpected exception,
        EXIT_CODE_BAD_CONF_FILE when the last failure was a parse error,
        EXIT_CODE_FORMAT_APPLIED when there was no error and at least one file
        changed, and EXIT_CODE_SUCCESS otherwise.
        """
        formatter = SplunkSimpleXmlFormatter()
        # Should we read a list of conf files from STDIN?
        read_names_from_stdin = len(args.xml) == 1 and args.xml[0] == "-"
        files = _stdin_iter() if read_names_from_stdin else args.xml

        stats = Counter()
        exit_code = EXIT_CODE_SUCCESS
        for path in files:
            stats["checked"] += 1
            if not os.path.isfile(path):
                self.stderr.write("Skipping missing file:  {0}\n".format(path))
                stats["missing"] += 1
                continue
            try:
                changed = formatter.format_xml(path, path, args.indent)
                if not changed:
                    if not args.quiet:
                        self.stderr.write("Already formatted {0}\n".format(path))
                    stats["no-action"] += 1
                else:
                    self.stderr.write("Replaced file {0} with formatted content\n".format(path))
                    stats["changed"] += 1
                self.stderr.flush()
            except etree.ParseError as e:
                # A file that does not parse is reported; the loop carries on.
                self.stderr.write("Error parsing file {0}:  {1}\n".format(path, e))
                self.stderr.flush()
                stats["error"] += 1
                exit_code = EXIT_CODE_BAD_CONF_FILE
            except Exception as e:  # pragma: no cover
                self.stderr.write("Unhandled top-level exception while parsing {0}.  "
                                  "Aborting.\n{1}\n".format(path, e))
                debug_traceback()
                stats["error"] += 1
                exit_code = EXIT_CODE_INTERNAL_ERROR
                break

        if not exit_code and stats["changed"] > 0:
            exit_code = EXIT_CODE_FORMAT_APPLIED

        # The statistics are always shown.
        self.stdout.write("Completed formatting {0[checked]} files.  rc={1} Breakdown:\n"
                          "   {0[changed]} files were formatted successfully.\n"
                          "   {0[no-action]} files were already formatted.\n"
                          "   {0[error]} files failed.\n".format(stats, exit_code))
        return exit_code
Example #15
Source File: feed.py    From castero with MIT License 4 votes vote down vote up
def __init__(self, url=None, file=None, text=None, **kwargs) -> None:
        """
        Create a feed from either a url or a file; exactly one of the two has
        to be given.

        Args:
            url: (optional) the url where the feed is located
            file: (optional) the file where the feed is located
            text: (optional) already-retrieved content of the feed; a url or
                file is still required, but when text is given it is parsed
                directly and the download step is skipped
            **kwargs: optional metadata (title, description, link,
                last_build_date, copyright). When a title is supplied the
                feed is marked as validated and nothing is retrieved or parsed

        Raises:
            FeedParseError: ``text`` could not be parsed as an XML document
        """
        # * Exactly one of url and file may be set. The first clause rejects
        # both being given; the second rejects both being None.
        assert (url is None or file is None) and (url is not file)

        self._url = url
        self._file = file
        self._tree = None
        self._validated = False

        self._title = kwargs.get('title')
        self._description = kwargs.get('description')
        self._link = kwargs.get('link')
        self._last_build_date = kwargs.get('last_build_date')
        self._copyright = kwargs.get('copyright')

        # a title being passed in is taken to mean that all metadata was
        # passed in and that the feed is valid
        if self._title is not None:
            self._validated = True
            return

        if text:
            # content was supplied up front; it still has to parse as XML
            try:
                self._tree = etree.fromstring(text)
            except etree.ParseError:
                raise FeedParseError(
                    "Unable to parse text as an XML document")
        else:
            # retrieve the feed and parse to XML document
            self._download_feed()

        # check that the XML document is a properly structured RSS feed
        self._validate_feed()
        # set this object's metadata using rss feed
        self._parse_metadata()
Example #16
Source File: feed.py    From castero with MIT License 4 votes vote down vote up
def _download_feed(self):
        """Load the feed from ``self._url`` or ``self._file`` into ``self._tree``.

        The url is used when it is set; otherwise the file is used when it is
        set. Only retrieval and XML parsing happen here; nothing checks that
        the document is an RSS feed.

        Raises:
            FeedParseError: the content could not be parsed as an XML document
            FeedDownloadError: (url only) the status code was not 200, or a
                requests exception occurred while downloading
            FeedLoadError: (file only) an IOError occurred while loading the
                file
        """
        if self._url is not None:
            # feed located at a url
            try:
                response = Net.Get(self._url)
                if response.status_code != 200:
                    raise FeedDownloadError(
                        "Did not receive an acceptable status code while"
                        " downloading the page. Expected 200, got: "
                        + str(response.status_code))
                try:
                    self._tree = etree.fromstring(response.content)
                except etree.ParseError:
                    raise FeedParseError(
                        "Unable to parse text as an XML document")
            except requests.exceptions.RequestException:
                raise FeedDownloadError(
                    "An exception occurred when attempting to download the"
                    " page")
        elif self._file is not None:
            # feed located in a file
            try:
                self._tree = etree.parse(self._file).getroot()
            except etree.ParseError:
                raise FeedParseError(
                    "Unable to parse text as an XML document")
            except IOError:
                raise FeedLoadError(
                    "An exception occurred when attempting to load the file")