Python lxml.etree.XMLSyntaxError() Examples

The following are 30 code examples of lxml.etree.XMLSyntaxError(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
Example #1
Source File: doctestcompare.py    From learn_python3_spider with MIT License 6 votes vote down vote up
def check_output(self, want, got, optionflags):
        """
        Compare ``want`` and ``got`` as parsed documents when a parser applies.

        When ``get_parser`` yields nothing, the comparison is delegated to
        ``OutputChecker.check_output`` (or to the temporarily installed
        super-call hook).  If either text fails to parse, the outputs are
        reported as not matching.
        """
        override = getattr(self, '_temp_override_self', None)
        if override is None:
            fallback = OutputChecker.check_output
        else:
            # Read the hook from the original object before ``self`` is
            # rebound to the override.
            fallback = self._temp_call_super_check_output
            self = override
        parse = self.get_parser(want, got, optionflags)
        if not parse:
            return fallback(self, want, got, optionflags)
        try:
            expected_doc = parse(want)
            actual_doc = parse(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(expected_doc, actual_doc)
Example #2
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_capi_file_output_jpg(self):
        """
        Test that tesseract CAPI calls create hocr output for jpgs.

        The test is skipped when ``libtesseract.so.3`` cannot be loaded.
        """

        try:
            ctypes.cdll.LoadLibrary('libtesseract.so.3')
        except OSError:
            # ctypes reports a missing or unloadable shared library as OSError.
            raise unittest.SkipTest

        self.tesseract.setup(tessdata=tessdata, implementation='capi')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_jpg.xml'),
                                           languages=['eng'],
                                           extended=False)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')
        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #3
Source File: base.py    From daf-recipes with GNU General Public License v3.0 6 votes vote down vote up
def _validate_document(self, document_string, harvest_object, validator=None):
        '''
        Validates an XML document with the default, or if present, the
        provided validators.

        It will create a HarvestObjectError for each validation error found,
        so they can be shown properly on the frontend.

        Returns a tuple, with a boolean showing whether the validation passed
        or not, the profile used and a list of errors (tuples with error
        message and error lines if present).
        '''
        if not validator:
            validator = self._get_validator()

        # Strip any XML declaration before parsing.  The pattern is a raw
        # string so the ``\?`` escapes reach the regex engine untouched.
        document_string = re.sub(r'<\?xml(.*)\?>', '', document_string)

        try:
            xml = etree.fromstring(document_string)
        except etree.XMLSyntaxError as e:
            # Record the parse failure against the harvest object and report
            # the document as invalid with no profile and no error list.
            self._save_object_error('Could not parse XML file: {0}'.format(str(e)), harvest_object, 'Import')
            return False, None, []
        # NOTE(review): the visible block ends here, so a document that parses
        # falls through without an explicit return; the validation step the
        # docstring describes is not shown -- confirm against the full source.
Example #4
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_direct_file_output_png(self):
        """
        Test that direct tesseract calls create hocr output for pngs.

        The test is skipped when no ``tesseract`` executable is found.
        """

        if not spawn.find_executable('tesseract'):
            raise unittest.SkipTest

        self.tesseract.setup(tessdata=tessdata, implementation='direct')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_png.xml'),
                                           languages=['eng'],
                                           extended=False)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')
        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #5
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_direct_file_output_tiff(self):
        """
        Test that direct tesseract calls create hocr output for tiffs.

        The test is skipped when no ``tesseract`` executable is found.
        """
        if not spawn.find_executable('tesseract'):
            raise unittest.SkipTest

        self.tesseract.setup(tessdata=tessdata, implementation='direct')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['eng'],
                                           extended=False)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')
        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #6
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_direct_file_output_jpg(self):
        """
        Test that direct tesseract calls create hocr output for jpgs.

        The test is skipped when no ``tesseract`` executable is found.
        """
        if not spawn.find_executable('tesseract'):
            raise unittest.SkipTest

        self.tesseract.setup(tessdata=tessdata, implementation='direct')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_jpg.xml'),
                                           languages=['eng'],
                                           extended=False)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')
        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #7
Source File: ironclaw_class.py    From warriorframework with Apache License 2.0 6 votes vote down vote up
def xml_to_xsd_validation(file_xml, file_xsd):
        """ Verify that the XML complies with the XSD
        Arguments:
            1. file_xml: Input xml file
            2. file_xsd: xsd file the xml is validated against
        Return:
            True when both files parse and the xml satisfies the schema,
            False when parsing fails or the schema assertion fails
        """
        try:
            print_info("Validating:{0}".format(file_xml))
            print_info("xsd_file:{0}".format(file_xsd))
            xml_doc = parse(file_xml)
            xsd_doc = parse(file_xsd)
            xmlschema = XMLSchema(xsd_doc)
            # assert_ raises AssertionError when the document does not
            # satisfy the schema; that case is handled below.
            xmlschema.assert_(xml_doc)
            return True

        except XMLSyntaxError as err:
            print_error("PARSING ERROR:{0}".format(err))
            return False

        except AssertionError as err:
            print_error("Incorrect XML schema: {0}".format(err))
            return False
Example #8
Source File: pyreact.py    From pypath with GNU General Public License v3.0 6 votes vote down vote up
def init_etree(self):
        """
        Set up the ``lxml.etree.iterparse`` iterator over ``self._biopax``.

        Not meant to be called directly; ``BioPaxReader.process()`` is the
        intended caller.
        """
        try:
            self.bp = etree.iterparse(self._biopax, events=('start', 'end'))
            # The first parse event carries the root element.
            first_event = next(self.bp)
            self.root = first_event[1]
        except etree.XMLSyntaxError:
            # Unparsable input: drop the iterator (``self.root`` is not
            # assigned on this path).
            self.bp = None

        self.used_elements = []
Example #9
Source File: __init__.py    From python-gvm with GNU General Public License v3.0 6 votes vote down vote up
def import_config(self, config: str) -> Any:
        """Import a scan config from XML

        Arguments:
            config: Scan Config XML as string to import. This XML must
                contain a :code:`<get_configs_response>` root element.

        Returns:
            The response. See :py:meth:`send_command` for details.

        Raises:
            RequiredArgument: if ``config`` is empty.
            InvalidArgument: if ``config`` is not well-formed XML.
        """
        if not config:
            raise RequiredArgument(
                function=self.import_config.__name__, argument='config'
            )

        command = XmlCommand("create_config")
        try:
            command.append_xml_str(config)
        except etree.XMLSyntaxError:
            raise InvalidArgument(
                function=self.import_config.__name__, argument='config'
            )
        else:
            return self._send_xml_command(command)
Example #10
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_capi_file_output_png(self):
        """
        Test that tesseract CAPI calls create hocr output for pngs.

        The test is skipped when ``libtesseract.so.3`` cannot be loaded.
        """

        try:
            ctypes.cdll.LoadLibrary('libtesseract.so.3')
        except OSError:
            # ctypes reports a missing or unloadable shared library as OSError.
            raise unittest.SkipTest

        self.tesseract.setup(tessdata=tessdata, implementation='capi')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_png.xml'),
                                           languages=['eng'],
                                           extended=False)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')
        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #11
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_capi_extended(self):
        """
        Test that the CAPI extended output contains character cuts in each
        ocr_line and character confidences in each ocrx_word.

        The test is skipped when ``libtesseract.so.3`` cannot be loaded.
        """

        try:
            ctypes.cdll.LoadLibrary('libtesseract.so.3')
        except OSError:
            # ctypes reports a missing or unloadable shared library as OSError.
            raise unittest.SkipTest
        self.tesseract.setup(tessdata=tessdata, implementation='capi')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['eng'],
                                           extended=True)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')

        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                h = etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
        # NOTE(review): findall() returns a list, never None, so these three
        # assertions cannot fail as written.  assertTrue(...) would check for
        # actual matches -- confirm the expected element names (and any
        # namespace) before tightening them.
        self.assertIsNotNone(h.findall(".//line"), msg='Tesseract did not write lines.')
        self.assertIsNotNone(h.findall(".//seg"), msg='Tesseract did not write segments.')
        self.assertIsNotNone(h.findall(".//g"), msg='Tesseract did not write graphemes.')
Example #12
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_direct_multiple(self):
        """
        Test that direct tesseract calls create hocr output for multiple
        languages.

        The test is skipped when no ``tesseract`` executable is found.
        """
        if not spawn.find_executable('tesseract'):
            raise unittest.SkipTest

        self.tesseract.setup(tessdata=tessdata, implementation='direct')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['grc', 'eng'],
                                           extended=False)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')
        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #13
Source File: xml.py    From kraken with Apache License 2.0 6 votes vote down vote up
def parse_xml(filename):
    """
    Parse a PageXML or ALTO file, detecting the format from the root tag.

    Args:
        filename (str): path to an XML file.

    Returns:
        The result of ``parse_alto`` or ``parse_page`` for the file, described
        as a dict {'image': impath, lines: [{'boundary': [[x0, y0], ...],
        'baseline': [[x0, y0], ...]}, {...}], 'text': 'apdjfqpf', 'script':
        'script_type'}, regions: {'region_type_0': [[[x0, y0], ...], ...],
        ...}}

    Raises:
        KrakenInputException: if the file is not well-formed XML or its root
        tag ends with neither ``alto`` nor ``PcGts``.
    """
    with open(filename, 'rb') as fp:
        try:
            doc = etree.parse(fp)
        except etree.XMLSyntaxError as e:
            raise KrakenInputException(f'Parsing {filename} failed: {e}')
    # Only the tail of the root tag is compared.
    root_tag = doc.getroot().tag
    if root_tag.endswith('alto'):
        return parse_alto(filename)
    if root_tag.endswith('PcGts'):
        return parse_page(filename)
    raise KrakenInputException(f'Unknown XML format in {filename}')
Example #14
Source File: main.py    From parserator with MIT License 6 votes vote down vote up
def __call__(self, parser, namespace, string, option_string):
        """
        Store the file name on the namespace along with its parsed XML root.

        ``namespace.xml`` is ``None`` when the file cannot be opened or when
        the parser reports 'Document is empty'; any other XML syntax error
        is turned into an ``argparse.ArgumentError``.
        """
        root = None
        try:
            with open(string, 'r') as handle:
                root = etree.parse(handle).getroot()
        except (OSError, IOError):
            # Unreadable file: there is no tree to expose.
            root = None
        except etree.XMLSyntaxError as err:
            is_empty_document = 'Document is empty' in str(err)
            if not is_empty_document:
                raise argparse.ArgumentError(self,
                                             "%s does not seem to be a valid xml file"
                                             % string)
            root = None

        setattr(namespace, self.dest, string)
        setattr(namespace, 'xml', root)
Example #15
Source File: files.py    From janeway with GNU Affero General Public License v3.0 6 votes vote down vote up
def transform_with_xsl(xml_path, xsl_path, recover=False):
    """
    Apply the XSL stylesheet at ``xsl_path`` to the XML document at ``xml_path``.

    :param xml_path: source passed to ``etree.parse`` for the XML document.
    :param xsl_path: source passed to ``etree.parse`` for the stylesheet.
    :param recover: when true, a malformed XML document is logged and
        re-parsed with a recovering parser, and a failing transformation is
        logged instead of raised.
    :return: the transformation result, or ``None`` when the transformation
        failed and ``recover`` is true.
    :raises etree.XMLSyntaxError: if the XML is malformed and ``recover`` is
        false.
    """
    try:
        xml_dom = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        if not recover:
            raise
        logger.error(e)
        xml_dom = etree.parse(xml_path, parser=etree.XMLParser(recover=True))

    xsl_transform = etree.XSLT(etree.parse(xsl_path))
    # Bind the result up front: with recover=True a failed transform is
    # swallowed below, and the return must not hit an unbound local.
    transformed_dom = None
    try:
        transformed_dom = xsl_transform(xml_dom)
    except Exception as err:
        logger.error(err)
        for xsl_error in xsl_transform.error_log:
            logger.error(xsl_error)
        if not recover:
            raise

    return transformed_dom
Example #16
Source File: nexpose.py    From nexpose-client-python with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def Open(self):
        """
        Log in to the Nexpose appliance and remember the returned session id.

        Raises an exception when a session is already open, when the login
        request fails, or when the response does not grant a session.
        """
        if self._session_id:
            raise SessionIsNotClosedException("Please close the session first!")
        try:
            login_reply = self._Execute_APIv1d1(self._login_request)
        except NexposeConnectionException as ex:
            # A connection error wrapping an XML syntax error gets a
            # dedicated hint; anything else is re-raised unchanged.
            wraps_syntax_error = isinstance(ex.inner_exception, etree.XMLSyntaxError)
            if wraps_syntax_error:
                raise NexposeException("Unexpected error! Is the Nexpose appliance activated?")
            raise ex
        login_ok = (login_reply.tag == "LoginResponse" and
                    login_reply.attrib["success"] == "1")
        if login_ok:
            self._session_id = login_reply.attrib["session-id"]
        if not self._session_id:
            raise NexposeFailureException("Login failure!")
Example #17
Source File: doctestcompare.py    From aws-lambda-lxml with GNU General Public License v3.0 6 votes vote down vote up
def check_output(self, want, got, optionflags):
        """
        Compare ``want`` and ``got`` as parsed documents when a parser applies.

        When ``get_parser`` yields nothing, the comparison is delegated to
        ``OutputChecker.check_output`` (or to the temporarily installed
        super-call hook).  If either text fails to parse, the outputs are
        reported as not matching.
        """
        override = getattr(self, '_temp_override_self', None)
        if override is None:
            fallback = OutputChecker.check_output
        else:
            # Read the hook from the original object before ``self`` is
            # rebound to the override.
            fallback = self._temp_call_super_check_output
            self = override
        parse = self.get_parser(want, got, optionflags)
        if not parse:
            return fallback(self, want, got, optionflags)
        try:
            expected_doc = parse(want)
            actual_doc = parse(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(expected_doc, actual_doc)
Example #18
Source File: doctestcompare.py    From aws-lambda-lxml with GNU General Public License v3.0 6 votes vote down vote up
def check_output(self, want, got, optionflags):
        """
        Compare ``want`` and ``got`` as parsed documents when a parser applies.

        When ``get_parser`` yields nothing, the comparison is delegated to
        ``OutputChecker.check_output`` (or to the temporarily installed
        super-call hook).  If either text fails to parse, the outputs are
        reported as not matching.
        """
        override = getattr(self, '_temp_override_self', None)
        if override is None:
            fallback = OutputChecker.check_output
        else:
            # Read the hook from the original object before ``self`` is
            # rebound to the override.
            fallback = self._temp_call_super_check_output
            self = override
        parse = self.get_parser(want, got, optionflags)
        if not parse:
            return fallback(self, want, got, optionflags)
        try:
            expected_doc = parse(want)
            actual_doc = parse(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(expected_doc, actual_doc)
Example #19
Source File: doctestcompare.py    From aws-lambda-lxml with GNU General Public License v3.0 6 votes vote down vote up
def check_output(self, want, got, optionflags):
        """
        Compare ``want`` and ``got`` as parsed documents when a parser applies.

        When ``get_parser`` yields nothing, the comparison is delegated to
        ``OutputChecker.check_output`` (or to the temporarily installed
        super-call hook).  If either text fails to parse, the outputs are
        reported as not matching.
        """
        override = getattr(self, '_temp_override_self', None)
        if override is None:
            fallback = OutputChecker.check_output
        else:
            # Read the hook from the original object before ``self`` is
            # rebound to the override.
            fallback = self._temp_call_super_check_output
            self = override
        parse = self.get_parser(want, got, optionflags)
        if not parse:
            return fallback(self, want, got, optionflags)
        try:
            expected_doc = parse(want)
            actual_doc = parse(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(expected_doc, actual_doc)
Example #20
Source File: doctestcompare.py    From stopstalk-deployment with MIT License 6 votes vote down vote up
def check_output(self, want, got, optionflags):
        """
        Compare ``want`` and ``got`` as parsed documents when a parser applies.

        When ``get_parser`` yields nothing, the comparison is delegated to
        ``OutputChecker.check_output`` (or to the temporarily installed
        super-call hook).  If either text fails to parse, the outputs are
        reported as not matching.
        """
        override = getattr(self, '_temp_override_self', None)
        if override is None:
            fallback = OutputChecker.check_output
        else:
            # Read the hook from the original object before ``self`` is
            # rebound to the override.
            fallback = self._temp_call_super_check_output
            self = override
        parse = self.get_parser(want, got, optionflags)
        if not parse:
            return fallback(self, want, got, optionflags)
        try:
            expected_doc = parse(want)
            actual_doc = parse(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(expected_doc, actual_doc)
Example #21
Source File: doctestcompare.py    From aws-lambda-lxml with GNU General Public License v3.0 6 votes vote down vote up
def check_output(self, want, got, optionflags):
        """
        Compare ``want`` and ``got`` as parsed documents when a parser applies.

        When ``get_parser`` yields nothing, the comparison is delegated to
        ``OutputChecker.check_output`` (or to the temporarily installed
        super-call hook).  If either text fails to parse, the outputs are
        reported as not matching.
        """
        override = getattr(self, '_temp_override_self', None)
        if override is None:
            fallback = OutputChecker.check_output
        else:
            # Read the hook from the original object before ``self`` is
            # rebound to the override.
            fallback = self._temp_call_super_check_output
            self = override
        parse = self.get_parser(want, got, optionflags)
        if not parse:
            return fallback(self, want, got, optionflags)
        try:
            expected_doc = parse(want)
            actual_doc = parse(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(expected_doc, actual_doc)
Example #22
Source File: get_ui.py    From adbui with MIT License 6 votes vote down vote up
def get_uis_by_xpath(self, xpath, is_update=True):
        """
        Find UI nodes by an XPath expression.

        :param xpath: XPath expression evaluated against ``self.xml``.
        :param is_update: when true, dump and re-parse the UI XML first
            (at most five attempts).
        :return: list with one ``get_ui_by_element`` result per match.
        """
        if is_update:
            xml_str = None
            attempt = 0
            while attempt < 5:
                attempt += 1
                try:
                    xml_str = self.adb_ext.dump()  # fetch the XML dump
                    self.__init_xml(xml_str)
                    break
                except etree.XMLSyntaxError:
                    logging.error('etree.XMLSyntaxError:\n')
                    if xml_str:
                        logging.error('xml str:{}'.format(xml_str))
        # On Python 2 the expression is decoded to unicode before use.
        if sys.version_info[0] < 3:
            xpath = xpath.decode('utf-8')
        matches = self.xml.xpath(xpath)
        return [self.get_ui_by_element(match) for match in matches]
Example #23
Source File: test_backends.py    From xblock-video with GNU General Public License v3.0 6 votes vote down vote up
def test_download_default_transcript(self, backend, download_transcript_mock, params):
        """
        Check that the default transcript is downloaded from a video platform
        API, once per outcome the mock class provides.
        """
        player_cls = self.player[backend]
        outcomes = download_transcript_mock.get_outcomes()
        for position, outcome in enumerate(outcomes):
            scenario = download_transcript_mock(event=outcome)
            self.mocked_objects = scenario.apply_mock(self.mocked_objects)
            try:
                transcript = player_cls(self.xblock).download_default_transcript(**params[position])
                message = ''
                expected_transcript = scenario.expected_value[0]
                self.assertIsInstance(transcript, unicode)
                self.assertEqual(transcript, expected_transcript)
            except VideoXBlockException as ex:
                message = ex.message
            except etree.XMLSyntaxError:
                message = 'XMLSyntaxError exception'
            # The last expected value is the message this scenario produces
            # (compared against '' when the download succeeded).
            self.assertIn(scenario.expected_value[-1], message)
            self.restore_mocked()
Example #24
Source File: test_tesseract.py    From nidaba with GNU General Public License v2.0 6 votes vote down vote up
def test_capi_multiple(self):
        """
        Test that tesseract CAPI calls create hocr output for multiple
        languages.

        The test is skipped when ``libtesseract.so.3`` cannot be loaded.
        """
        try:
            ctypes.cdll.LoadLibrary('libtesseract.so.3')
        except OSError:
            # ctypes reports a missing or unloadable shared library as OSError.
            raise unittest.SkipTest
        self.tesseract.setup(tessdata=tessdata, implementation='capi')
        ocr = self.tesseract.ocr_tesseract.run(('test', 'segmentation_tiff.xml'),
                                           languages=['grc', 'eng'],
                                           extended=False)
        outpath = os.path.join(self.storage_path, *ocr)
        self.assertTrue(os.path.isfile(outpath), msg='Tesseract did not '
                        'output a file!')
        try:
            # The context manager closes the handle even if parsing fails.
            with open(outpath) as fp:
                etree.parse(fp)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #25
Source File: process_forest.py    From process-forest with Apache License 2.0 5 votes vote down vote up
def get_entries(evtx):
    """
    Yield an ``Entry`` for each (xml, record) pair that
    ``evtx_file_xml_view`` produces for the file header; pairs whose
    ``Entry`` raises an XML syntax error are skipped.

    @rtype: generator of Entry
    """
    pairs = evtx_file_xml_view(evtx.get_file_header())
    for xml_text, rec in pairs:
        try:
            yield Entry(xml_text, rec)
        except etree.XMLSyntaxError:
            # Skip records whose XML cannot be parsed.
            continue
Example #26
Source File: doctestcompare.py    From stopstalk-deployment with MIT License 5 votes vote down vote up
def output_difference(self, example, got, optionflags):
        """
        Describe how ``got`` differs from the example's expected output.

        When no parser applies, or either text fails to parse, the plain
        ``OutputChecker.output_difference`` report is returned (preceded by
        the parse errors, if any).  Otherwise the report lists the formatted
        expected document, the formatted actual document and their diff.
        """
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError as exc:
                errors.append('In example: %s' % exc)
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError as exc:
                errors.append('In actual output: %s' % exc)
        if parser is None or errors:
            plain_report = OutputChecker.output_difference(
                self, example, got, optionflags)
            if not errors:
                return plain_report
            errors.append(plain_report)
            return '\n'.join(errors)
        html = parser is html_fromstring
        return '\n'.join([
            'Expected:',
            self.format_doc(want_doc, html, 2),
            'Got:',
            self.format_doc(got_doc, html, 2),
            'Diff:',
            self.collect_diff(want_doc, got_doc, html, 2),
        ])
Example #27
Source File: test_ocropus.py    From nidaba with GNU General Public License v2.0 5 votes vote down vote up
def test_file_outpath_jpg(self):
        """
        Test that ocropus creates hocr output for jpgs.
        """
        ocr = self.ocropus.ocr_ocropus.run((('test', 'segmentation.xml'),
                                            ('test', 'image_jpg.jpg')),
                                           model='ocropus')
        try:
            parser = etree.HTMLParser()
            # The context manager closes the handle even if parsing fails.
            with open(os.path.join(self.storage_path, *ocr)) as fp:
                etree.parse(fp, parser)
        except etree.XMLSyntaxError:
            self.fail(msg='The output was not valid html/xml!')
Example #28
Source File: ooyala_player.py    From xblock-ooyala with GNU Affero General Public License v3.0 5 votes vote down vote up
def studio_submit(self, submissions, suffix=''):
        """
        Store the submitted settings when the submitted XML config parses.

        Returns a dict with ``result`` set to ``'error'`` (plus the parser
        message) when ``xml_config`` is not well-formed, and ``'success'``
        once every submitted field has been copied onto the block.
        """
        xml_config = submissions['xml_config']
        try:
            etree.parse(StringIO(xml_config))
        except etree.XMLSyntaxError as e:
            # Nothing is stored when the config does not parse.
            return {
                'result': 'error',
                'message': e.message
            }

        self.xml_config = xml_config
        self.display_name = submissions['display_name']
        self.content_id = submissions['content_id'].strip()
        self.transcript_file_id = submissions['transcript_file_id'].strip()
        self.enable_player_token = submissions['enable_player_token']
        self.partner_code = submissions['partner_code']
        self.api_key = submissions['api_key']
        self.api_secret_key = submissions['api_secret_key']
        self.api_key_3play = submissions['api_key_3play']
        self.expiration_time = submissions['expiration_time']
        self.width = submissions['width']
        self.height = submissions['height']
        self.disable_cc_and_translations = submissions['cc_disable']

        return {
            'result': 'success',
        }
Example #29
Source File: test_metadata.py    From pikepdf with Mozilla Public License 2.0 5 votes vote down vote up
def test_truncated_xml(resources, idx):
    """Metadata truncated at ``idx`` may fail in strict mode but not in lenient mode."""
    pdf = Pdf.open(resources / 'sandwich.pdf')
    original_xmp = pdf.Root.Metadata.read_bytes()
    assume(idx < len(original_xmp))

    # Replace the metadata stream with its first ``idx`` bytes.
    truncated = original_xmp[:idx]
    pdf.Root.Metadata = pdf.make_stream(truncated)
    try:
        with pdf.open_metadata(strict=True) as xmp:
            xmp['pdfaid:part'] = '5'
    except (XMLSyntaxError, AssertionError):
        # Strict mode is allowed to reject the truncated data.
        pass

    # Non-strict mode has to cope with the same truncated data.
    with pdf.open_metadata(strict=False) as xmp:
        xmp['pdfaid:part'] = '7'
Example #30
Source File: _fc.py    From pypowervm with Apache License 2.0 5 votes vote down vote up
def _parse_pg83_xml(xml_resp):
    """Parse LUARecovery XML response, looking for pg83 descriptor.

    :param xml_resp: Text of the LUARecovery Job output.  It may hold several
                     XML documents, each introduced by its own
                     ``<?xml version="1.0"?>`` declaration.
    :return: pg83 descriptor text, or None if not found.
    """
    # QUERY_INVENTORY response may contain more than one element.  Each will be
    # delimited by its own <?xml?> tag.  etree will only parse one at a time.
    for chunk in xml_resp.split('<?xml version="1.0"?>'):
        if not chunk:
            continue
        try:
            parsed = etree.fromstring(chunk)
        except etree.XMLSyntaxError as e:
            # A bad chunk is logged and skipped; later chunks are still tried.
            LOG.warning(_('QUERY_INVENTORY produced invalid chunk of XML '
                          '(%(chunk)s).  Error: %(err)s'),
                        {'chunk': chunk, 'err': e.args[0]})
            continue
        # iter() replaces the deprecated getiterator() and walks the same
        # elements in document order.
        for elem in parsed.iter():
            if (etree.QName(elem.tag).localname == 'PhysicalVolume_base' and
                    elem.attrib.get('desType') == "NAA"):
                return elem.attrib.get('descriptor')
    LOG.warning(_('Failed to find pg83 descriptor in XML output:\n%s'),
                xml_resp)
    return None