Python lxml.etree.Comment() Examples

The following are 30 code examples of lxml.etree.Comment(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
Example #1
Source File: reference.py    From pyfixmsg with Apache License 2.0 6 votes vote down vote up
def _extract_composition(element, spec):
    """
    Build the composition of a nested structure (Component, Group or MsgType)
    from the direct children of an XML spec element.

    Returns a list of ``(item, required)`` pairs, where ``required`` is True
    when the child's ``required`` attribute equals ``"Y"``.  Raises ValueError
    for a child that is not a field, a component, a group or (when ``parse``
    comes from lxml.etree) a comment.
    """
    composition = []
    for child in list(element):
        kind = child.tag
        if kind == "field":
            item = spec.tags.by_name(child.get('name'))
        elif kind == 'component':
            item = Component(child, spec)
        elif kind == 'group':
            item = Group.from_element(child, spec)
        elif (parse.__module__ == 'lxml.etree') and (kind == Comment):
            # Comments contribute nothing to the composition.
            continue
        else:
            raise ValueError("Could not process element '{}'".format(kind))
        composition.append((item, child.get('required') == "Y"))
    return composition
Example #2
Source File: process_includes.py    From nyoka with Apache License 2.0 6 votes vote down vote up
def collect_inserts_aux(child, params, inserts, options):
    """Resolve the reference held by ``child`` and gather what it pulls in.

    The content returned by ``resolve_ref`` is parsed into a root element.
    Every non-comment child of that root whose tag is neither
    ``{namespace}include`` nor ``{namespace`` is appended to ``inserts``,
    preceded by a comment holding the serialized ``child``.  The parsed root
    is then handed to ``collect_inserts``.

    ``params.base_url`` is put back to the value it had on entry before
    returning.  Returns the list of roots collected (empty when the reference
    resolves to ``None``).
    """
    previous_base_url = params.base_url
    collected_roots = []
    content = resolve_ref(child, params, options)
    if content is not None:
        parsed_root = etree.fromstring(content, base_url=params.base_url)
        collected_roots.append(parsed_root)
        for node in parsed_root:
            if isinstance(node, etree._Comment):
                continue
            ns = node.nsmap[node.prefix]
            if node.tag in ('{%s}include' % (ns, ), '{%s' % (ns, )):
                continue
            # Record where the inserted content came from.
            marker = etree.Comment(etree.tostring(child))
            marker.tail = '\n'
            inserts.append(marker)
            inserts.append(node)
        collected_roots.extend(
            collect_inserts(parsed_root, params, inserts, options))
    params.base_url = previous_base_url
    return collected_roots
Example #3
Source File: etree_lxml.py    From stopstalk-deployment with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``base`` node-type constants.

    Depending on the node this is ``base.TEXT``, ``base.DOCUMENT``,
    ``base.DOCTYPE``, ``base.COMMENT``, ``base.ENTITY`` or ``base.ELEMENT``,
    followed by the details for that kind of node.  For elements the
    attributes are returned as a dict keyed by ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #4
Source File: _html5builder.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def __init__(self, *args, **kwargs):
    """Pick the element and comment classes, then run the base TreeBuilder initializer.

    ``elementClass`` is the ``Element`` of the module that
    ``etree_builders.getETreeModule`` returns for ``html``; ``commentClass``
    is the ``Comment`` of the module it returns for ``etree``.
    """
    html_module = etree_builders.getETreeModule(html, fullTree=False)
    xml_module = etree_builders.getETreeModule(etree, fullTree=False)
    self.elementClass = html_module.Element
    self.commentClass = xml_module.Comment
    _base.TreeBuilder.__init__(self, *args, **kwargs)
Example #5
Source File: _html5builder.py    From stopstalk-deployment with MIT License 5 votes vote down vote up
def __init__(self, *args, **kwargs):
    """Pick the element and comment classes, then run the base TreeBuilder initializer.

    ``elementClass`` is the ``Element`` of the module that
    ``etree_builders.getETreeModule`` returns for ``html``; ``commentClass``
    is the ``Comment`` of the module it returns for ``etree``.
    """
    html_module = etree_builders.getETreeModule(html, fullTree=False)
    xml_module = etree_builders.getETreeModule(etree, fullTree=False)
    self.elementClass = html_module.Element
    self.commentClass = xml_module.Comment
    _base.TreeBuilder.__init__(self, *args, **kwargs)
Example #6
Source File: clean.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Delegates to ``self._kill_elements``, restricted to ``etree.Comment``
    nodes, with a predicate that searches each comment's text using
    ``_conditional_comment_re``.
    """
    # The former ``bad = []`` local was never read, so it has been dropped.
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)
Example #7
Source File: markup.py    From ChemDataExtractor with MIT License 5 votes vote down vote up
def _parse_element_r(self, el, specials, refs, id=None, element_cls=Paragraph):
    """Recursively parse HTML/XML element and its children into a list of Document elements.

    Comments and processing instructions yield an empty list; an element
    present in ``specials`` yields its ``specials`` entry unchanged.
    """
    skipped_tags = {etree.Comment, etree.ProcessingInstruction}
    if el.tag in skipped_tags:
        return []
    if el in specials:
        return specials[el]

    id = el.get('id', id)
    references = refs.get(el, [])
    parsed = []
    if el.text is not None:
        parsed.append(element_cls(six.text_type(el.text), id=id, references=references))
    elif references:
        parsed.append(element_cls('', id=id, references=references))

    for child in el:
        # <br> is technically inline, but it must still split the text.
        if child.tag not in skipped_tags and child.tag.lower() == 'br':
            parsed.append(element_cls(''))

        nested = self._parse_element_r(child, specials=specials, refs=refs, id=id, element_cls=element_cls)
        # Join the first nested item onto the previous one when the child is
        # inline and both are Text/Sentence objects of exactly the same type.
        can_merge = (
            self._is_inline(child)
            and len(parsed) > 0
            and len(nested) > 0
            and isinstance(parsed[-1], (Text, Sentence))
            and isinstance(nested[0], (Text, Sentence))
            and type(parsed[-1]) == type(nested[0])
        )
        if can_merge:
            parsed[-1] += nested.pop(0)
        parsed.extend(nested)

        if child.tail is None:
            continue
        if self._is_inline(child) and len(parsed) > 0 and isinstance(parsed[-1], element_cls):
            parsed[-1] += element_cls(six.text_type(child.tail), id=id)
        else:
            parsed.append(element_cls(six.text_type(child.tail), id=id))
    return parsed
Example #8
Source File: markup.py    From ChemDataExtractor with MIT License 5 votes vote down vote up
def _is_inline(self, element):
    """Return True if an element is inline.

    Comments and processing instructions are never inline; otherwise the
    lower-cased tag is looked up in ``self.inline_elements``.
    """
    tag = element.tag
    if tag in {etree.Comment, etree.ProcessingInstruction}:
        return False
    return tag.lower() in self.inline_elements
Example #9
Source File: etree_lxml.py    From python with Apache License 2.0 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``base`` node-type constants.

    Depending on the node this is ``base.TEXT``, ``base.DOCUMENT``,
    ``base.DOCTYPE``, ``base.COMMENT``, ``base.ENTITY`` or ``base.ELEMENT``,
    followed by the details for that kind of node.  For elements the
    attributes are returned as a dict keyed by ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #10
Source File: lxmletree.py    From Flask-P2P with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``_base`` node-type constants.

    Depending on the node this is ``_base.TEXT``, ``_base.DOCUMENT``,
    ``_base.DOCTYPE``, ``_base.COMMENT``, ``_base.ENTITY`` or
    ``_base.ELEMENT``, followed by the details for that kind of node.  For
    elements the attributes are returned as a dict keyed by
    ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return _base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (_base.DOCUMENT,)

    if isinstance(node, Doctype):
        return _base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text; its
    # wrapped object is passed through as-is.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return _base.TEXT, node.obj

    if node.tag == etree.Comment:
        return _base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return _base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (_base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #11
Source File: pcbdraw.py    From PcbDraw with MIT License 5 votes vote down vote up
def component_from_library(lib, name, value, ref, pos, comp, highlight, silent):
    """Add the SVG model of one component to ``comp["container"]``.

    Nothing happens when ``name`` is empty or when ``comp["filter"]`` is set
    and does not contain ``ref``.  When no model file is found, a warning is
    printed (unless ``silent``) and, if ``comp["placeholder"]`` is truthy, a
    red rectangle is added instead.

    Otherwise a ``lib:name`` comment and a ``<g>`` group holding the model's
    SVG content are appended, the element with id ``origin`` (if any) is
    removed after its position is read, and the group receives a transform
    built from ``pos`` and that origin.  When ``ref`` is listed in
    ``highlight["items"]``, ``build_highlight`` is called with the model's
    size taken from ``width``/``height`` or, failing that, ``viewBox``.
    """
    if not name:
        return
    if comp["filter"] is not None and ref not in comp["filter"]:
        return

    model_path = get_model_file(comp["libraries"], lib, name, ref, comp["remapping"])
    if not model_path:
        if not silent:
            print("Warning: component '{}' for footprint '{}' from library '{}' was not found".format(name, ref, lib))
        if comp["placeholder"]:
            etree.SubElement(comp["container"], "rect", x=str(ki2dmil(pos[0]) - 150), y=str(ki2dmil(pos[1]) - 150),
                             width="300", height="300", style="fill:red;")
        return

    comp["container"].append(etree.Comment("{}:{}".format(lib, name)))
    group = etree.SubElement(comp["container"], "g")
    svg_tree = read_svg_unique(model_path)
    for piece in extract_svg_content(svg_tree):
        group.append(piece)

    origin_x = origin_y = 0
    origin = group.find(".//*[@id='origin']")
    if origin is None:
        print("Warning: component '{}' from library '{}' has no ORIGIN".format(name, lib))
    else:
        origin_x, origin_y = element_position(origin, root=group)
        origin.getparent().remove(origin)

    group.attrib["transform"] = "translate({} {}) scale(393.700787402) rotate({}) translate({}, {})".format(
        ki2dmil(pos[0]), ki2dmil(pos[1]),
        -math.degrees(pos[2]), -origin_x, -origin_y)

    if ref not in highlight["items"]:
        return

    # Work out the model's size, preferring explicit width/height over viewBox.
    if "width" in svg_tree.attrib and "height" in svg_tree.attrib:
        w = to_user_units(svg_tree.attrib["width"])
        h = to_user_units(svg_tree.attrib["height"])
    elif "viewBox" in svg_tree.attrib:
        viewbox = re.split(" |,", svg_tree.attrib["viewBox"])
        w = to_user_units(viewbox[2])
        h = to_user_units(viewbox[3])
    else:
        print("Warning: component '{}' from library '{}' has no viewBox. Cannot highlight".format(name, lib))
        return
    build_highlight(highlight, w, h, pos, (origin_x, origin_y), ref)
Example #12
Source File: etree_lxml.py    From Weapon-Detection-And-Classification with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``base`` node-type constants.

    Depending on the node this is ``base.TEXT``, ``base.DOCUMENT``,
    ``base.DOCTYPE``, ``base.COMMENT``, ``base.ENTITY`` or ``base.ELEMENT``,
    followed by the details for that kind of node.  For elements the
    attributes are returned as a dict keyed by ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #13
Source File: etree_lxml.py    From planespotter with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``base`` node-type constants.

    Depending on the node this is ``base.TEXT``, ``base.DOCUMENT``,
    ``base.DOCTYPE``, ``base.COMMENT``, ``base.ENTITY`` or ``base.ELEMENT``,
    followed by the details for that kind of node.  For elements the
    attributes are returned as a dict keyed by ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #14
Source File: lxmletree.py    From MARA_Framework with GNU Lesser General Public License v3.0 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``_base`` node-type constants.

    Depending on the node this is ``_base.TEXT``, ``_base.DOCUMENT``,
    ``_base.DOCTYPE``, ``_base.COMMENT``, ``_base.ENTITY`` or
    ``_base.ELEMENT``, followed by the details for that kind of node.  For
    elements the attributes are returned as a dict keyed by
    ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return _base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (_base.DOCUMENT,)

    if isinstance(node, Doctype):
        return _base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text; its
    # wrapped object is passed through as-is.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return _base.TEXT, node.obj

    if node.tag == etree.Comment:
        return _base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return _base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (_base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #15
Source File: lxmletree.py    From Financial-Portfolio-Flask with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``_base`` node-type constants.

    Depending on the node this is ``_base.TEXT``, ``_base.DOCUMENT``,
    ``_base.DOCTYPE``, ``_base.COMMENT``, ``_base.ENTITY`` or
    ``_base.ELEMENT``, followed by the details for that kind of node.  For
    elements the attributes are returned as a dict keyed by
    ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return _base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (_base.DOCUMENT,)

    if isinstance(node, Doctype):
        return _base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text; its
    # wrapped object is passed through as-is.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return _base.TEXT, node.obj

    if node.tag == etree.Comment:
        return _base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return _base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (_base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #16
Source File: etree_lxml.py    From learn_python3_spider with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``base`` node-type constants.

    Depending on the node this is ``base.TEXT``, ``base.DOCUMENT``,
    ``base.DOCTYPE``, ``base.COMMENT``, ``base.ENTITY`` or ``base.ELEMENT``,
    followed by the details for that kind of node.  For elements the
    attributes are returned as a dict keyed by ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #17
Source File: etree_lxml.py    From scylla with Apache License 2.0 5 votes vote down vote up
def getNodeDetails(self, node):
    """Describe ``node`` as a tuple led by one of the ``base`` node-type constants.

    Depending on the node this is ``base.TEXT``, ``base.DOCUMENT``,
    ``base.DOCTYPE``, ``base.COMMENT``, ``base.ENTITY`` or ``base.ELEMENT``,
    followed by the details for that kind of node.  For elements the
    attributes are returned as a dict keyed by ``(namespace, name)``.
    """
    # Text node: an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, attr_name = node
        assert attr_name in ("text", "tail"), "Text nodes are text or tail, found %s" % attr_name
        return base.TEXT, ensure_str(getattr(owner, attr_name))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper that exposes no tag is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        # Slice off the leading "&" and the trailing ";".
        return base.ENTITY, ensure_str(node.text)[1:-1]

    # Anything else is assumed to be an ordinary element.
    tag_text = ensure_str(node.tag)
    tag_match = tag_regexp.match(tag_text)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, tag_text

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        name_match = tag_regexp.match(attr_name)
        if name_match:
            key = (name_match.group(1), name_match.group(2))
        else:
            key = (None, attr_name)
        attrs[key] = attr_value

    xml_name = self.filter.fromXmlName(tag)
    children_or_text = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, xml_name, attrs, children_or_text)
Example #18
Source File: clean.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Delegates to ``self._kill_elements``, restricted to ``etree.Comment``
    nodes, with a predicate that searches each comment's text using
    ``_conditional_comment_re``.
    """
    # The former ``bad = []`` local was never read, so it has been dropped.
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)
Example #19
Source File: _html5builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def insertRoot(self, name):
    """Create the document root and register it as the first open element.

    An ``<html></html>`` document, preceded by a DOCTYPE declaration when
    ``self.doctype`` has a name, is parsed with ``html.fromstring``.  Each
    entry of ``self.initialComments`` is added as a comment before the
    root, a new document object is created around the root's tree, and a
    wrapper element named ``name`` is appended to the document's child
    nodes and to ``self.openElements``.
    """
    doctype = self.doctype
    doctype_decl = ''
    if doctype and doctype.name:
        doctype_decl = '<!DOCTYPE %s' % doctype.name
        if not (doctype.publicId is None and doctype.systemId is None):
            doctype_decl += ' PUBLIC "%s" "%s"' % (doctype.publicId,
                                                   doctype.systemId)
        doctype_decl += '>'
    root = html.fromstring(doctype_decl + '<html></html>')

    # Put the initial comments in front of the root element.
    for comment_text in self.initialComments:
        root.addprevious(etree.Comment(comment_text))

    # Build the document object and hand it the parsed tree.
    self.document = self.documentClass()
    self.document._elementTree = root.getroottree()

    # Register the root in the child list and on the open-element stack.
    wrapper = self.elementClass(name)
    wrapper._element = root
    self.document.childNodes.append(wrapper)
    self.openElements.append(wrapper)

    self.rootInserted = True
Example #20
Source File: _html5builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, *args, **kwargs):
    """Pick the element and comment classes, then run the base TreeBuilder initializer.

    ``elementClass`` is the ``Element`` of the module that
    ``etree_builders.getETreeModule`` returns for ``html``; ``commentClass``
    is the ``Comment`` of the module it returns for ``etree``.
    """
    html_module = etree_builders.getETreeModule(html, fullTree=False)
    xml_module = etree_builders.getETreeModule(etree, fullTree=False)
    self.elementClass = html_module.Element
    self.commentClass = xml_module.Comment
    _base.TreeBuilder.__init__(self, *args, **kwargs)
Example #21
Source File: _html5builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def insertRoot(self, name):
    """Create the document root and register it as the first open element.

    An ``<html></html>`` document, preceded by a DOCTYPE declaration when
    ``self.doctype`` has a name, is parsed with ``html.fromstring``.  Each
    entry of ``self.initialComments`` is added as a comment before the
    root, a new document object is created around the root's tree, and a
    wrapper element named ``name`` is appended to the document's child
    nodes and to ``self.openElements``.
    """
    doctype = self.doctype
    doctype_decl = ''
    if doctype and doctype.name:
        doctype_decl = '<!DOCTYPE %s' % doctype.name
        if not (doctype.publicId is None and doctype.systemId is None):
            doctype_decl += ' PUBLIC "%s" "%s"' % (doctype.publicId,
                                                   doctype.systemId)
        doctype_decl += '>'
    root = html.fromstring(doctype_decl + '<html></html>')

    # Put the initial comments in front of the root element.
    for comment_text in self.initialComments:
        root.addprevious(etree.Comment(comment_text))

    # Build the document object and hand it the parsed tree.
    self.document = self.documentClass()
    self.document._elementTree = root.getroottree()

    # Register the root in the child list and on the open-element stack.
    wrapper = self.elementClass(name)
    wrapper._element = root
    self.document.childNodes.append(wrapper)
    self.openElements.append(wrapper)

    self.rootInserted = True
Example #22
Source File: _html5builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, *args, **kwargs):
    """Pick the element and comment classes, then run the base TreeBuilder initializer.

    ``elementClass`` is the ``Element`` of the module that
    ``etree_builders.getETreeModule`` returns for ``html``; ``commentClass``
    is the ``Comment`` of the module it returns for ``etree``.
    """
    html_module = etree_builders.getETreeModule(html, fullTree=False)
    xml_module = etree_builders.getETreeModule(etree, fullTree=False)
    self.elementClass = html_module.Element
    self.commentClass = xml_module.Comment
    _base.TreeBuilder.__init__(self, *args, **kwargs)
Example #23
Source File: clean.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Delegates to ``self._kill_elements``, restricted to ``etree.Comment``
    nodes, with a predicate that searches each comment's text using
    ``_conditional_comment_re``.
    """
    # The former ``bad = []`` local was never read, so it has been dropped.
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)
Example #24
Source File: _html5builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def insertRoot(self, name):
    """Create the document root and register it as the first open element.

    An ``<html></html>`` document, preceded by a DOCTYPE declaration when
    ``self.doctype`` has a name, is parsed with ``html.fromstring``.  Each
    entry of ``self.initialComments`` is added as a comment before the
    root, a new document object is created around the root's tree, and a
    wrapper element named ``name`` is appended to the document's child
    nodes and to ``self.openElements``.
    """
    doctype = self.doctype
    doctype_decl = ''
    if doctype and doctype.name:
        doctype_decl = '<!DOCTYPE %s' % doctype.name
        if not (doctype.publicId is None and doctype.systemId is None):
            doctype_decl += ' PUBLIC "%s" "%s"' % (doctype.publicId,
                                                   doctype.systemId)
        doctype_decl += '>'
    root = html.fromstring(doctype_decl + '<html></html>')

    # Put the initial comments in front of the root element.
    for comment_text in self.initialComments:
        root.addprevious(etree.Comment(comment_text))

    # Build the document object and hand it the parsed tree.
    self.document = self.documentClass()
    self.document._elementTree = root.getroottree()

    # Register the root in the child list and on the open-element stack.
    wrapper = self.elementClass(name)
    wrapper._element = root
    self.document.childNodes.append(wrapper)
    self.openElements.append(wrapper)

    self.rootInserted = True
Example #25
Source File: clean.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def kill_conditional_comments(self, doc):
    """
    IE conditional comments basically embed HTML that the parser
    doesn't normally see.  We can't allow anything like that, so
    we'll kill any comments that could be conditional.

    Delegates to ``self._kill_elements``, restricted to ``etree.Comment``
    nodes, with a predicate that searches each comment's text using
    ``_conditional_comment_re``.
    """
    # The former ``bad = []`` local was never read, so it has been dropped.
    self._kill_elements(
        doc, lambda el: _conditional_comment_re.search(el.text),
        etree.Comment)
Example #26
Source File: _html5builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def insertRoot(self, name):
    """Create the document root and register it as the first open element.

    An ``<html></html>`` document, preceded by a DOCTYPE declaration when
    ``self.doctype`` has a name, is parsed with ``html.fromstring``.  Each
    entry of ``self.initialComments`` is added as a comment before the
    root, a new document object is created around the root's tree, and a
    wrapper element named ``name`` is appended to the document's child
    nodes and to ``self.openElements``.
    """
    doctype = self.doctype
    doctype_decl = ''
    if doctype and doctype.name:
        doctype_decl = '<!DOCTYPE %s' % doctype.name
        if not (doctype.publicId is None and doctype.systemId is None):
            doctype_decl += ' PUBLIC "%s" "%s"' % (doctype.publicId,
                                                   doctype.systemId)
        doctype_decl += '>'
    root = html.fromstring(doctype_decl + '<html></html>')

    # Put the initial comments in front of the root element.
    for comment_text in self.initialComments:
        root.addprevious(etree.Comment(comment_text))

    # Build the document object and hand it the parsed tree.
    self.document = self.documentClass()
    self.document._elementTree = root.getroottree()

    # Register the root in the child list and on the open-element stack.
    wrapper = self.elementClass(name)
    wrapper._element = root
    self.document.childNodes.append(wrapper)
    self.openElements.append(wrapper)

    self.rootInserted = True
Example #27
Source File: _html5builder.py    From aws-lambda-lxml with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, *args, **kwargs):
    """Pick the element and comment classes, then run the base TreeBuilder initializer.

    ``elementClass`` is the ``Element`` of the module that
    ``etree_builders.getETreeModule`` returns for ``html``; ``commentClass``
    is the ``Comment`` of the module it returns for ``etree``.
    """
    html_module = etree_builders.getETreeModule(html, fullTree=False)
    xml_module = etree_builders.getETreeModule(etree, fullTree=False)
    self.elementClass = html_module.Element
    self.commentClass = xml_module.Comment
    _base.TreeBuilder.__init__(self, *args, **kwargs)
Example #28
Source File: etree_lxml.py    From Building-Recommendation-Systems-with-Python with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Classify ``node`` and return the tuple that describes it.

    The first item of the result is one of the ``base`` node-type constants
    (``TEXT``, ``DOCUMENT``, ``DOCTYPE``, ``COMMENT``, ``ENTITY`` or
    ``ELEMENT``); the remaining items depend on that type.  For elements the
    tuple carries the namespace, the tag name passed through
    ``self.filter.fromXmlName``, a dict of attributes keyed by
    ``(namespace, name)``, and a truthy value when the element has children
    or text.
    """
    # Text is handed in as an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, key = node
        assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
        return base.TEXT, ensure_str(getattr(owner, key))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper without a ``tag`` attribute is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

    # Anything left is assumed to be an ordinary element.
    qualified_tag = ensure_str(node.tag)
    tag_match = tag_regexp.match(qualified_tag)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, qualified_tag

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        attr_match = tag_regexp.match(attr_name)
        if attr_match:
            attr_key = (attr_match.group(1), attr_match.group(2))
        else:
            attr_key = (None, attr_name)
        attrs[attr_key] = attr_value

    has_content = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
            attrs, has_content)
Example #29
Source File: etree_lxml.py    From pySINDy with MIT License 5 votes vote down vote up
def getNodeDetails(self, node):
    """Classify ``node`` and return the tuple that describes it.

    The first item of the result is one of the ``base`` node-type constants
    (``TEXT``, ``DOCUMENT``, ``DOCTYPE``, ``COMMENT``, ``ENTITY`` or
    ``ELEMENT``); the remaining items depend on that type.  For elements the
    tuple carries the namespace, the tag name passed through
    ``self.filter.fromXmlName``, a dict of attributes keyed by
    ``(namespace, name)``, and a truthy value when the element has children
    or text.
    """
    # Text is handed in as an (element, "text" | "tail") pair.
    if isinstance(node, tuple):
        owner, key = node
        assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
        return base.TEXT, ensure_str(getattr(owner, key))

    if isinstance(node, Root):
        return (base.DOCUMENT,)

    if isinstance(node, Doctype):
        return base.DOCTYPE, node.name, node.public_id, node.system_id

    # A fragment wrapper without a ``tag`` attribute is reported as text.
    if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
        return base.TEXT, ensure_str(node.obj)

    if node.tag == etree.Comment:
        return base.COMMENT, ensure_str(node.text)

    if node.tag == etree.Entity:
        return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

    # Anything left is assumed to be an ordinary element.
    qualified_tag = ensure_str(node.tag)
    tag_match = tag_regexp.match(qualified_tag)
    if tag_match:
        namespace, tag = tag_match.groups()
    else:
        namespace, tag = None, qualified_tag

    attrs = {}
    for raw_name, raw_value in list(node.attrib.items()):
        attr_name = ensure_str(raw_name)
        attr_value = ensure_str(raw_value)
        attr_match = tag_regexp.match(attr_name)
        if attr_match:
            attr_key = (attr_match.group(1), attr_match.group(2))
        else:
            attr_key = (None, attr_name)
        attrs[attr_key] = attr_value

    has_content = len(node) > 0 or node.text
    return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
            attrs, has_content)
Example #30
Source File: diff.py    From xmldiff with MIT License 5 votes vote down vote up
def node_ratio(self, left, right):
    """Return a similarity score for the nodes ``left`` and ``right``.

    * Two comments are compared by feeding their texts to
      ``self._sequencematcher`` and returning ``self._sequence_ratio()``.
    * A comment compared with a non-comment scores ``0``.
    * If an entry of ``self.uniqueattrs`` applies and the attribute is present
      on either node, the score is ``1`` when both attribute values are equal
      and ``0`` otherwise.
    * Otherwise the score is ``self.leaf_ratio(left, right)``, averaged with
      ``self.child_ratio(left, right)`` when the latter is not ``None``.
    """
    left_is_comment = left.tag is etree.Comment
    right_is_comment = right.tag is etree.Comment
    if left_is_comment and right_is_comment:
        # comments
        self._sequencematcher.set_seqs(left.text, right.text)
        return self._sequence_ratio()
    if left_is_comment or right_is_comment:
        # One is a comment the other is not:
        return 0

    for unique in self.uniqueattrs:
        if isinstance(unique, str):
            attr_name = unique
        else:
            # A (tag, attr) pair only applies when both nodes carry that tag.
            required_tag, attr_name = unique
            if required_tag != left.tag or required_tag != right.tag:
                continue
        if attr_name in left.attrib or attr_name in right.attrib:
            # At least one node has the unique attribute, so it alone decides.
            # A node lacking it yields None and therefore never matches.
            same_value = left.attrib.get(attr_name) == right.attrib.get(attr_name)
            return int(same_value)

    leaf_score = self.leaf_ratio(left, right)
    child_score = self.child_ratio(left, right)
    if child_score is None:
        return leaf_score
    return (leaf_score + child_score) / 2