Python html5lib.parseFragment() Examples

The following are 12 code examples of html5lib.parseFragment(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module html5lib, or try the search function.
Example #1
Source File: sanitizer.py    From bazarr with GNU General Public License v3.0 6 votes vote down vote up
def runtest(self):
        """Sanitize the test case's input fragment and compare it to the expected output.

        Reads ``self.test["input"]`` and ``self.test["output"]``, parses the
        input as an HTML fragment, serializes it with sanitizing enabled, and
        asserts that the serialized text equals the expected text.  The
        assertion message shows the input, the expected and the received text.
        """
        source = self.test["input"]
        wanted = self.test["output"]

        # Serializer settings used for every sanitizer test case.
        serialize_options = dict(
            sanitize=True,
            omit_optional_tags=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
            quote_attr_values="always",
            quote_char="'",
            alphabetical_attributes=True,
        )
        actual = serialize(parseFragment(source), **serialize_options)

        report_lines = [
            "\n\nInput:", source,
            "\nExpected:", wanted,
            "\nReceived:", actual,
        ]
        failure_text = "\n".join(report_lines)
        assert wanted == actual, failure_text
Example #2
Source File: chunk.py    From budou with Apache License 2.0 5 votes vote down vote up
def span_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Chunks that contain CJK text (and are not longer than ``max_length`` when
    it is given) are wrapped in their own SPAN element carrying
    ``attributes``; every other chunk is appended as plain text.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (int, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
      if (chunk.has_cjk() and
          not (max_length and len(chunk.word) > max_length)):
        ele = ET.Element('span')
        ele.text = chunk.word
        ele.attrib.update(attributes)
        doc.append(ele)
      elif len(doc):
        # add word without span tag for non-CJK text (e.g. English)
        # by appending it after the last element.
        # NOTE: Element.getchildren() was removed in Python 3.9, so the
        # children are reached through the element's sequence interface.
        last = doc[-1]
        last.tail = chunk.word if last.tail is None else last.tail + chunk.word
      else:
        # no child element yet: the word belongs to the root's leading text
        doc.text = chunk.word if doc.text is None else doc.text + chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    # Round-trip through html5lib to get sanitized, consistently quoted HTML.
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result
Example #3
Source File: chunk.py    From budou with Apache License 2.0 5 votes vote down vote up
def wbr_serialize(self):
    """Returns concatenated HTML code with WBR tag. This is still experimental.

    The chunks are collected inside a SPAN styled with
    ``word-break: keep-all``; a WBR element is inserted before each CJK chunk
    once the SPAN already has leading text.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    doc.attrib['style'] = 'word-break: keep-all'
    for chunk in self:
      if (chunk.has_cjk() and doc.text):
        ele = ET.Element('wbr')
        doc.append(ele)
        # the freshly appended element is the last child, so set its tail
        # directly instead of looking it up again
        ele.tail = chunk.word
      elif len(doc):
        # add word without span tag for non-CJK text (e.g. English)
        # by appending it after the last element.
        # NOTE: Element.getchildren() was removed in Python 3.9, so the
        # children are reached through the element's sequence interface.
        last = doc[-1]
        last.tail = chunk.word if last.tail is None else last.tail + chunk.word
      else:
        # no child element yet: the word belongs to the root's leading text
        doc.text = chunk.word if doc.text is None else doc.text + chunk.word
    content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    dom = html5lib.parseFragment(content)
    treewalker = getTreeWalker('etree')
    stream = treewalker(dom)
    serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always')
    # Start from the sanitizer's stock allow-lists and additionally allow the
    # WBR element and the word-break CSS property that this method emits.
    allowed_elements = set(sanitizer.allowed_elements)
    allowed_elements.add((namespaces['html'], 'wbr'))
    allowed_css_properties = set(sanitizer.allowed_css_properties)
    allowed_css_properties.add('word-break')
    result = serializer.render(sanitizer.Filter(
        stream, allowed_elements=allowed_elements,
        allowed_css_properties=allowed_css_properties,
        ))
    return result
Example #4
Source File: parser.py    From budou with Apache License 2.0 5 votes vote down vote up
def preprocess(source):
  """Strips markup, line breaks and redundant white space from a sentence.

  The input is parsed as an HTML fragment and reduced to its text content;
  newlines are then dropped, the ends trimmed, and every run of two or more
  white-space characters collapsed into a single space.

  Args:
    source (str): Input sentence.

  Returns:
    Preprocessed sentence. (str)
  """
  fragment = html5lib.parseFragment(source)
  raw_text = ET.tostring(fragment, encoding='utf-8', method='text')
  flattened = raw_text.decode('utf-8').replace(u'\n', u'').strip()
  return re.sub(r'\s\s+', u' ', flattened)
Example #5
Source File: ckeditor_tags.py    From adhocracy4 with GNU Affero General Public License v3.0 5 votes vote down vote up
def transform_collapsibles(text):
    """Find simple collapsible elements and transform them to full html.

    Every top-level ``div.collapsible-item`` that has both a
    ``div.collapsible-item-title`` and a ``div.collapsible-item-body`` child
    gets its content replaced by the rendered
    ``a4ckeditor/collapsible_fragment.html`` template.  Items missing either
    child are left untouched.  Returns the serialized tree.
    """
    tree = parseFragment(text, container='div', treebuilder='etree',
                         namespaceHTMLElements=False)

    # Digits of the current timestamp serve as the shared prefix of the
    # element ids generated during this call.
    base_id = ''.join(ch for ch in str(time.time()) if ch.isdigit())

    items = tree.findall('./div[@class="collapsible-item"]')
    for index, item in enumerate(items):
        title = item.find('./div[@class="collapsible-item-title"]')
        body = item.find('./div[@class="collapsible-item-body"]')
        if title is None or body is None:
            continue

        # Strip the marker classes before handing the nodes to the template.
        title.tag = 'span'
        del title.attrib['class']
        body.tag = 'div'
        del body.attrib['class']

        context = {
            'id': 'a4ckeditor-collapsible-{}_{}'.format(base_id, index),
            'title': serialize(title),
            'body': serialize(body),
        }
        final_html = render_to_string(
            'a4ckeditor/collapsible_fragment.html', context)

        rendered = parseFragment(final_html, treebuilder='etree',
                                 namespaceHTMLElements=False)
        item.clear()
        item.append(rendered)

    return serialize(tree)
Example #6
Source File: test_utils.py    From allura with Apache License 2.0 5 votes vote down vote up
def walker_from_text(self, text):
        """Parse ``text`` as an HTML fragment and return an "etree" tree walker over it."""
        fragment = html5lib.parseFragment(text)
        walker_factory = html5lib.treewalkers.getTreeWalker("etree")
        return walker_factory(fragment)
Example #7
Source File: markdown_extensions.py    From allura with Apache License 2.0 5 votes vote down vote up
def run(self, text):
        """Parse ``text`` as an HTML fragment and render it through ForgeHTMLSanitizerFilter.

        Without the custom filter this would simply be
        ``html5lib.serialize(parsed, sanitize=True)``; the steps of that
        function are spelled out here so that ForgeHTMLSanitizerFilter can
        take the place of the standard sanitizer.
        """
        fragment = html5lib.parseFragment(text)

        walker_factory = html5lib.treewalkers.getTreeWalker("etree")
        # this is our custom step: wrap the token stream in our own filter
        filtered_stream = ForgeHTMLSanitizerFilter(walker_factory(fragment))

        return html5lib.serializer.HTMLSerializer().render(filtered_stream)
Example #8
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_no_duplicate_clone():
    """Parsing the mis-nested markup below must yield a fragment of length 2."""
    markup = "<b><em><foo><foob><fooc><aside></b></em>"
    fragment = parseFragment(markup)
    child_count = len(fragment)
    assert child_count == 2
Example #9
Source File: test_parser2.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def test_self_closing_col():
    """A self-closing <col /> inside a colgroup must not be recorded as a parse error."""
    markup = '<table><colgroup><col /></colgroup></table>'
    html_parser = HTMLParser()
    html_parser.parseFragment(markup)
    assert not html_parser.errors
Example #10
Source File: test_sanitizer.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def runSanitizerTest(_, expected, input):
    """Check that sanitizing ``input`` gives the normalized form of ``expected``.

    ``expected`` is parsed and re-serialized (without sanitizing) using the
    serializer options below, and the result is compared with
    ``sanitize_html(input)``.  The first argument is ignored.
    """
    serialize_options = dict(
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    normalized_expected = serialize(parseFragment(expected),
                                    **serialize_options)
    assert normalized_expected == sanitize_html(input)
Example #11
Source File: test_sanitizer.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return it serialized with sanitizing enabled."""
    serialize_options = dict(
        sanitize=True,
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    fragment = parseFragment(stream)
    return serialize(fragment, **serialize_options)
Example #12
Source File: html5lib.py    From canvas with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def html(self):
        """Parse ``self.content`` as an HTML fragment with html5lib.

        html5lib is imported lazily; the imported module is also stored on
        ``self.html5lib``.

        Returns:
            The result of ``html5lib.parseFragment(self.content)``.

        Raises:
            ImproperlyConfigured: if html5lib cannot be imported.
        """
        try:
            import html5lib
        except ImportError as err:
            # "except X as err" is valid on Python 2.6+ and Python 3; the old
            # "except X, err" spelling is a SyntaxError on Python 3.
            raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
        self.html5lib = html5lib
        return html5lib.parseFragment(self.content)