Python html5lib.parseFragment() Examples

The following are 12 code examples of html5lib.parseFragment(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module html5lib, or try the search function.

Example #1

Source File: sanitizer.py From bazarr with GNU General Public License v3.0

6 votes

def runtest(self):
        """Sanitize the test's input fragment and compare it with the expected output.

        Reads the ``"input"`` and ``"output"`` entries of ``self.test``, parses
        the input with ``parseFragment``, serializes the tree with sanitizing
        enabled, and asserts that the serialized text equals the expected text.
        """
        source = self.test["input"]
        wanted = self.test["output"]

        serializer_options = dict(
            sanitize=True,
            omit_optional_tags=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
            quote_attr_values="always",
            quote_char="'",
            alphabetical_attributes=True,
        )
        actual = serialize(parseFragment(source), **serializer_options)

        # Assemble the failure report up front so it shows all three strings.
        report_lines = [
            "\n\nInput:", source,
            "\nExpected:", wanted,
            "\nReceived:", actual,
        ]
        report = "\n".join(report_lines)
        assert wanted == actual, report

Example #2

Source File: chunk.py From budou with Apache License 2.0

5 votes

def span_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Chunks that contain CJK text (and do not exceed ``max_length``) are each
    wrapped in their own SPAN element; every other chunk is appended as plain
    text after the most recent SPAN, or as the leading text of the root
    element when no SPAN has been emitted yet.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (int, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
      if (chunk.has_cjk() and
          not (max_length and len(chunk.word) > max_length)):
        ele = ET.Element('span')
        ele.text = chunk.word
        ele.attrib.update(attributes)
        doc.append(ele)
      elif len(doc):
        # Plain text after at least one SPAN: extend the tail of the last
        # child. Indexing is used instead of Element.getchildren(), which
        # was removed in Python 3.9.
        last = doc[-1]
        last.tail = chunk.word if last.tail is None else last.tail + chunk.word
      else:
        # No child element yet: the word belongs to the root's leading text.
        doc.text = chunk.word if doc.text is None else doc.text + chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    # Round-trip through html5lib so the markup is sanitized and attribute
    # values are always quoted.
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result

Example #3

Source File: chunk.py From budou with Apache License 2.0

5 votes

def wbr_serialize(self):
    """Returns concatenated HTML code with WBR tag. This is still experimental.

    The chunks are collected under a root SPAN styled with
    ``word-break: keep-all``. A WBR element is inserted before each CJK chunk
    once the root already has leading text; all other chunks are appended as
    plain text. The result is serialized through html5lib's sanitizer filter
    with WBR and the ``word-break`` CSS property added to the allowed sets.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    doc.attrib['style'] = 'word-break: keep-all'
    for chunk in self:
      if (chunk.has_cjk() and doc.text):
        # Insert a WBR element and hang the CJK word off its tail.
        ele = ET.Element('wbr')
        doc.append(ele)
        ele.tail = chunk.word
      elif len(doc):
        # Extend the tail of the last child. Indexing is used instead of
        # Element.getchildren(), which was removed in Python 3.9.
        last = doc[-1]
        last.tail = chunk.word if last.tail is None else last.tail + chunk.word
      else:
        # No child element yet: the word belongs to the root's leading text.
        doc.text = chunk.word if doc.text is None else doc.text + chunk.word
    content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    dom = html5lib.parseFragment(content)
    treewalker = getTreeWalker('etree')
    stream = treewalker(dom)
    serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always')
    # Extend the sanitizer's allow-lists so the WBR elements and the
    # word-break style set above are explicitly permitted.
    allowed_elements = set(sanitizer.allowed_elements)
    allowed_elements.add((namespaces['html'], 'wbr'))
    allowed_css_properties = set(sanitizer.allowed_css_properties)
    allowed_css_properties.add('word-break')
    result = serializer.render(sanitizer.Filter(
        stream, allowed_elements=allowed_elements,
        allowed_css_properties=allowed_css_properties,
        ))
    return result

Example #4

Source File: parser.py From budou with Apache License 2.0

5 votes

def preprocess(source):
  """Strips markup, line breaks, and redundant white space from a sentence.

  Args:
    source (str): Input sentence.

  Returns:
    Preprocessed sentence. (str)
  """
  fragment = html5lib.parseFragment(source)
  # Text-only serialization drops the tags and keeps the character data.
  raw_bytes = ET.tostring(fragment, encoding='utf-8', method='text')
  text = raw_bytes.decode('utf-8')
  without_breaks = text.replace(u'\n', u'')
  trimmed = without_breaks.strip()
  # Collapse any run of two or more whitespace characters into one space.
  return re.sub(r'\s\s+', u' ', trimmed)

Example #5

Source File: ckeditor_tags.py From adhocracy4 with GNU Affero General Public License v3.0

5 votes

def transform_collapsibles(text):
    """Expand simple collapsible items in ``text`` into their full HTML form.

    Each direct child ``div.collapsible-item`` that has both a title and a
    body ``div`` is replaced by the rendered
    ``a4ckeditor/collapsible_fragment.html`` template. Items missing either
    part are left untouched. Returns the serialized tree.
    """
    root = parseFragment(text, container='div', treebuilder='etree',
                         namespaceHTMLElements=False)

    # Digits of the current timestamp serve as a prefix for the element ids.
    stamp = str(time.time())
    base_id = ''.join(ch for ch in stamp if ch.isdigit())

    items = root.findall('./div[@class="collapsible-item"]')
    for index, item in enumerate(items):
        title = item.find('./div[@class="collapsible-item-title"]')
        body = item.find('./div[@class="collapsible-item-body"]')

        if title is None or body is None:
            continue

        # Strip the marker classes before the parts are serialized.
        title.tag = 'span'
        del title.attrib['class']

        body.tag = 'div'
        del body.attrib['class']

        context = {
            'id': 'a4ckeditor-collapsible-{}_{}'.format(base_id, index),
            'title': serialize(title),
            'body': serialize(body),
        }
        final_html = render_to_string(
            'a4ckeditor/collapsible_fragment.html', context)

        # Swap the item's old contents for the freshly rendered fragment.
        replacement = parseFragment(final_html, treebuilder='etree',
                                    namespaceHTMLElements=False)
        item.clear()
        item.append(replacement)

    return serialize(root)

Example #6

Source File: test_utils.py From allura with Apache License 2.0

5 votes

def walker_from_text(self, text):
        """Parse ``text`` as an HTML fragment and return an etree tree walker over it."""
        fragment = html5lib.parseFragment(text)
        walker_cls = html5lib.treewalkers.getTreeWalker("etree")
        return walker_cls(fragment)

Example #7

Source File: markdown_extensions.py From allura with Apache License 2.0

5 votes

def run(self, text):
        """Parse ``text`` as an HTML fragment and render it through the Forge sanitizer.

        This mirrors what ``html5lib.serialize(parsed, sanitize=True)`` does,
        except that ``ForgeHTMLSanitizerFilter`` is applied in place of the
        standard sanitizer filter.
        """
        fragment = html5lib.parseFragment(text)

        # Walk the parsed tree, pass the token stream through our custom
        # sanitizer filter, then serialize the filtered stream.
        walker_cls = html5lib.treewalkers.getTreeWalker("etree")
        token_stream = ForgeHTMLSanitizerFilter(walker_cls(fragment))
        return html5lib.serializer.HTMLSerializer().render(token_stream)

Example #8

Source File: test_parser2.py From bazarr with GNU General Public License v3.0

5 votes

def test_no_duplicate_clone():
    """Check that the mis-nested fragment below parses to a result of length 2."""
    markup = "<b><em><foo><foob><fooc><aside></b></em>"
    fragment = parseFragment(markup)
    assert len(fragment) == 2

Example #9

Source File: test_parser2.py From bazarr with GNU General Public License v3.0

5 votes

def test_self_closing_col():
    """Check that a self-closing ``<col />`` inside a colgroup records no parse errors."""
    markup = '<table><colgroup><col /></colgroup></table>'
    html_parser = HTMLParser()
    html_parser.parseFragment(markup)
    assert not html_parser.errors

Example #10

Source File: test_sanitizer.py From bazarr with GNU General Public License v3.0

5 votes

def runSanitizerTest(_, expected, input):
    """Assert that sanitizing ``input`` yields the normalized form of ``expected``.

    ``expected`` is parsed and re-serialized (without sanitizing) using the
    same formatting options as ``sanitize_html`` so the two strings can be
    compared directly. The first positional argument is ignored.
    """
    formatting = dict(
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values="always",
        quote_char='"',
        alphabetical_attributes=True,
    )
    reference_html = serialize(parseFragment(expected), **formatting)
    assert reference_html == sanitize_html(input)

Example #11

Source File: test_sanitizer.py From bazarr with GNU General Public License v3.0

5 votes

def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return its sanitized serialization."""
    fragment = parseFragment(stream)
    options = {
        "sanitize": True,
        "omit_optional_tags": False,
        "use_trailing_solidus": True,
        "space_before_trailing_solidus": False,
        "quote_attr_values": "always",
        "quote_char": '"',
        "alphabetical_attributes": True,
    }
    return serialize(fragment, **options)

Example #12

Source File: html5lib.py From canvas with BSD 3-Clause "New" or "Revised" License

5 votes

def html(self):
        """Parse ``self.content`` as an HTML fragment using html5lib.

        html5lib is imported lazily, and the module is stored on
        ``self.html5lib`` before parsing.

        Returns:
            The result of ``html5lib.parseFragment(self.content)``.

        Raises:
            ImproperlyConfigured: if an ImportError occurs while importing or
                using html5lib.
        """
        try:
            import html5lib
            self.html5lib = html5lib
            return html5lib.parseFragment(self.content)
        # ``except X as name`` is valid on Python 2.6+ and Python 3; the old
        # ``except X, name`` form is a SyntaxError on Python 3.
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing html5lib: %s" % err)