org.cyberneko.html.parsers.DOMFragmentParser Java Examples

The following examples show how to use org.cyberneko.html.parsers.DOMFragmentParser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: MessageParser.java From translationstudio8 with GNU General Public License v2.0

6 votes

/**
 * Strips the markup from an HTML-formatted string and returns the text that remains.
 * <p>
 * The input is handed to the parser as a character stream. Encoding it with the platform default
 * charset and then declaring a different charset on the {@code InputSource} would corrupt
 * non-ASCII text whenever the two charsets differ.
 *
 * @param html
 *            the HTML-formatted string; may be {@code null}
 * @return the contents of {@code textBuffer} after {@code processNode} has visited the parsed
 *         fragment; the empty string {@code ""} when {@code html} is {@code null} or parsing
 *         throws
 */
private String htmlToText(String html) {
	if (html == null) {
		return "";
	}
	DOMFragmentParser parser = new DOMFragmentParser();
	CoreDocumentImpl codeDoc = new CoreDocumentImpl();
	// html is already decoded text, so feed it as characters instead of round-tripping it through
	// bytes in the platform default charset (which need not match textCharset).
	InputSource inSource = new InputSource(new java.io.StringReader(html));
	// Kept as declared-encoding metadata only; per the SAX InputSource contract it is not used to
	// decode a character stream.
	inSource.setEncoding(textCharset);
	DocumentFragment doc = codeDoc.createDocumentFragment();

	try {
		parser.parse(inSource, doc);
	} catch (Exception e) {
		// Deliberate best effort: markup that cannot be parsed yields no text.
		return "";
	}

	// Start from a fresh buffer on every call; processNode presumably appends the text it finds
	// to this field (its body is not visible here).
	textBuffer = new StringBuffer();
	processNode(doc);
	return textBuffer.toString();
}

Example #2

Source File: MessageParser.java From tmxeditor8 with GNU General Public License v2.0

6 votes

/**
 * Removes the tags from HTML-formatted text and returns the remaining plain text.
 * <p>
 * The string is passed to the parser as a character stream rather than as bytes. The earlier
 * byte-based form encoded with the platform default charset while labelling the stream with
 * {@code textCharset}, which mis-decodes non-ASCII characters when those charsets differ.
 *
 * @param html
 *            the HTML-formatted string; may be {@code null}
 * @return the contents of {@code textBuffer} once {@code processNode} has walked the parsed
 *         fragment; the empty string {@code ""} if {@code html} is {@code null} or the parser
 *         throws
 */
private String htmlToText(String html) {
	if (html == null) {
		return "";
	}
	DOMFragmentParser parser = new DOMFragmentParser();
	CoreDocumentImpl codeDoc = new CoreDocumentImpl();
	// A Reader carries characters directly, so no charset guess is involved in getting the
	// string into the parser.
	InputSource inSource = new InputSource(new java.io.StringReader(html));
	// Retained as metadata; the SAX InputSource contract states the encoding is not applied to a
	// character stream.
	inSource.setEncoding(textCharset);
	DocumentFragment doc = codeDoc.createDocumentFragment();

	try {
		parser.parse(inSource, doc);
	} catch (Exception e) {
		// Intentional best effort: unparseable input is treated as containing no text.
		return "";
	}

	// Reset the shared buffer before traversal; processNode is defined elsewhere and presumably
	// appends the extracted text to it.
	textBuffer = new StringBuffer();
	processNode(doc);
	return textBuffer.toString();
}

Example #3

Source File: CarteIT.java From pentaho-kettle with Apache License 2.0

5 votes

/**
 * Parses an HTML snippet into a DOM fragment.
 *
 * @param content the HTML text to parse
 * @return the {@link DocumentFragment} that the parser populated
 * @throws SAXException if the parser reports a parse error
 * @throws IOException if reading the content fails
 */
public static Node parse( String content ) throws SAXException, IOException {
  DOMFragmentParser fragmentParser = new DOMFragmentParser();
  // The fragment needs an owner document; an HTML document implementation supplies it.
  DocumentFragment target = new HTMLDocumentImpl().createDocumentFragment();

  fragmentParser.parse( new InputSource( new StringReader( content ) ), target );
  return target;
}

Example #4

Source File: TestDOMContentUtils.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Initialises the static fixtures used by the tests in this class.
 * <p>
 * Creates the configuration (with {@code parser.html.form.use_action} enabled) and the
 * {@code DOMContentUtils} under test, parses every entry of {@code testPages} into a DOM
 * fragment stored in {@code testDOMs}, converts the matching {@code testBaseHrefs} entry into a
 * {@link URL}, and fills {@code answerOutlinks}. The rows of {@code answerOutlinks} are
 * presumably the expected outlinks for the page at the same index; confirm against the tests
 * that read it.
 *
 * @throws Exception if creating the configuration, utilities or parser fails; exceptions raised
 *         while parsing a page or building its base URL are turned into an assertion failure
 */
private static void setup() throws Exception {
  conf = NutchConfiguration.create();
  conf.setBoolean("parser.html.form.use_action", true);
  utils = new DOMContentUtils(conf);

  // A single parser instance is reused for every test page.
  DOMFragmentParser fragmentParser = new DOMFragmentParser();
  fragmentParser.setFeature(
      "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
      true);

  int page = 0;
  while (page < testPages.length) {
    DocumentFragment dom = new HTMLDocumentImpl().createDocumentFragment();
    try {
      byte[] rawPage = testPages[page].getBytes();
      InputSource pageSource = new InputSource(new ByteArrayInputStream(rawPage));
      fragmentParser.parse(pageSource, dom);
      testBaseHrefURLs[page] = new URL(testBaseHrefs[page]);
    } catch (Exception e) {
      assertTrue("caught exception: " + e, false);
    }
    // The fragment is recorded whether or not the try block completed.
    testDOMs[page] = dom;
    page++;
  }

  answerOutlinks = new Outlink[][] {
      // row 0
      { new Outlink("http://www.nutch.org", "anchor") },
      // row 1
      {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html", "bots")
      },
      // row 2
      {
          new Outlink("http://www.nutch.org/", "separate this"),
          new Outlink("http://www.nutch.org/docs/ok", "from this")
      },
      // row 3
      {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/1", "1"),
          new Outlink("http://www.nutch.org/docs/2", "2")
      },
      // row 4
      {
          new Outlink("http://www.nutch.org/frames/top.html", ""),
          new Outlink("http://www.nutch.org/frames/left.html", ""),
          new Outlink("http://www.nutch.org/frames/invalid.html", ""),
          new Outlink("http://www.nutch.org/frames/right.html", "")
      },
      // row 5
      {
          new Outlink("http://www.nutch.org/maps/logo.gif", ""),
          new Outlink("http://www.nutch.org/index.html", ""),
          new Outlink("http://www.nutch.org/maps/#bottom", ""),
          new Outlink("http://www.nutch.org/bot.html", ""),
          new Outlink("http://www.nutch.org/docs/index.html", "")
      },
      // row 6
      { new Outlink("http://www.nutch.org/index.html", "whitespace test") },
      // row 7
      {},
      // row 8
      { new Outlink("http://www.nutch.org/dummy.jsp", "test2") },
      // row 9
      {},
      // row 10
      {
          new Outlink("http://www.nutch.org/;x", "anchor1"),
          new Outlink("http://www.nutch.org/g;x", "anchor2"),
          new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
      },
      // row 11
      {
          // this is tricky - see RFC3986 section 5.4.1 example 7
          new Outlink("http://www.nutch.org/g", "anchor1"),
          new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
          new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
          new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
          new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
      }
  };

}