org.cyberneko.html.parsers.DOMFragmentParser Java Examples

The following examples show how to use org.cyberneko.html.parsers.DOMFragmentParser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MessageParser.java    From translationstudio8 with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Strips the markup from an HTML string and returns the remaining text.
 * <p>
 * The HTML is handed to the parser as a character stream, so the result does
 * not depend on the platform default charset.
 *
 * @param html
 *            the HTML-formatted string; may be {@code null}
 * @return String
 *            the contents of {@code textBuffer} after {@code processNode} has
 *            visited the parsed fragment (presumably the tag-free text); the
 *            empty string "" if {@code html} is {@code null} or cannot be parsed
 */
private String htmlToText(String html) {
	if (html == null) {
		return "";
	}
	DOMFragmentParser parser = new DOMFragmentParser();
	CoreDocumentImpl codeDoc = new CoreDocumentImpl();
	// Feed the parser characters rather than bytes. The previous code encoded
	// the string with the platform default charset (String.getBytes()) and then
	// declared the bytes to be in textCharset, which corrupts non-ASCII text
	// whenever the two charsets differ. A character stream needs no encoding
	// hint, so setEncoding(textCharset) is no longer required.
	InputSource inSource = new InputSource(new java.io.StringReader(html));
	DocumentFragment doc = codeDoc.createDocumentFragment();

	try {
		parser.parse(inSource, doc);
	} catch (Exception e) {
		// Deliberate best effort: unparsable input yields an empty result
		// instead of propagating the failure to the caller.
		return "";
	}

	// processNode writes into the textBuffer field, so reset it before the walk.
	textBuffer = new StringBuffer();
	processNode(doc);
	return textBuffer.toString();
}
 
Example #2
Source File: MessageParser.java    From tmxeditor8 with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Strips the markup from an HTML string and returns the remaining text.
 * <p>
 * The HTML is handed to the parser as a character stream, so the result does
 * not depend on the platform default charset.
 *
 * @param html
 *            the HTML-formatted string; may be {@code null}
 * @return String
 *            the contents of {@code textBuffer} after {@code processNode} has
 *            visited the parsed fragment (presumably the tag-free text); the
 *            empty string "" if {@code html} is {@code null} or cannot be parsed
 */
private String htmlToText(String html) {
	if (html == null) {
		return "";
	}
	DOMFragmentParser parser = new DOMFragmentParser();
	CoreDocumentImpl codeDoc = new CoreDocumentImpl();
	// Feed the parser characters rather than bytes. The previous code encoded
	// the string with the platform default charset (String.getBytes()) and then
	// declared the bytes to be in textCharset, which corrupts non-ASCII text
	// whenever the two charsets differ. A character stream needs no encoding
	// hint, so setEncoding(textCharset) is no longer required.
	InputSource inSource = new InputSource(new java.io.StringReader(html));
	DocumentFragment doc = codeDoc.createDocumentFragment();

	try {
		parser.parse(inSource, doc);
	} catch (Exception e) {
		// Deliberate best effort: unparsable input yields an empty result
		// instead of propagating the failure to the caller.
		return "";
	}

	// processNode writes into the textBuffer field, so reset it before the walk.
	textBuffer = new StringBuffer();
	processNode(doc);
	return textBuffer.toString();
}
 
Example #3
Source File: CarteIT.java    From pentaho-kettle with Apache License 2.0 5 votes vote down vote up
/**
 * Parses HTML text into a DOM document fragment.
 *
 * @param content the HTML markup to parse, read through a {@link StringReader}
 * @return the {@link DocumentFragment} that the parser populated
 * @throws SAXException if the parser reports a parse error
 * @throws IOException if reading the input fails
 */
public static Node parse( String content ) throws SAXException, IOException {
  DOMFragmentParser fragmentParser = new DOMFragmentParser();
  // The fragment needs an owner document; a fresh HTML document serves only that purpose.
  DocumentFragment result = new HTMLDocumentImpl().createDocumentFragment();

  fragmentParser.parse( new InputSource( new StringReader( content ) ), result );
  return result;
}
 
Example #4
Source File: TestDOMContentUtils.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the shared fixtures: the configuration and {@code DOMContentUtils},
 * one parsed DOM fragment and base URL per test page, and the table of
 * expected outlinks.
 *
 * @throws Exception declared for the configuration and parser set-up calls;
 *                   failures while parsing a page or building its base URL
 *                   are turned into an assertion failure instead
 */
private static void setup() throws Exception {
  conf = NutchConfiguration.create();
  conf.setBoolean("parser.html.form.use_action", true);
  utils = new DOMContentUtils(conf);

  DOMFragmentParser fragmentParser = new DOMFragmentParser();
  fragmentParser.setFeature(
      "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
      true);

  // Parse every test page into its own fragment and record its base URL.
  int page = 0;
  while (page < testPages.length) {
    DocumentFragment fragment = new HTMLDocumentImpl().createDocumentFragment();
    try {
      byte[] pageBytes = testPages[page].getBytes();
      InputSource pageSource = new InputSource(new ByteArrayInputStream(pageBytes));
      fragmentParser.parse(pageSource, fragment);
      testBaseHrefURLs[page] = new URL(testBaseHrefs[page]);
    } catch (Exception e) {
      assertTrue("caught exception: " + e, false);
    }
    testDOMs[page] = fragment;
    page++;
  }

  // Expected outlinks, one row per entry (presumably row i belongs to
  // testPages[i] -- confirm against the tests that read this table).
  answerOutlinks = new Outlink[][] {
      {
          new Outlink("http://www.nutch.org", "anchor")
      },
      {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/bot.html", "bots")
      },
      {
          new Outlink("http://www.nutch.org/", "separate this"),
          new Outlink("http://www.nutch.org/docs/ok", "from this")
      },
      {
          new Outlink("http://www.nutch.org/", "home"),
          new Outlink("http://www.nutch.org/docs/1", "1"),
          new Outlink("http://www.nutch.org/docs/2", "2")
      },
      {
          new Outlink("http://www.nutch.org/frames/top.html", ""),
          new Outlink("http://www.nutch.org/frames/left.html", ""),
          new Outlink("http://www.nutch.org/frames/invalid.html", ""),
          new Outlink("http://www.nutch.org/frames/right.html", "")
      },
      {
          new Outlink("http://www.nutch.org/maps/logo.gif", ""),
          new Outlink("http://www.nutch.org/index.html", ""),
          new Outlink("http://www.nutch.org/maps/#bottom", ""),
          new Outlink("http://www.nutch.org/bot.html", ""),
          new Outlink("http://www.nutch.org/docs/index.html", "")
      },
      {
          new Outlink("http://www.nutch.org/index.html", "whitespace test")
      },
      {},
      {
          new Outlink("http://www.nutch.org/dummy.jsp", "test2")
      },
      {},
      {
          new Outlink("http://www.nutch.org/;x", "anchor1"),
          new Outlink("http://www.nutch.org/g;x", "anchor2"),
          new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
      },
      {
          // this is tricky - see RFC3986 section 5.4.1 example 7
          new Outlink("http://www.nutch.org/g", "anchor1"),
          new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
          new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
          new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
          new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
              "anchor5")
      }
  };

}