org.htmlparser.util.ParserException Java Examples

The following examples show how to use org.htmlparser.util.ParserException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HTMLParser.java    From PADListener with GNU General Public License v2.0 6 votes vote down vote up
/**
   * Parses the body of an HTML message into its nodes.
   * See <a href="http://htmlparser.sourceforge.net/">HTML Parser</a> for details.
   *
   * @param url the url that the message resulted from (not read by this method)
   * @param message the Message to parse
   * @return a NodeList containing every node of the page, or <code>null</code> if the
   *         Content-Type header is missing or does not start with "text/html", the
   *         body is missing or empty, or parsing fails
   */
  public Object parseMessage(HttpUrl url, Message message) {
      final String type = message.getHeader("Content-Type");
      final boolean isHtml = type != null && type.matches("text/html.*");
      if (!isHtml) {
          return null;
      }

      final byte[] body = message.getContent();
      if (body == null || body.length == 0) {
          return null;
      }

      // Filter that accepts every node, so the whole document is returned.
      final NodeFilter acceptAll = new NodeFilter() {
          public boolean accept(Node node) {
              return true;
          }
      };

      // NOTE(review): new String(byte[]) decodes with the platform default charset,
      // not the charset declared by the message - confirm this is intended.
      final Parser parser = Parser.createParser(new String(body), null);
      try {
          return parser.extractAllNodesThatMatch(acceptAll);
      } catch (ParserException pe) {
          _logger.severe(pe.toString());
          return null;
      }
  }
 
Example #2
Source File: StrUtils.java    From Lottery with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Collects the text nodes of an HTML fragment into a single string.
 * <p>
 * Lexing stops as soon as the collected text is longer than <code>len</code>;
 * the result is not truncated, so it may exceed <code>len</code> characters.
 *
 * @param html the HTML to lex
 * @param len length threshold after which no further nodes are read
 * @return the concatenated HTML of the text nodes read so far
 * @throws RuntimeException wrapping the ParserException if lexing fails
 */
public static String html2Text(String html, int len) {
	try {
		Lexer lexer = new Lexer(html);
		StringBuilder text = new StringBuilder(html.length());
		for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
			if (current instanceof TextNode) {
				text.append(current.toHtml());
			}
			if (text.length() > len) {
				break;
			}
		}
		return text.toString();
	} catch (ParserException e) {
		throw new RuntimeException(e);
	}
}
 
Example #3
Source File: CDATALexerTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Wraps the given script source in a <code>&lt;script&gt;</code> element, lexes it and
 * asserts that the lexer yields exactly: an open SCRIPT tag, a text node whose
 * text equals <code>js</code>, and a close SCRIPT tag. The lexer must report
 * inJS() only for the text node and never report inCSS().
 *
 * @param js the script content expected to come back as one text node
 * @throws ParserException if the lexer fails
 */
private void assertJSContentWorks(String js) throws ParserException {
	String html = String.format("<script>%s</script>",js);
	// l and n are not declared here; presumably fields of the test class.
	l = makeLexer(html);
	// Before any node is read, neither flag is set.
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	// First node: the opening <script> tag; flags still unset.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "SCRIPT"));
	// Second node: the script body as a single text node, flagged as JS.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertTrue(l.inJS());
	assertTrue(NodeUtils.isTextNode(n));
	assertEquals(js,((TextNode)n).getText());
	// Third node: the closing </script> tag; flags are cleared again.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isCloseTagNodeNamed(n, "SCRIPT"));
}
 
Example #4
Source File: CDATALexerTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Checks, via assertJSContentWorks, that script bodies containing HTML comment
 * markers, '&lt;' characters, nested tag-like strings and a CDATA section are each
 * passed to that helper's script-content assertions.
 *
 * @throws ParserException if the lexer fails
 */
public void testInJSComment() throws ParserException {
		
		// Disabled dumpParse calls for inputs similar to the assertions below;
		// presumably kept for manual debugging.
//		dumpParse("<script>//<!--\n foo bar baz\n //--></script>");
//		dumpParse("<script><!-- foo bar baz --></script>");
//		dumpParse("<script>//<!-- foo bar baz --></script>");
//		dumpParse("<script><!-- foo bar baz //--></script>");
//		dumpParse("<script>\n//<!-- foo bar baz\n //--></script>");
//		dumpParse("<script> if(1 < 2) { foo(); } </script>");
//		dumpParse("<script> if(1 <n) { foo(); } </script>");
//		dumpParse("<script> document.write(\"<b>bold</b>\"); </script>");
//		dumpParse("<script> document.write(\"<script>bold</script>\"); </script>");
//		dumpParse("<script> <![CDATA[\n if(i<n) { foo() } // content of your Javascript goes here \n ]]> </script>");

		// HTML comment markers inside the script body.
		assertJSContentWorks("//<!--\n foo bar baz\n //-->");
		assertJSContentWorks("<!-- foo bar baz -->");
		assertJSContentWorks("//<!-- foo bar baz -->");
		assertJSContentWorks("<!-- foo bar baz //-->");
		assertJSContentWorks("\n//<!-- foo bar baz\n //-->");
		// '<' used as an operator.
		assertJSContentWorks("if(1 < 2) { foo(); } ");
		assertJSContentWorks("if(1 <n) { foo(); } ");
		// Tag-like text inside string literals.
		assertJSContentWorks("document.write(\"<b>bold</b>\"); ");
		assertJSContentWorks("document.write(\"<script>bold</script>\"); ");
		// CDATA-wrapped script.
		assertJSContentWorks("<![CDATA[\n if(i<n) { foo() } // a comment \n ]]> ");

	}
 
Example #5
Source File: CDATALexerTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Lexes <code>&lt;style&gt;foo bar baz&lt;/style&gt;</code> and asserts the node sequence
 * open STYLE tag, text node "foo bar baz", close STYLE tag, with inCSS() reported
 * only for the text node and inJS() never reported.
 *
 * @throws ParserException if the lexer fails
 */
public void testInCSS() throws ParserException {
	// l and n are not declared here; presumably fields of the test class.
	l = makeLexer("<style>foo bar baz</style>");
	// Before any node is read, neither flag is set.
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	// First node: the opening <style> tag; flags still unset.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "STYLE"));
	// Second node: the style body as a single text node, flagged as CSS.
	n = l.nextNode();
	assertTrue(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isTextNode(n));
	assertEquals("foo bar baz",((TextNode)n).getText());
	// Third node: the closing </style> tag; flags are cleared again.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isCloseTagNodeNamed(n, "STYLE"));
}
 
Example #6
Source File: CDATALexerTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Lexes <code>&lt;script&gt;foo bar baz&lt;/script&gt;</code> and asserts the node sequence
 * open SCRIPT tag, text node "foo bar baz", close SCRIPT tag, with inJS() reported
 * only for the text node and inCSS() never reported.
 *
 * @throws ParserException if the lexer fails
 */
public void testInJS() throws ParserException {
	// l and n are not declared here; presumably fields of the test class.
	l = makeLexer("<script>foo bar baz</script>");
	// Before any node is read, neither flag is set.
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	// First node: the opening <script> tag; flags still unset.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "SCRIPT"));
	// Second node: the script body as a single text node, flagged as JS.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertTrue(l.inJS());
	assertTrue(NodeUtils.isTextNode(n));
	assertEquals("foo bar baz",((TextNode)n).getText());
	// Third node: the closing </script> tag; flags are cleared again.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isCloseTagNodeNamed(n, "SCRIPT"));
}
 
Example #7
Source File: CDATALexer.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the next node, tracking whether it is the body of a SCRIPT or STYLE
 * element.
 * <p>
 * When a non-empty open SCRIPT or STYLE tag is read, the element body is parsed
 * immediately via parseCDATA(true) and held back; the following call returns that
 * held node with inJS / inCSS set accordingly. For every other node both flags
 * are false.
 *
 * @return the next node as produced by the superclass, or the held CDATA node
 * @throws ParserException if the underlying lexer fails
 */
@Override
public Node nextNode() throws ParserException {
	// A body parsed on the previous call is pending: hand it out with its flags.
	if (cached != null) {
		final Node pending = cached;
		cached = null;
		inJS = cachedJS;
		inCSS = !cachedJS;
		return pending;
	}

	inJS = false;
	inCSS = false;
	final Node next = super.nextNode();
	final boolean opensScript = NodeUtils.isNonEmptyOpenTagNodeNamed(next, "SCRIPT");
	if (opensScript || NodeUtils.isNonEmptyOpenTagNodeNamed(next, "STYLE")) {
		// Parse the element body now and keep it for the following call.
		cached = super.parseCDATA(true);
		cachedJS = opensScript;
	}
	return next;
}
 
Example #8
Source File: HTMLConverter.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Loads an HTML document from the given resource and extracts its plain text.
 * Links are omitted, non-breaking spaces are replaced and whitespace runs are
 * collapsed.
 * 
 * @param filename name of file containing HTML documents
 * @return plain text or <code>null</code> if the reading or conversion failed
 */
public static synchronized String file2text(String filename) {
	// configure the text extractor
	StringBean extractor = new StringBean();
	extractor.setLinks(false);
	extractor.setReplaceNonBreakingSpaces(true);
	extractor.setCollapse(true);

	// read the document and feed every node to the extractor
	Parser htmlParser = new Parser();
	try {
		htmlParser.setResource(filename);
		htmlParser.visitAllNodesWith(extractor);
	} catch (ParserException e) {
		return null;
	}

	return extractor.getStrings();
}
 
Example #9
Source File: HTMLConverter.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Extracts the plain text of an HTML document given as a string. Links are
 * omitted, non-breaking spaces are replaced and whitespace runs are collapsed.
 * 
 * @param html HTML document
 * @return plain text (empty string if the extractor yields no text) or
 *         <code>null</code> if the conversion failed
 */
public static synchronized String html2text(String html) {
	// configure the text extractor
	StringBean extractor = new StringBean();
	extractor.setLinks(false);
	extractor.setReplaceNonBreakingSpaces(true);
	extractor.setCollapse(true);

	// parse the markup and feed every node to the extractor
	Parser htmlParser = new Parser();
	try {
		htmlParser.setInputHTML(html);
		htmlParser.visitAllNodesWith(extractor);
	} catch (ParserException e) {
		return null;
	}

	String text = extractor.getStrings();
	return (text != null) ? text : "";  // no content
}
 
Example #10
Source File: GangliaHttpParser.java    From Hue-Ctrip-DI with MIT License 6 votes vote down vote up
/**
 * Fetches the Ganglia metric page for a cluster and returns the text of every
 * option found directly under a <code>select</code> element whose id is
 * "metrics-picker".
 * <p>
 * The page URL is built by replacing every match of <code>clusterPattern</code>
 * in <code>gangliaMetricUrl</code> with <code>clusterName</code>.
 *
 * @param clusterName cluster name substituted into the metric URL
 * @return option texts in document order; empty if no matching select or option exists
 * @throws ParserException if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is not valid
 * @throws IOException if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
		throws ParserException, MalformedURLException, IOException {
	String url = gangliaMetricUrl.replaceAll(clusterPattern, clusterName);
	Parser parser = new Parser(new URL(url).openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);

	List<String> metricList = new ArrayList<String>();
	SimpleNodeIterator iterator = nodeList.elements();
	while (iterator.hasMoreNodes()) {
		// getChildren() returns null for a node without children.
		NodeList children = iterator.nextNode().getChildren();
		if (children == null) {
			continue;
		}

		SimpleNodeIterator childIterator = children.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// Children may include non-option nodes (e.g. whitespace text between
			// options); skip them instead of failing on the cast.
			if (child instanceof OptionTag) {
				metricList.add(((OptionTag) child).getOptionText());
			}
		}
	}

	return metricList;
}
 
Example #11
Source File: HtmlTextParser.java    From onboard with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the plain text of an HTML string.
 *
 * @param htmlStr the HTML to convert
 * @return the extracted text, or an empty string if parsing fails
 */
public static String getPlainText(String htmlStr) {
    Parser htmlParser = new Parser();
    try {
        htmlParser.setInputHTML(htmlStr);

        StringBean extractor = new StringBean();
        // Do not include the links contained in the page.
        extractor.setLinks(false);
        // Replace non-breaking spaces with regular spaces.
        extractor.setReplaceNonBreakingSpaces(true);
        // Collapse each run of whitespace into a single space.
        extractor.setCollapse(true);

        htmlParser.visitAllNodesWith(extractor);
        return extractor.getStrings();
    } catch (ParserException e) {
        e.printStackTrace();
        return "";
    }
}
 
Example #12
Source File: LexParser.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Drives the lexer to the end of the document, reporting each node to
 * <code>obs</code> and optionally echoing it to a writer.
 * <p>
 * Remark nodes, text nodes (split into style, script and plain text according
 * to the lexer's inCSS()/inJS() state) and tags (empty, close, open) each go to
 * their own handler. Document start and completion are signalled before and
 * after the loop.
 *
 * @param lex the lexer supplying nodes
 * @param w if not <code>null</code>, receives <code>toHtml(true)</code> of every node
 * @throws ParserException if the lexer fails
 * @throws IOException if writing to <code>w</code> fails
 */
public void doParse(CDATALexer lex, Writer w) throws ParserException, IOException {
	obs.handleDocumentStart();
	for (Node node = lex.nextNode(); node != null; node = lex.nextNode()) {
		if (isRemarkNode(node)) {
			obs.handleRemarkNode((RemarkNode) node);
		} else if (isTextNode(node)) {
			final TextNode text = (TextNode) node;
			if (lex.inCSS()) {
				obs.handleStyleNode(text);
			} else if (lex.inJS()) {
				obs.handleScriptNode(text);
			} else {
				obs.handleTextNode(text);
			}
		} else {
			// Anything that is neither remark nor text is treated as a tag.
			final TagNode tag = (TagNode) node;
			if (tag.isEmptyXmlTag()) {
				obs.handleTagEmpty(tag);
			} else if (tag.isEndTag()) {
				obs.handleTagClose(tag);
			} else {
				obs.handleTagOpen(tag);
			}
		}
		// Echo the node after the observer has handled it.
		if (w != null) {
			w.write(node.toHtml(true));
		}
	}
	obs.handleDocumentComplete();
}
 
Example #13
Source File: CDATALexerTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Lexes a plain anchor element and asserts the node sequence open A tag (with
 * HREF attribute "foo"), text node "blem", close A tag, followed by
 * <code>null</code> at end of input.
 *
 * @throws ParserException if the lexer fails
 */
public void testNextNode() throws ParserException {
	// l and n are not declared here; presumably fields of the test class.
	l = makeLexer("<a href=\"foo\">blem</a>");
	// First node: the opening <a> tag; neither CSS nor JS flag is set.
	n = l.nextNode();
	assertFalse(l.inCSS());
	assertFalse(l.inJS());
	assertTrue(NodeUtils.isNonEmptyOpenTagNodeNamed(n, "A"));
	assertEquals("foo",((TagNode)n).getAttribute("HREF"));
	// Second node: the link text.
	n = l.nextNode();
	assertTrue(NodeUtils.isTextNode(n));
	assertEquals("blem",((TextNode)n).getText());
	// Third node: the closing </a> tag.
	n = l.nextNode();
	assertTrue(NodeUtils.isCloseTagNodeNamed(n, "A"));
	// No further nodes.
	assertNull(l.nextNode());
}
 
Example #14
Source File: SendMailService.java    From cs-actions with Apache License 2.0 5 votes vote down vote up
/**
 * If the input body is present and contains the {@code Encodings.BASE64} marker,
 * parses it as HTML, runs an {@code HtmlImageNodeVisitor} over all nodes, stores
 * the resulting HTML back as the body and adds the images collected by the
 * visitor to the given multipart. Otherwise does nothing.
 *
 * @param multipart the multipart that receives the collected base64 images
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart) throws ParserException,
        MessagingException, NoSuchAlgorithmException, SMIMEException, java.security.NoSuchProviderException {
    if (null == input.getBody() || !input.getBody().contains(Encodings.BASE64)) {
        return;
    }

    NodeList bodyNodes = new Parser(input.getBody()).parse(null);
    HtmlImageNodeVisitor imageVisitor = new HtmlImageNodeVisitor();
    bodyNodes.visitAllNodesWith(imageVisitor);
    // Serialize the node list back to HTML after the visitor has run over it.
    input.setBody(bodyNodes.toHtml());

    addAllBase64ImagesToMimeMultipart(multipart, imageVisitor.getBase64Images());
}
 
Example #15
Source File: Spider.java    From PADListener with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Extracts link targets from a parsed page and passes each one to
 * {@code processLink}.
 * <p>
 * Nodes carrying an href, src, onclick or onblur attribute are selected, but
 * only the values of "src" and "href" are followed. A ParserException during
 * iteration is logged as a warning and ends processing.
 *
 * @param base the URL passed to processLink alongside each link value
 * @param nodelist the parsed nodes of the page
 */
private void processHtml(HttpUrl base, NodeList nodelist) {
    NodeFilter filter = new HasAttributeFilter("href");
    filter = new OrFilter(filter, new HasAttributeFilter("src"));
    filter = new OrFilter(filter, new HasAttributeFilter("onclick"));
    filter = new OrFilter(filter, new HasAttributeFilter("onblur"));
    try {
        NodeList links = nodelist.extractAllNodesThatMatch(filter);
        for (NodeIterator ni = links.elements(); ni.hasMoreNodes(); ) {
            Node node = ni.nextNode();
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            String src = tag.getAttribute("src");
            if (src != null) {
                processLink(base, src);
            }
            String href = tag.getAttribute("href");
            if (href != null) {
                processLink(base, href);
            }
        }
    } catch (ParserException pe) {
        _logger.warning("ParserException : " + pe);
    }
}
 
Example #16
Source File: CmsKeywordMngImpl.java    From Lottery with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Replaces keyword names with their configured replacement inside the text
 * nodes of an HTML fragment; all other nodes are copied through unchanged.
 * <p>
 * The search strings are the keywords' names and the replacements are the values
 * returned by their getUrl(). Blank input, or a site without keywords, is
 * returned as-is.
 *
 * @param siteId site whose keyword list is used
 * @param txt the HTML fragment to process
 * @return the fragment with keyword replacements applied to text nodes
 * @throws RuntimeException wrapping the ParserException if lexing fails
 */
@Transactional(readOnly = true)
public String attachKeyword(Integer siteId, String txt) {
	if (StringUtils.isBlank(txt)) {
		return txt;
	}
	// NOTE(review): meaning of the two boolean flags is not visible here.
	List<CmsKeyword> keywords = getListBySiteId(siteId, true, true);
	final int count = keywords.size();
	if (count <= 0) {
		return txt;
	}

	// Parallel arrays: names[i] is replaced by replacements[i].
	String[] names = new String[count];
	String[] replacements = new String[count];
	int idx = 0;
	for (CmsKeyword keyword : keywords) {
		names[idx] = keyword.getName();
		replacements[idx] = keyword.getUrl();
		idx++;
	}

	try {
		Lexer lexer = new Lexer(txt);
		StringBuilder out = new StringBuilder((int) (txt.length() * 1.2));
		for (Node node = lexer.nextNode(); node != null; node = lexer.nextNode()) {
			String html = node.toHtml();
			if (node instanceof TextNode) {
				html = StringUtils.replaceEach(html, names, replacements);
			}
			out.append(html);
		}
		return out.toString();
	} catch (ParserException e) {
		throw new RuntimeException(e);
	}
}
 
Example #17
Source File: LexParser.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the whole document from the lexer without echoing nodes to a writer;
 * delegates to the two-argument overload with a <code>null</code> writer.
 *
 * @param lex the lexer supplying nodes
 * @throws ParserException if the lexer fails
 * @throws IOException declared to match the delegate's signature
 */
public void doParse(CDATALexer lex) throws ParserException, IOException {
	doParse(lex, (Writer) null);
}