org.htmlparser.Parser Java Examples

The following examples show how to use org.htmlparser.Parser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlTextParser.java    From onboard with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the text content of an HTML string using a {@code StringBean} visitor.
 *
 * @param htmlStr the HTML markup to convert
 * @return whatever {@code StringBean.getStrings()} yields after the visit, or an
 *         empty string when a {@code ParserException} was raised (its stack
 *         trace is printed)
 */
public static String getPlainText(String htmlStr) {
    Parser htmlParser = new Parser();
    try {
        htmlParser.setInputHTML(htmlStr);

        StringBean textCollector = new StringBean();
        // do not include the links contained in the page
        textCollector.setLinks(false);
        // replace non-breaking spaces with regular spaces
        textCollector.setReplaceNonBreakingSpaces(true);
        // collapse each run of whitespace into a single space
        textCollector.setCollapse(true);

        htmlParser.visitAllNodesWith(textCollector);
        return textCollector.getStrings();
    } catch (ParserException e) {
        e.printStackTrace();
        return "";
    }
}
 
Example #2
Source File: HTMLParser.java    From PADListener with GNU General Public License v2.0 6 votes vote down vote up
/**
   * Parses the body of the message and returns a parsed representation.
   * See <a href="http://htmlparser.sourceforge.net/">HTML Parser</a> for details.
   * <p>
   * The body bytes are decoded with the charset declared in the message's
   * {@code Content-Type} header when there is one the JVM supports; otherwise
   * the platform default charset is used.
   *
   * @param url the url that the message resulted from
   * @param message the Message to parse
   * @return a NodeList containing the various Nodes making up the page, or
   *         {@code null} if the message is not {@code text/html}, has no
   *         content, or could not be parsed
   */
  public Object parseMessage(HttpUrl url, Message message) {
      String contentType = message.getHeader("Content-Type");
      if (contentType == null || !contentType.matches("text/html.*")) {
          return null;
      }
      byte[] content = message.getContent();
      if (content == null || content.length == 0) {
          return null;
      }
      Parser parser = Parser.createParser(decodeBody(content, contentType), null);
      try {
          // Accept-all filter: every node of the page ends up in the list.
          NodeList nodelist = parser.extractAllNodesThatMatch(new NodeFilter() {
              public boolean accept(Node node) {
                  return true;
              }
          });
          return nodelist;
      } catch (ParserException pe) {
          _logger.severe(pe.toString());
          return null;
      }
  }

  /**
   * Decodes a message body using the {@code charset} parameter of its
   * Content-Type header, falling back to the platform default charset when
   * the parameter is absent or names a charset the JVM does not support.
   */
  private static String decodeBody(byte[] content, String contentType) {
      java.util.regex.Matcher declared = java.util.regex.Pattern
              .compile("charset\\s*=\\s*\"?([^\\s;\"]+)", java.util.regex.Pattern.CASE_INSENSITIVE)
              .matcher(contentType);
      if (declared.find()) {
          try {
              return new String(content, declared.group(1));
          } catch (java.io.UnsupportedEncodingException unsupported) {
              // Unknown or malformed charset name: use the default decoding below.
          }
      }
      return new String(content);
  }
 
Example #3
Source File: GangliaHttpParser.java    From Hue-Ctrip-DI with MIT License 6 votes vote down vote up
/**
 * Downloads the Ganglia page for a cluster and returns the text of every
 * option found under a {@code <select id="metrics-picker">} element.
 * <p>
 * The page URL is {@code gangliaMetricUrl} with each match of
 * {@code clusterPattern} replaced by the cluster name.
 *
 * @param clusterName name of the cluster, inserted literally into the URL
 * @return the option texts in document order; empty if no matching select
 *         element (or no option inside it) was found
 * @throws ParserException if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is not valid
 * @throws IOException if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
		throws ParserException, MalformedURLException, IOException {
	// Quote the replacement so a '$' or '\' in the cluster name is not
	// interpreted as a group reference by replaceAll.
	String url = gangliaMetricUrl.replaceAll(clusterPattern,
			java.util.regex.Matcher.quoteReplacement(clusterName));
	Parser parser = new Parser(new URL(url).openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
	SimpleNodeIterator iterator = nodeList.elements();
	List<String> metricList = new ArrayList<String>();
	while (iterator.hasMoreNodes()) {
		NodeList children = iterator.nextNode().getChildren();
		if (children == null) {
			// A select element without any content has no child list.
			continue;
		}

		SimpleNodeIterator childIterator = children.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// Children may also be text nodes (e.g. whitespace between the
			// option tags); only real options carry a metric name.
			if (child instanceof OptionTag) {
				metricList.add(((OptionTag) child).getOptionText());
			}
		}
	}

	return metricList;

}
 
Example #4
Source File: TestGangliaHttpParser.java    From Hue-Ctrip-DI with MIT License 6 votes vote down vote up
/**
 * Manual check: fetches a fixed Ganglia page and prints the text of every
 * option found under a {@code <select id="metrics-picker">} element.
 *
 * @param args unused
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
	Parser parser = new Parser(new URL("http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false").openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
	SimpleNodeIterator iterator = nodeList.elements();
	while (iterator.hasMoreNodes()) {
		NodeList children = iterator.nextNode().getChildren();
		if (children == null) {
			// A select element without any content has no child list.
			continue;
		}

		SimpleNodeIterator childIterator = children.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// Skip text nodes (e.g. whitespace between option tags) instead
			// of failing on the cast.
			if (child instanceof OptionTag) {
				System.out.println(((OptionTag) child).getOptionText());
			}
		}
	}

}
 
Example #5
Source File: HTMLConverter.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Turns an HTML document into plain text.
 * 
 * @param html the HTML document
 * @return the extracted text (empty string when the extractor yields nothing),
 *         or <code>null</code> if the conversion failed
 */
public static synchronized String html2text(String html) {
	// text extractor: no links, regular spaces instead of non-breaking
	// ones, whitespace runs collapsed
	StringBean textExtractor = new StringBean();
	textExtractor.setLinks(false);
	textExtractor.setReplaceNonBreakingSpaces(true);
	textExtractor.setCollapse(true);

	Parser htmlParser = new Parser();
	try {
		htmlParser.setInputHTML(html);
		htmlParser.visitAllNodesWith(textExtractor);
	} catch (ParserException e) {
		return null;
	}

	String extracted = textExtractor.getStrings();
	// no content is reported as an empty string, not null
	return (extracted != null) ? extracted : "";
}
 
Example #6
Source File: HTMLConverter.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Reads an HTML document from a file and converts it into plain text.
 * 
 * @param filename name of file containing HTML documents
 * @return plain text (an empty string if the document yields no text) or
 *         <code>null</code> if the reading or conversion failed
 */
public static synchronized String file2text(String filename) {
	// read from file and convert HTML document
	StringBean sb = new StringBean();
	sb.setLinks(false);  // no links
	sb.setReplaceNonBreakingSpaces(true);  // replace non-breaking spaces
	sb.setCollapse(true);  // replace sequences of whitespaces
	Parser parser = new Parser();
	try {
		parser.setResource(filename);
		parser.visitAllNodesWith(sb);
	} catch (ParserException e) {
		return null;
	}
	String docText = sb.getStrings();
	
	// Keep null reserved for failures, as html2text does: a document
	// without any text is reported as an empty string.
	if (docText == null) docText = "";  // no content
	
	return docText;
}
 
Example #7
Source File: SendMailService.java    From cs-actions with Apache License 2.0 5 votes vote down vote up
/**
 * When the input body contains the {@code Encodings.BASE64} marker, parses
 * the body, runs an {@code HtmlImageNodeVisitor} over all nodes, stores the
 * re-serialized HTML back as the body and hands the images gathered by the
 * visitor to {@code addAllBase64ImagesToMimeMultipart}. Does nothing when the
 * body is null or lacks the marker.
 *
 * @param multipart the multipart that receives the collected images
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart) throws ParserException,
        MessagingException, NoSuchAlgorithmException, SMIMEException, java.security.NoSuchProviderException {
    final String body = input.getBody();
    if (body == null || !body.contains(Encodings.BASE64)) {
        return;
    }

    final HtmlImageNodeVisitor imageVisitor = new HtmlImageNodeVisitor();
    final NodeList parsedBody = new Parser(body).parse(null);
    parsedBody.visitAllNodesWith(imageVisitor);
    input.setBody(parsedBody.toHtml());

    addAllBase64ImagesToMimeMultipart(multipart, imageVisitor.getBase64Images());
}
 
Example #8
Source File: DouBanParsePage.java    From JewelCrawler with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Extracts the links of a crawled page and queues the new ones in the
 * {@code record} table.
 * <p>
 * Nothing is inserted once the table holds more than {@code Constants.maxCycle}
 * rows. A link is queued when it starts with {@code Constants.MAINURL}, matches
 * the movie or the comment pattern, and is not yet present in the table.
 * Failures while parsing or storing the links are printed and not rethrown;
 * failures while constructing the parser or counting the rows propagate.
 *
 * @param content the page to parse, handed to the {@code Parser} constructor
 * @param conn    open database connection; it is switched to manual commit
 *                when links are inserted and is left in that mode
 * @throws Exception if the parser cannot be created or the row count fails
 */
public static void parseFromString(String content, Connection conn) throws Exception {
    Parser parser = new Parser(content);
    HasAttributeFilter filter = new HasAttributeFilter("href");

    //once rowCount is bigger than maxCycle, the new crawled link will not insert into record table
    if (countRecords(conn) > Constants.maxCycle) {
        return;
    }

    try {
        List<String> nextLinkList = collectNewLinks(parser.parse(filter), conn);
        if (nextLinkList.size() > 0) {
            insertLinks(conn, nextLinkList);
        }
    } catch (Exception e) {
        //handle the exceptions
        e.printStackTrace();
        System.out.println("SQLException: " + e.getMessage());
    }
}

/** Returns the number of rows currently stored in the record table. */
private static int countRecords(Connection conn) throws SQLException {
    Statement stmt = null;
    ResultSet rs = null;
    try {
        stmt = conn.createStatement();
        rs = stmt.executeQuery("select count(*) as rowCount from record");
        return rs.next() ? rs.getInt("rowCount") : 0;
    } finally {
        closeQuietly(rs);
        closeQuietly(stmt);
    }
}

/** Returns the links in the node list that qualify for crawling and are not in the record table yet. */
private static List<String> collectNewLinks(NodeList list, Connection conn) throws SQLException {
    List<String> newLinks = new ArrayList<String>();
    // Compile once per page instead of once per link.
    Pattern moviePattern = Pattern.compile(Constants.MOVIE_REGULAR_EXP);
    Pattern commentPattern = Pattern.compile(Constants.COMMENT_REGULAR_EXP);

    // Bound parameter: the URL comes from the crawled page and must never
    // be concatenated into the SQL text.
    PreparedStatement lookup = conn.prepareStatement("SELECT 1 FROM record WHERE URL = ?");
    try {
        int count = list.size();
        //process every link on this page
        for (int i = 0; i < count; i++) {
            Node node = list.elementAt(i);
            if (!(node instanceof LinkTag)) {
                continue;
            }
            String nextLink = ((LinkTag) node).extractLink();
            if (!nextLink.startsWith(Constants.MAINURL)) {
                continue;
            }
            // Cheap regex test first; the database is only asked about
            // links that could be queued at all.
            boolean wanted = moviePattern.matcher(nextLink).find()
                    || commentPattern.matcher(nextLink).find();
            if (wanted && !isRecorded(lookup, nextLink)) {
                newLinks.add(nextLink);
            }
        }
    } finally {
        closeQuietly(lookup);
    }
    return newLinks;
}

/** Tells whether the given URL already has a row in the record table. */
private static boolean isRecorded(PreparedStatement lookup, String url) throws SQLException {
    lookup.setString(1, url);
    ResultSet rs = lookup.executeQuery();
    try {
        return rs.next();
    } finally {
        closeQuietly(rs);
    }
}

/** Inserts the links as uncrawled records in one transaction, rolling back if the batch fails. */
private static void insertLinks(Connection conn, List<String> links) throws SQLException {
    conn.setAutoCommit(false);
    PreparedStatement insert = null;
    try {
        insert = conn.prepareStatement("INSERT INTO record (URL, crawled) VALUES (?,0)",
                Statement.RETURN_GENERATED_KEYS);
        for (String link : links) {
            insert.setString(1, link);
            insert.addBatch();
            System.out.println(link);
        }
        insert.executeBatch();
        conn.commit();
    } catch (SQLException e) {
        try {
            // Do not leave a half-applied batch pending on the connection.
            conn.rollback();
        } catch (SQLException ignored) {
            // The original failure is the one worth reporting.
        }
        throw e;
    } finally {
        closeQuietly(insert);
    }
}

/** Closes the statement, ignoring a null argument and any failure while closing. */
private static void closeQuietly(Statement stmt) {
    if (stmt != null) {
        try {
            stmt.close();
        } catch (SQLException ignored) {
            // Nothing useful can be done about a failed close.
        }
    }
}

/** Closes the result set, ignoring a null argument and any failure while closing. */
private static void closeQuietly(ResultSet rs) {
    if (rs != null) {
        try {
            rs.close();
        } catch (SQLException ignored) {
            // Nothing useful can be done about a failed close.
        }
    }
}