org.htmlparser.Node Java Examples

The following examples show how to use org.htmlparser.Node. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: StrUtils.java    From Lottery with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Collects the text nodes of an HTML fragment into a single string.
 * <p>
 * Lexing stops as soon as the collected text is longer than {@code len}.
 * The result is not cut to size, so it can be longer than {@code len}.
 *
 * @param html the HTML markup to read
 * @param len  the length threshold after which no further nodes are read
 * @return the concatenated markup of the text nodes visited
 * @throws RuntimeException wrapping any {@link ParserException} from the lexer
 */
public static String html2Text(String html, int len) {
	try {
		Lexer lexer = new Lexer(html);
		StringBuilder text = new StringBuilder(html.length());
		for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
			if (current instanceof TextNode) {
				text.append(current.toHtml());
			}
			if (text.length() > len) {
				break;
			}
		}
		return text.toString();
	} catch (ParserException e) {
		throw new RuntimeException(e);
	}
}
 
Example #2
Source File: HTMLForm.java    From navex with GNU General Public License v3.0 6 votes vote down vote up
public HTMLForm(Node form, String url,  String commonJS) {
	this.form = form;

	this.formName = ((FormTag) form).getFormName();
	if (this.formName == null)
		this.formName = "form_"+Integer.toString(formCounter);
	this.url = url; 
	this.z3FormFormulas = new HashSet<Formula>();

	this.commonJS = commonJS;
	this.jsValidation = new String("");
	this.domRepresentation = new String();
	this.windowRepresentation = new String();
	this.helperFuns = new String();

}
 
Example #3
Source File: GridUtils.java    From dhis2-core with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Returns the number of columns in the given row, counting each matching
 * cell as many times as its {@code colspan} attribute says.
 * <p>
 * A cell whose {@code colspan} does not yield an Integer (presumably missing
 * or non-numeric; depends on {@code MathUtils.parseInt}) counts as one column.
 * A row without any children yields 0.
 *
 * @param row the table row to inspect.
 * @return the total column count of the row.
 */
private static int getColumnCount( TableRow row )
{
    // htmlparser reports "no children" as null rather than an empty list.
    if ( row.getChildren() == null )
    {
        return 0;
    }

    Node[] cells = row.getChildren().extractAllNodesThatMatch( HTML_ROW_FILTER ).toNodeArray();

    int cols = 0;

    for ( Node cell : cells )
    {
        Integer colSpan = MathUtils.parseInt( ((TagNode) cell).getAttribute( "colspan" ) );

        cols += colSpan != null ? colSpan : 1;
    }

    return cols;
}
 
Example #4
Source File: GridUtils.java    From dhis2-core with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Retrieves the value of a table cell by appending the text of its child
 * nodes. For composite tags such as span or div the inner text is appended.
 * <p>
 * The result is trimmed and then has every {@code &nbsp;} entity replaced
 * with {@code EMPTY}. A cell without children contributes no text.
 *
 * @param cell the table cell to read.
 * @return the collected cell text.
 */
public static String getValue( TagNode cell )
{
    StringBuilder builder = new StringBuilder();

    // htmlparser reports "no children" as null rather than an empty list.
    if ( cell.getChildren() != null )
    {
        for ( Node child : cell.getChildren().toNodeArray() )
        {
            if ( child instanceof CompositeTag )
            {
                builder.append( ((CompositeTag) child).getStringText() );
            }
            else
            {
                builder.append( child.getText() );
            }
        }
    }

    return builder.toString().trim().replaceAll( "&nbsp;", EMPTY );
}
 
Example #5
Source File: HTMLParser.java    From PADListener with GNU General Public License v2.0 6 votes vote down vote up
/**
   * Parses the body of an HTML message into its nodes.
   * See <a href="http://htmlparser.sourceforge.net/">htmlparser</a> for details.
   * <p>
   * Returns {@code null} when the Content-Type header is missing or does not
   * start with {@code text/html}, when the body is missing or empty, or when
   * parsing fails (the failure is logged at severe level).
   *
   * @param url the url that the message resulted from (not used by this method)
   * @param message the Message to parse
   * @return a NodeList holding every node of the page, or null as described above
   */
  public Object parseMessage(HttpUrl url, Message message) {
      String contentType = message.getHeader("Content-Type");
      boolean isHtml = contentType != null && contentType.matches("text/html.*");
      if (!isHtml) {
          return null;
      }

      byte[] content = message.getContent();
      boolean hasBody = content != null && content.length != 0;
      if (!hasBody) {
          return null;
      }

      Parser parser = Parser.createParser(new String(content), null);
      // Accept every node so the complete page is returned.
      NodeFilter acceptAll = new NodeFilter() {
          public boolean accept(Node node) {
              return true;
          }
      };
      try {
          return parser.extractAllNodesThatMatch(acceptAll);
      } catch (ParserException pe) {
          _logger.severe(pe.toString());
          return null;
      }
  }
 
Example #6
Source File: CDATALexer.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the next node and tracks whether it is the content of a SCRIPT or
 * STYLE element.
 * <p>
 * When the parent lexer yields a non-empty opening SCRIPT or STYLE tag, the
 * CDATA that follows is read at once and held back. The next call hands out
 * that held-back node, with {@code inJS} or {@code inCSS} set to match. For
 * every other node both flags are false.
 */
@Override
public Node nextNode() throws ParserException {
	inJS = false;
	inCSS = false;

	if(cached == null) {
		Node next = super.nextNode();
		boolean opensScript = NodeUtils.isNonEmptyOpenTagNodeNamed(next, "SCRIPT");
		if(opensScript || NodeUtils.isNonEmptyOpenTagNodeNamed(next, "STYLE")) {
			// Read the element content now; it is returned by the following call.
			cached = super.parseCDATA(true);
			cachedJS = opensScript;
		}
		return next;
	}

	// Hand out the content node held back by the previous call.
	Node pending = cached;
	cached = null;
	inJS = cachedJS;
	inCSS = !cachedJS;
	return pending;
}
 
Example #7
Source File: GangliaHttpParser.java    From Hue-Ctrip-DI with MIT License 6 votes vote down vote up
/**
 * Fetches the Ganglia metric page for a cluster and returns the option texts
 * of every {@code <select id="metrics-picker">} element on it.
 * <p>
 * The page URL is {@code gangliaMetricUrl} with {@code clusterPattern}
 * replaced by the cluster name. Children of the select that are not option
 * tags (for example whitespace text between options) are skipped.
 *
 * @param clusterName the cluster name substituted into the URL
 * @return the option texts in document order; empty when nothing matches
 * @throws ParserException       if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is invalid
 * @throws IOException           if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
		throws ParserException, MalformedURLException, IOException {
	String url = gangliaMetricUrl.replaceAll(clusterPattern, clusterName);
	Parser parser = new Parser(new URL(url).openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
	SimpleNodeIterator iterator = nodeList.elements();
	List<String> metricList = new ArrayList<String>();
	while (iterator.hasMoreNodes()) {
		Node node = iterator.nextNode();

		// htmlparser reports "no children" as null rather than an empty list.
		NodeList options = node.getChildren();
		if (options == null) {
			continue;
		}

		SimpleNodeIterator childIterator = options.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// A select may also contain text nodes; casting those would throw.
			if (child instanceof OptionTag) {
				metricList.add(((OptionTag) child).getOptionText());
			}
		}
	}

	return metricList;

}
 
Example #8
Source File: TestGangliaHttpParser.java    From Hue-Ctrip-DI with MIT License 6 votes vote down vote up
/**
 * Manual check: fetches a fixed Ganglia page and prints the option texts of
 * every {@code <select id="metrics-picker">} element on it.
 * <p>
 * Children of the select that are not option tags (for example whitespace
 * text between options) are skipped.
 *
 * @param args not used
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
	Parser parser = new Parser(new URL("http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false").openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
	SimpleNodeIterator iterator = nodeList.elements();
	while (iterator.hasMoreNodes()) {
		Node node = iterator.nextNode();

		// htmlparser reports "no children" as null rather than an empty list.
		NodeList options = node.getChildren();
		if (options == null) {
			continue;
		}

		SimpleNodeIterator childIterator = options.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// A select may also contain text nodes; casting those would throw.
			if (child instanceof OptionTag) {
				System.out.println(((OptionTag) child).getOptionText());
			}
		}
	}

}
 
Example #9
Source File: LexParser.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Drives the lexer to the end of the document and reports every node to the
 * observer.
 * <p>
 * Remark nodes and text nodes have their own callbacks; text is reported as
 * style, script or plain text depending on the lexer state. Every other node
 * is treated as a tag and reported as empty, closing or opening. When a
 * writer is given, each node's markup is also written to it.
 *
 * @param lex the lexer to read nodes from
 * @param w   optional sink for the node markup; may be null
 * @throws ParserException if the lexer fails
 * @throws IOException     if writing to {@code w} fails
 */
public void doParse(CDATALexer lex, Writer w) throws ParserException, IOException {
	obs.handleDocumentStart();
	for (Node node = lex.nextNode(); node != null; node = lex.nextNode()) {
		if(isRemarkNode(node)) {
			obs.handleRemarkNode((RemarkNode) node);
		} else if(isTextNode(node)) {
			TextNode text = (TextNode) node;
			if(lex.inCSS()) {
				obs.handleStyleNode(text);
			} else if(lex.inJS()) {
				obs.handleScriptNode(text);
			} else {
				obs.handleTextNode(text);
			}
		} else {
			TagNode tag = (TagNode) node;
			if(tag.isEmptyXmlTag()) {
				obs.handleTagEmpty(tag);
			} else if(tag.isEndTag()) {
				obs.handleTagClose(tag);
			} else {
				obs.handleTagOpen(tag);
			}
		}
		if(w != null) {
			w.write(node.toHtml(true));
		}
	}
	obs.handleDocumentComplete();
}
 
Example #10
Source File: CmsKeywordMngImpl.java    From Lottery with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Replaces keyword names with their URLs inside the text nodes of an HTML
 * fragment. Tags and other non-text nodes are copied through unchanged.
 * <p>
 * Blank input, or a site without keywords, returns the input as is.
 *
 * @param siteId the site whose keyword list is used
 * @param txt    the HTML fragment to process
 * @return the fragment with keywords replaced in its text nodes
 * @throws RuntimeException wrapping any {@link ParserException} from the lexer
 */
@Transactional(readOnly = true)
public String attachKeyword(Integer siteId, String txt) {
	if (StringUtils.isBlank(txt)) {
		return txt;
	}
	List<CmsKeyword> keywords = getListBySiteId(siteId, true, true);
	int total = keywords.size();
	if (total <= 0) {
		return txt;
	}

	// Parallel arrays: names[n] is replaced by urls[n].
	String[] names = new String[total];
	String[] urls = new String[total];
	int pos = 0;
	for (CmsKeyword keyword : keywords) {
		names[pos] = keyword.getName();
		urls[pos] = keyword.getUrl();
		pos++;
	}

	try {
		Lexer lexer = new Lexer(txt);
		StringBuilder out = new StringBuilder((int) (txt.length() * 1.2));
		for (Node node = lexer.nextNode(); node != null; node = lexer.nextNode()) {
			String markup = node.toHtml();
			out.append(node instanceof TextNode
					? StringUtils.replaceEach(markup, names, urls)
					: markup);
		}
		return out.toString();
	} catch (ParserException e) {
		throw new RuntimeException(e);
	}
}
 
Example #11
Source File: NodeUtils.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether the node is a closing tag with the given name.
 * <p>
 * The name is upper-cased with {@code Locale.ROOT} before the comparison, so
 * the result does not depend on the default locale of the JVM.
 *
 * @param node the node to test; any non-tag node yields false
 * @param name the tag name to look for, in any letter case
 * @return true if the node is an end tag whose tag name matches
 */
public static boolean isCloseTagNodeNamed(Node node, String name) {
	if(isTagNode(node)) {
		TagNode tagNode = (TagNode) node;
		if(tagNode.isEndTag()) {
			String nodeName = tagNode.getTagName();
			// Comparing from the wanted name also tolerates a null tag name.
			return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
		}
	}
	return false;
}
 
Example #12
Source File: NodeUtils.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether the node is an opening tag with the given name that is not
 * an empty XML tag.
 * <p>
 * The name is upper-cased with {@code Locale.ROOT} before the comparison, so
 * the result does not depend on the default locale of the JVM.
 *
 * @param node the node to test; any non-tag node yields false
 * @param name the tag name to look for, in any letter case
 * @return true if the node is a non-empty opening tag whose tag name matches
 */
public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) {
	if(isTagNode(node)) {
		TagNode tagNode = (TagNode) node;
		if(!tagNode.isEndTag() && !tagNode.isEmptyXmlTag()) {
			String nodeName = tagNode.getTagName();
			// Comparing from the wanted name also tolerates a null tag name.
			return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
		}
	}
	return false;
}
 
Example #13
Source File: NodeUtils.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether the node is an opening tag with the given name.
 * <p>
 * The name is upper-cased with {@code Locale.ROOT} before the comparison, so
 * the result does not depend on the default locale of the JVM.
 *
 * @param node the node to test; any non-tag node yields false
 * @param name the tag name to look for, in any letter case
 * @return true if the node is a tag that is not an end tag and whose tag name matches
 */
public static boolean isOpenTagNodeNamed(Node node, String name) {
	if(isTagNode(node)) {
		TagNode tagNode = (TagNode) node;
		if(!tagNode.isEndTag()) {
			String nodeName = tagNode.getTagName();
			// Comparing from the wanted name also tolerates a null tag name.
			return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
		}
	}
	return false;
}
 
Example #14
Source File: NodeUtils.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether the node is a tag (opening or closing) with the given name.
 * <p>
 * The name is upper-cased with {@code Locale.ROOT} before the comparison, so
 * the result does not depend on the default locale of the JVM.
 *
 * @param node the node to test; any non-tag node yields false
 * @param name the tag name to look for, in any letter case
 * @return true if the node is a tag whose tag name matches
 */
public static boolean isTagNodeNamed(Node node, String name) {
	if(isTagNode(node)) {
		TagNode tagNode = (TagNode) node;
		String nodeName = tagNode.getTagName();
		// Comparing from the wanted name also tolerates a null tag name.
		return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
	}
	return false;
}
 
Example #15
Source File: Spider.java    From PADListener with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Finds link-carrying tags in a parsed page and hands their targets to
 * {@code processLink}.
 * <p>
 * Tags are selected when they have an href, src, onclick or onblur
 * attribute, but only the src and href values are processed (src first).
 * A parser failure is logged as a warning and ends the scan.
 *
 * @param base     the URL the links are processed against
 * @param nodelist the parsed nodes of the page
 */
private void processHtml(HttpUrl base, NodeList nodelist) {
    NodeFilter filter = new HasAttributeFilter("href");
    filter = new OrFilter(filter, new HasAttributeFilter("src"));
    filter = new OrFilter(filter, new HasAttributeFilter("onclick"));
    filter = new OrFilter(filter, new HasAttributeFilter("onblur"));
    try {
        NodeList links = nodelist.extractAllNodesThatMatch(filter);
        for (NodeIterator ni = links.elements(); ni.hasMoreNodes(); ) {
            Node node = ni.nextNode();
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            String src = tag.getAttribute("src");
            if (src != null) {
                processLink(base, src);
            }
            String href = tag.getAttribute("href");
            if (href != null) {
                processLink(base, href);
            }
        }
    } catch (ParserException pe) {
        _logger.warning("ParserException : " + pe);
    }
}
 
Example #16
Source File: NodeUtils.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the node is a remark (comment) node.
 *
 * @param node the node to test; null yields false
 * @return true if the node is a {@link RemarkNode}
 */
public static boolean isRemarkNode(Node node) {
	boolean remark = node instanceof RemarkNode;
	return remark;
}
 
Example #17
Source File: NodeUtils.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the node is a text node.
 *
 * @param node the node to test; null yields false
 * @return true if the node is a {@link TextNode}
 */
public static boolean isTextNode(Node node) {
	boolean text = node instanceof TextNode;
	return text;
}
 
Example #18
Source File: NodeUtils.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the node is a tag node.
 *
 * @param node the node to test; null yields false
 * @return true if the node is a {@link TagNode}
 */
public static boolean isTagNode(Node node) {
	boolean tag = node instanceof TagNode;
	return tag;
}
 
Example #19
Source File: HTMLForm.java    From navex with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Returns the parsed node this form wraps.
 *
 * @return the form node
 */
public Node getForm() {
	return this.form;
}
 
Example #20
Source File: DouBanParsePage.java    From JewelCrawler with GNU General Public License v3.0 4 votes vote down vote up
/**
     * Extracts the links of a crawled page and queues the new ones in the
     * {@code record} table.
     * <p>
     * Nothing is queued once the table holds more than
     * {@code Constants.maxCycle} rows. A link is queued when it starts with
     * {@code Constants.MAINURL}, is not yet in the table, and matches the movie
     * or the comment pattern. New links are inserted as one batch and committed;
     * auto-commit is switched off on the connection for that and left off.
     * <p>
     * Failures while counting the rows propagate to the caller. Failures while
     * parsing, looking up or inserting links are printed and swallowed, so the
     * crawl can continue with the next page.
     *
     * @param content the HTML of the crawled page
     * @param conn    the open database connection to use; it is not closed here
     * @throws Exception if the parser cannot be created or the row count query fails
     */
public static void parseFromString(String content, Connection conn) throws Exception {

        Parser parser = new Parser(content);
        HasAttributeFilter filter = new HasAttributeFilter("href");

        int rowCount = 0;
        Statement countStmt = null;
        ResultSet countRs = null;
        try {
            countStmt = conn.createStatement();
            countRs = countStmt.executeQuery("select count(*) as rowCount from record");
            if (countRs.next()) {
                rowCount = countRs.getInt("rowCount");
            }
        } finally {
            closeResultSetQuietly(countRs);
            closeStatementQuietly(countStmt);
        }

        //once rowCount is bigger than maxCycle, the new crawled link will not insert into record table
        if (rowCount > Constants.maxCycle) {
            return;
        }

        List<String> nextLinkList = new ArrayList<String>();
        PreparedStatement existsStmt = null;
        PreparedStatement insertStmt = null;
        try {
            // Compile once per page instead of once per link.
            Pattern moviePattern = Pattern.compile(Constants.MOVIE_REGULAR_EXP);
            Pattern commentPattern = Pattern.compile(Constants.COMMENT_REGULAR_EXP);

            NodeList list = parser.parse(filter);
            int count = list.size();

            // The URL comes from crawled content, so it is bound as a parameter
            // and never concatenated into the SQL text.
            existsStmt = conn.prepareStatement("SELECT * FROM record WHERE URL = ?");

            //process every link on this page
            for (int i = 0; i < count; i++) {
                Node node = list.elementAt(i);
                if (!(node instanceof LinkTag)) {
                    continue;
                }

                String nextLink = ((LinkTag) node).extractLink();
                if (!nextLink.startsWith(Constants.MAINURL)) {
                    continue;
                }

                //check if the link already exists in the database
                if (recordExists(existsStmt, nextLink)) {
                    continue;
                }

                if (moviePattern.matcher(nextLink).find() || commentPattern.matcher(nextLink).find()) {
                    nextLinkList.add(nextLink);
                }
            }

            if (nextLinkList.size() > 0) {
                conn.setAutoCommit(false);
                //if the link does not exist in the database, insert it
                insertStmt = conn.prepareStatement("INSERT INTO record (URL, crawled) VALUES (?,0)",
                        Statement.RETURN_GENERATED_KEYS);
                for (String nextLinkStr : nextLinkList) {
                    insertStmt.setString(1, nextLinkStr);
                    insertStmt.addBatch();
                    System.out.println(nextLinkStr);
                }
                insertStmt.executeBatch();
                conn.commit();
            }
        } catch (Exception e) {
            // Best effort: report the failure and let the crawl go on.
            e.printStackTrace();
            System.out.println("SQLException: " + e.getMessage());
        } finally {
            closeStatementQuietly(insertStmt);
            closeStatementQuietly(existsStmt);
        }
    }

    /** Runs the prepared existence query for one URL and reports whether a row came back. */
    private static boolean recordExists(PreparedStatement existsStmt, String url) throws SQLException {
        existsStmt.setString(1, url);
        ResultSet rs = existsStmt.executeQuery();
        try {
            return rs.next();
        } finally {
            closeResultSetQuietly(rs);
        }
    }

    /** Closes a statement if it is not null; a failure to close is ignored on purpose. */
    private static void closeStatementQuietly(Statement stmt) {
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException ignored) {
                // nothing useful can be done when closing fails
            }
        }
    }

    /** Closes a result set if it is not null; a failure to close is ignored on purpose. */
    private static void closeResultSetQuietly(ResultSet rs) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException ignored) {
                // nothing useful can be done when closing fails
            }
        }
    }
 
Example #21
Source File: HTMLForm.java    From navex with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Replaces the parsed node this form wraps.
 *
 * @param form the new form node
 */
public void setForm(final Node form) {
	this.form = form;
}