org.htmlcleaner.XPatherException Java Examples

The following examples show how to use org.htmlcleaner.XPatherException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlUtil.java    From ispider with Apache License 2.0 6 votes vote down vote up
/**
 * Collects URLs from the nodes selected by an XPath expression.
 *
 * <p>For every selected {@code TagNode}, the value of attribute {@code attr} is read and
 * prefixed with {@code "https:"}. Selected objects that are not tag nodes, and nodes that
 * do not carry the attribute, are skipped.
 *
 * @param tagNode node against which the XPath expression is evaluated
 * @param attr    name of the attribute holding the URL
 * @param xpath   XPath expression selecting the nodes
 * @return the collected URLs; never {@code null}, empty when nothing matches or when the
 *         XPath expression cannot be evaluated
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
    List<String> urls = new ArrayList<>();
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        if (objs == null) {
            return urls;
        }
        for (Object obj : objs) {
            // An XPath may select non-element results; only tag nodes carry attributes.
            if (!(obj instanceof TagNode)) {
                continue;
            }
            String url = ((TagNode) obj).getAttributeByName(attr);
            // A missing attribute would otherwise be stored as the literal "https:null".
            if (url != null) {
                urls.add("https:" + url);
            }
        }
    } catch (XPatherException e) {
        // Best effort: report the failure and hand back an empty list so callers can
        // use the result (e.g. addAll) without a null check.
        e.printStackTrace();
    }
    return urls;
}
 
Example #2
Source File: UserUtil.java    From BigData with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Parses a "following" page and every further page of its pagination.
 *
 * <p>The first page is fetched and passed to {@code extractUserUrl}. The page count is then
 * read from the second-to-last pagination button and pages {@code 2..pagenum} are fetched
 * via {@code followUrl + "?page=" + i} and processed the same way.
 *
 * @param followUrl URL of the first "following" page
 */
public static void processFollow(String followUrl) {
	String content = PageUtil.getContent(followUrl);
	HtmlCleaner htmlCleaner = new HtmlCleaner();
	TagNode tNode = htmlCleaner.clean(content);
	extractUserUrl(content);
	try {
		Object[] pageNumObj = tNode
				.evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
		// The page count is read from the second-to-last button, so at least two buttons
		// are required; with fewer, index length - 2 would be out of bounds.
		if (pageNumObj == null || pageNumObj.length < 2) {
			return;
		}
		TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
		// Trim first: parseInt rejects surrounding whitespace in the button text.
		int pagenum = Integer.parseInt(node.getText().toString().trim());
		for (int i = 2; i <= pagenum; i++) {
			String url = followUrl + "?page=" + i;
			content = PageUtil.getContent(url);
			extractUserUrl(content);
		}
	} catch (XPatherException e) {
		// Pass the exception itself so the stack trace is not lost.
		logger.error(e.getMessage(), e);
	}
}
 
Example #3
Source File: HtmlUtil.java    From ispider with Apache License 2.0 5 votes vote down vote up
/**
 * Reads an attribute from the first node selected by an XPath expression.
 *
 * @param tagNode node against which the XPath expression is evaluated
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression selecting the node
 * @return the attribute value of the first match, or {@code null} when nothing matches
 *         or the XPath expression cannot be evaluated
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
    String value = null;
    try {
        Object[] matches = tagNode.evaluateXPath(xpath);
        boolean hasMatch = matches != null && matches.length != 0;
        if (hasMatch) {
            TagNode first = (TagNode) matches[0];
            value = first.getAttributeByName(attr);
        }
    } catch (XPatherException ex) {
        ex.printStackTrace();
    }
    return value;
}
 
Example #4
Source File: HtmlUtil.java    From ispider with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the trimmed text of the first node selected by an XPath expression.
 *
 * @param tagNode node against which the XPath expression is evaluated
 * @param xpath   XPath expression selecting the node
 * @return the trimmed text of the first match, or {@code null} when nothing matches or
 *         the XPath expression cannot be evaluated
 */
public static String getTextByXpath(TagNode tagNode, String xpath) {
    try {
        Object[] matches = tagNode.evaluateXPath(xpath);
        if (matches == null || matches.length == 0) {
            return null;
        }
        TagNode first = (TagNode) matches[0];
        String text = first.getText().toString();
        return text.trim();
    } catch (XPatherException ex) {
        ex.printStackTrace();
        return null;
    }
}
 
Example #5
Source File: UtilsStaticAnalyzer.java    From apogen with Apache License 2.0 5 votes vote down vote up
/**
 * Locates the element addressed by an XPath inside an HTML document and asks
 * {@code digTheTagTreeForAString} for a representative string for it.
 *
 * @param xp  XPath of the element; it is lower-cased and a leading {@code html[1]/} step
 *            is rewritten because HtmlCleaner evaluates relative XPaths
 * @param dom HTML source to clean and search
 * @return the string produced by {@code digTheTagTreeForAString} for the first matching
 *         tag node, or an empty string when nothing matches or the XPath is invalid
 * @throws UnsupportedEncodingException declared by the original signature
 */
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException {

	// Locale.ROOT keeps the lower-casing locale independent; under e.g. a Turkish default
	// locale "I" becomes a dotless "ı" and the "html[1]/" replacement below would not match.
	xp = xp.toLowerCase(java.util.Locale.ROOT);

	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);
	props.setOmitDoctypeDeclaration(true);

	// The original re-serialised the cleaned tree back into `dom` here, but that value was
	// never read afterwards, so the dead assignment has been dropped.
	TagNode node = cleaner.clean(dom);

	// workaround: htmlcleaner works with rel xpaths
	xp = xp.replace("html[1]/", "/");
	try {
		Object[] result = node.evaluateXPath(xp);

		// Guard against a null result and against non-element matches before casting.
		if (result != null && result.length > 0 && result[0] instanceof TagNode) {
			return digTheTagTreeForAString((TagNode) result[0]);
		}

	} catch (XPatherException e) {
		e.printStackTrace();
	}

	// couldn't find a representative string :(
	return "";
}
 
Example #6
Source File: XpathSelectorTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Manual benchmark that prints the elapsed milliseconds of 2000 repetitions of:
 * HtmlCleaner parsing, HtmlCleaner XPath evaluation, Jsoup parsing, Jsoup CSS selection,
 * a second HtmlCleaner parse/XPath round, and finally a pre-compiled Xsoup XPath.
 * Ignored by default because it takes a long time.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    System.out.println(html.length());

    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tagNode = htmlCleaner.clean(html);
    Document document = Jsoup.parse(html);

    long startedAt;
    long elapsed;

    // HtmlCleaner: parse
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        htmlCleaner.clean(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    // HtmlCleaner: XPath
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        tagNode.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // Jsoup: parse
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        Jsoup.parse(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    // Jsoup: CSS select
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        document.select("a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // HtmlCleaner again: parse
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        htmlCleaner.clean(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    // HtmlCleaner again: XPath
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        tagNode.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // Xsoup: pre-compiled XPath over the Jsoup document
    XPathEvaluator compile = Xsoup.compile("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        compile.evaluate(document);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

}
 
Example #7
Source File: MagnetWServiceModelImp.java    From AndroidDownload with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads a search-result page and converts every result group into a {@code MagnetInfo}.
 *
 * <p>The page URL is built by {@code transformUrl}, fetched with Jsoup, cleaned with
 * HtmlCleaner and converted to a W3C DOM so the standard XPath API can be applied.
 * {@code group} selects the result nodes; the remaining expressions are evaluated
 * relative to each result node.
 *
 * @param rootUrl base URL passed to {@code transformDetailUrl} together with the name link's href
 * @param url     URL template passed to {@code transformUrl}
 * @param keyword search keyword passed to {@code transformUrl}
 * @param sort    sort option passed to {@code transformUrl}
 * @param page    page number passed to {@code transformUrl}
 * @param group   XPath selecting one node per search result
 * @param magnet  XPath (relative to a result) of the magnet link node
 * @param name    XPath (relative to a result) of the name node, whose {@code href} is the detail link
 * @param size    XPath (relative to a result) of the size node; optional
 * @param count   XPath (relative to a result) of the count node; optional
 * @param hot     XPath (relative to a result) of the popularity node; optional
 * @return the parsed results; results with blank text or without a magnet/name node are skipped
 * @throws IOException                  if the page cannot be fetched
 * @throws XPathExpressionException     if one of the XPath expressions is invalid
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException             declared by the original signature
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath.evaluate(..., NODE) yields null when nothing matches. Magnet and name are
        // required, so a result lacking either is skipped instead of raising an NPE.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // magnet link
        info.setMagnet(transformMagnet(magnetNote.getTextContent()));
        // name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // detail link: only set when the name node actually carries an href attribute
        Node hrefAttr = nameNote.getAttributes() == null
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // size
        Node sizeNote = (Node) xPath.evaluate(size, node, XPathConstants.NODE);
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);
            info.setSize(transformSize(sizeValue));
        }
        // count
        Node countNote = (Node) xPath.evaluate(count, node, XPathConstants.NODE);
        if (countNote != null) {
            info.setCount(countNote.getTextContent());
        }
        // popularity
        Node hotNote = (Node) xPath.evaluate(hot, node, XPathConstants.NODE);
        if (hotNote != null) {
            info.setHot(hotNote.getTextContent());
        }
        // extra derived information
        info.setResolution(transformResolution(nameValue));

        infos.add(info);
    }
    return infos;
}
 
Example #8
Source File: MagnetWServiceModelImp.java    From AndroidDownload with Apache License 2.0 4 votes vote down vote up
/**
 * Stores {@code rule} in this instance and runs the search it describes by delegating to
 * the fully parameterized {@code parser} overload.
 *
 * @param rule    rule supplying the URLs and the expressions handed to the overload
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @return whatever the delegated overload returns
 */
public List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    this.rule = rule;
    return parser(
            rule.getUrl(),
            rule.getSource(),
            keyword,
            sort,
            page,
            rule.getGroup(),
            rule.getMagnet(),
            rule.getName(),
            rule.getSize(),
            rule.getCount(),
            rule.getHot());
}
 
Example #9
Source File: MagnetWServiceModelImp.java    From AndroidDownload with Apache License 2.0 4 votes vote down vote up
/**
 * Unpacks a search bean and delegates to the rule-based {@code parser} overload; the
 * bean's page value is first passed through {@code transformPage}.
 *
 * @param bean search request carrying rule, keyword, sort option and page
 * @return whatever the delegated overload returns
 */
@Override
public List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException{
    return parser(
            bean.getRule(),
            bean.getKeyword(),
            bean.getSort(),
            transformPage(bean.getPage()));
}
 
Example #10
Source File: MagnetWServiceModelImp.java    From AndroidMagnetSearch with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads a search-result page and converts every result group into a {@code MagnetInfo}.
 *
 * <p>The page URL is built by {@code transformUrl}, fetched with Jsoup, cleaned with
 * HtmlCleaner and converted to a W3C DOM so the standard XPath API can be applied.
 * {@code group} selects the result nodes; the remaining expressions are evaluated
 * relative to each result node.
 *
 * @param rootUrl base URL passed to {@code transformDetailUrl} together with the name link's href
 * @param url     URL template passed to {@code transformUrl}
 * @param keyword search keyword passed to {@code transformUrl}
 * @param sort    sort option passed to {@code transformUrl}
 * @param page    page number passed to {@code transformUrl}
 * @param group   XPath selecting one node per search result
 * @param magnet  XPath (relative to a result) of the magnet link node
 * @param name    XPath (relative to a result) of the name node, whose {@code href} is the detail link
 * @param size    XPath (relative to a result) of the size node; optional
 * @param count   XPath (relative to a result) of the count node; optional
 * @param hot     XPath (relative to a result) of the popularity node; optional
 * @return the parsed results; results with blank text or without a magnet/name node are skipped
 * @throws IOException                  if the page cannot be fetched
 * @throws XPathExpressionException     if one of the XPath expressions is invalid
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException             declared by the original signature
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath.evaluate(..., NODE) yields null when nothing matches. Magnet and name are
        // required, so a result lacking either is skipped instead of raising an NPE.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // magnet link
        info.setMagnet(transformMagnet(magnetNote.getTextContent()));
        // name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // detail link: only set when the name node actually carries an href attribute
        Node hrefAttr = nameNote.getAttributes() == null
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // size
        Node sizeNote = (Node) xPath.evaluate(size, node, XPathConstants.NODE);
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);
            info.setSize(transformSize(sizeValue));
        }
        // count
        Node countNote = (Node) xPath.evaluate(count, node, XPathConstants.NODE);
        if (countNote != null) {
            info.setCount(countNote.getTextContent());
        }
        // popularity
        Node hotNote = (Node) xPath.evaluate(hot, node, XPathConstants.NODE);
        if (hotNote != null) {
            info.setHot(hotNote.getTextContent());
        }
        // extra derived information
        info.setResolution(transformResolution(nameValue));

        infos.add(info);
    }
    return infos;
}
 
Example #11
Source File: MagnetWServiceModelImp.java    From AndroidMagnetSearch with Apache License 2.0 4 votes vote down vote up
/**
 * Stores {@code rule} in this instance and runs the search it describes by delegating to
 * the fully parameterized {@code parser} overload.
 *
 * @param rule    rule supplying the URLs and the expressions handed to the overload
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @return whatever the delegated overload returns
 */
public List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    this.rule = rule;
    return parser(
            rule.getUrl(),
            rule.getSource(),
            keyword,
            sort,
            page,
            rule.getGroup(),
            rule.getMagnet(),
            rule.getName(),
            rule.getSize(),
            rule.getCount(),
            rule.getHot());
}
 
Example #12
Source File: MagnetWServiceModelImp.java    From AndroidMagnetSearch with Apache License 2.0 4 votes vote down vote up
/**
 * Unpacks a search bean and delegates to the rule-based {@code parser} overload; the
 * bean's page value is first passed through {@code transformPage}.
 *
 * @param bean search request carrying rule, keyword, sort option and page
 * @return whatever the delegated overload returns
 */
@Override
public List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException{
    return parser(
            bean.getRule(),
            bean.getKeyword(),
            bean.getSort(),
            transformPage(bean.getPage()));
}
 
Example #13
Source File: SNHtmlParserImpl.java    From ispider with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a Suning page: product pages are handed to {@code parserProduct}; list pages
 * contribute their product URLs and, the first time a list page is seen, the URLs of the
 * remaining list pages.
 *
 * <p>Suning's "next page" button URL appears to be loaded dynamically as well, so it cannot
 * be read from the page the way it is for JD; the list-page URLs are generated from the
 * total page count instead.
 *
 * @param page page whose content is parsed and whose URL list is extended
 */

@Override
public void parser(Page page) {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode rootNode = cleaner.clean(page.getContent());
    long start = System.currentTimeMillis();    // parse start time

    if (page.getUrl().startsWith("https://product.suning.com")) {    // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.suning.com")) {    // list page
        // product URLs on the current page
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='filter-results']/ul/li/div/div/div/div[1]/div[1]/a");
        // Defensive: tolerate a null result from the helper instead of failing in addAll.
        if (urls != null) {
            page.getUrls().addAll(urls);
        }
        // collect the URLs of all list pages (done only once)
        if (!ifGetAll) {
            Integer totalPage = null;
            try {
                // read the total page count
                Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span");
                if (objects != null && objects.length > 0 && objects[0] instanceof TagNode) {
                    String text = ((TagNode) objects[0]).getText().toString(); // "\n\n1\n/100\n"
                    // The text reads "<current>/<total>". Take the number after the slash so
                    // a two-digit current page is not mistaken for the total and single-digit
                    // or four-digit totals are read correctly.
                    Matcher afterSlash = Pattern.compile("/\\s*([0-9]+)").matcher(text);
                    if (afterSlash.find()) {
                        totalPage = Integer.valueOf(afterSlash.group(1));
                    } else {
                        // No slash in the text: keep the previous heuristic as a fallback.
                        Matcher legacy = Pattern.compile("[0-9]{2,3}").matcher(text);
                        if (legacy.find()) {
                            totalPage = Integer.valueOf(legacy.group());
                        }
                    }
                }
            } catch (XPatherException e) {
                logger.error("Failed to read the total page count from " + page.getUrl(), e);
            }
            if (totalPage != null) {
                // current page index from the URL, e.g. https://list.suning.com/0-20006-0.html
                String[] urlParts = page.getUrl().split("0-20006-");
                // A list URL of a different category has no such marker; skip generation then.
                if (urlParts.length > 1) {
                    String currentPageStr = urlParts[1].split("\\.")[0];
                    int currentPage = Integer.parseInt(currentPageStr);
                    for (int i = currentPage + 1; i < totalPage; i++) {
                        String url = "https://list.suning.com/0-20006-" + i + ".html";
                        page.getUrls().add(url);
                    }
                }
            }
            ifGetAll = true;    // mark the list pages as collected
        }
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    }
}
 
Example #14
Source File: MagnetWServiceModel.java    From AndroidMagnetSearch with Apache License 2.0 votes vote down vote up
/**
 * Fully parameterized search-and-parse operation returning a list of {@code MagnetInfo}.
 * NOTE(review): the parameter names suggest that {@code group}, {@code magnet}, {@code name},
 * {@code size}, {@code count} and {@code hot} are XPath expressions and that {@code url} is a
 * search URL template — confirm against the implementing class.
 */
List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException; 
Example #15
Source File: MagnetWServiceModel.java    From AndroidMagnetSearch with Apache License 2.0 votes vote down vote up
/**
 * Search-and-parse operation driven by a {@code MagnetRule}, returning a list of
 * {@code MagnetInfo}.
 * NOTE(review): presumably the rule supplies the site URLs and extraction expressions —
 * confirm against the implementing class.
 */
List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException; 
Example #16
Source File: MagnetWServiceModel.java    From AndroidMagnetSearch with Apache License 2.0 votes vote down vote up
/**
 * Search-and-parse operation taking all of its inputs from a {@code MagnetSearchBean} and
 * returning a list of {@code MagnetInfo}.
 * NOTE(review): which bean fields are used is not visible here — see the implementing class.
 */
List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException; 
Example #17
Source File: MagnetWServiceModel.java    From AndroidDownload with Apache License 2.0 votes vote down vote up
/**
 * Search-and-parse operation taking all of its inputs from a {@code MagnetSearchBean} and
 * returning a list of {@code MagnetInfo}.
 * NOTE(review): which bean fields are used is not visible here — see the implementing class.
 */
List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException; 
Example #18
Source File: MagnetWServiceModel.java    From AndroidDownload with Apache License 2.0 votes vote down vote up
/**
 * Search-and-parse operation driven by a {@code MagnetRule}, returning a list of
 * {@code MagnetInfo}.
 * NOTE(review): presumably the rule supplies the site URLs and extraction expressions —
 * confirm against the implementing class.
 */
List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException; 
Example #19
Source File: MagnetWServiceModel.java    From AndroidDownload with Apache License 2.0 votes vote down vote up
/**
 * Fully parameterized search-and-parse operation returning a list of {@code MagnetInfo}.
 * NOTE(review): the parameter names suggest that {@code group}, {@code magnet}, {@code name},
 * {@code size}, {@code count} and {@code hot} are XPath expressions and that {@code url} is a
 * search URL template — confirm against the implementing class.
 */
List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException;