org.htmlcleaner.XPatherException Java Examples

The following examples show how to use org.htmlcleaner.XPatherException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: HtmlUtil.java From ispider with Apache License 2.0

6 votes

/**
 * Collects URLs from the nodes selected by an XPath expression.
 *
 * <p>For every node matched by {@code xpath}, the value of its {@code attr} attribute is
 * read and added to the result with an {@code "https:"} prefix (presumably the scraped
 * attribute values are protocol-relative, e.g. {@code //host/path} - confirm against the
 * target site). Matches that are not tag nodes, or that lack the attribute, are skipped
 * so that no {@code "https:null"} entries are produced.
 *
 * @param tagNode root node the XPath expression is evaluated against
 * @param attr    name of the attribute that holds the URL
 * @param xpath   XPath expression selecting the nodes of interest
 * @return the collected URLs; never {@code null}, and empty when nothing matched or the
 *         expression could not be evaluated
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
    List<String> urls = new ArrayList<>();
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        if (objs != null) {
            for (Object obj : objs) {
                // An XPath may also yield non-element results; only tag nodes carry attributes.
                if (!(obj instanceof TagNode)) {
                    continue;
                }
                String url = ((TagNode) obj).getAttributeByName(attr);
                if (url != null) {
                    urls.add("https:" + url);
                }
            }
        }
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    // Return the (possibly empty) list instead of null so callers can use it directly.
    return urls;
}

Example #2

Source File: UserUtil.java From BigData with GNU General Public License v3.0

6 votes

/**
 * Parses a "following" page (following and followers) together with the remaining pages
 * of its pagination.
 *
 * <p>The content of {@code followUrl} is fetched and passed to {@code extractUserUrl}.
 * The page count is then read from the second-to-last pagination button, and pages
 * {@code 2..pageCount} are fetched via {@code followUrl + "?page=" + i} and processed the
 * same way. If the pagination cannot be located or its text is not a number, only the
 * first page is processed and the problem is logged.
 *
 * @param followUrl URL of the first "following" page
 */
public static void processFollow(String followUrl) {
	String content = PageUtil.getContent(followUrl);
	HtmlCleaner htmlCleaner = new HtmlCleaner();
	TagNode tNode = htmlCleaner.clean(content);
	extractUserUrl(content);
	try {
		Object[] pageNumObj = tNode
				.evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
		// The page count lives in the second-to-last button, so at least two buttons are
		// required; a single match would otherwise index position -1.
		if (pageNumObj != null && pageNumObj.length >= 2) {
			TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
			// Trim first: text taken from markup commonly carries surrounding whitespace.
			int pagenum = Integer.parseInt(node.getText().toString().trim());
			for (int i = 2; i <= pagenum; i++) {
				String url = followUrl + "?page=" + i;
				content = PageUtil.getContent(url);
				extractUserUrl(content);
			}
		}
	} catch (XPatherException e) {
		// Pass the exception itself so the stack trace is not lost.
		logger.error(e.getMessage(), e);
	} catch (NumberFormatException e) {
		// The button text was not a page number; the first page has already been processed.
		logger.error(e.getMessage(), e);
	}
}

Example #3

Source File: HtmlUtil.java From ispider with Apache License 2.0

5 votes

/**
 * Reads an attribute value from the first node selected by an XPath expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression selecting the target node
 * @return the attribute value of the first match, or {@code null} when nothing matched
 *         or the expression could not be evaluated
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
    Object[] matches;
    try {
        matches = tagNode.evaluateXPath(xpath);
    } catch (XPatherException ex) {
        ex.printStackTrace();
        return null;
    }

    if (matches == null || matches.length == 0) {
        return null;
    }

    // Only the first match is of interest.
    TagNode firstMatch = (TagNode) matches[0];
    return firstMatch.getAttributeByName(attr);
}

Example #4

Source File: HtmlUtil.java From ispider with Apache License 2.0

5 votes

/**
 * Returns the trimmed text of the first node selected by an XPath expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param xpath   XPath expression selecting the target node
 * @return the trimmed text of the first match, or {@code null} when nothing matched or
 *         the expression could not be evaluated
 */
public static String getTextByXpath(TagNode tagNode, String xpath) {
    Object[] matches;
    try {
        matches = tagNode.evaluateXPath(xpath);
    } catch (XPatherException ex) {
        ex.printStackTrace();
        return null;
    }

    boolean nothingSelected = matches == null || matches.length == 0;
    if (nothingSelected) {
        return null;
    }

    // Only the first match is of interest.
    String rawText = ((TagNode) matches[0]).getText().toString();
    return rawText.trim();
}

Example #5

Source File: UtilsStaticAnalyzer.java From apogen with Apache License 2.0

5 votes

/**
 * Locates the element addressed by an XPath inside the given DOM and derives a name
 * from it via {@code digTheTagTreeForAString}.
 *
 * @param xp  XPath of the element; it is lower-cased and a leading {@code html[1]/}
 *            step is rewritten before evaluation
 * @param dom HTML source that is cleaned with HtmlCleaner before the lookup
 * @return the string produced by {@code digTheTagTreeForAString} for the first matching
 *         tag node, or an empty string when nothing suitable was found or the XPath
 *         could not be evaluated
 * @throws UnsupportedEncodingException declared by the original signature; presumably
 *         raised by {@code digTheTagTreeForAString} - confirm
 */
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException {

	// Locale.ROOT keeps the result stable under locales with special casing rules
	// (for example Turkish, where "I" would otherwise become a dotless "i").
	xp = xp.toLowerCase(java.util.Locale.ROOT);

	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);
	props.setOmitDoctypeDeclaration(true);

	TagNode node = cleaner.clean(dom);

	// workaround: htmlcleaner works with rel xpaths
	xp = xp.replace("html[1]/", "/");
	try {
		Object[] result = node.evaluateXPath(xp);

		// Guard against an empty result and against matches that are not elements.
		if (result != null && result.length > 0 && result[0] instanceof TagNode) {
			return digTheTagTreeForAString((TagNode) result[0]);
		}

	} catch (XPatherException e) {
		e.printStackTrace();
	}

	// couldn't find a representative string :(

	return "";
}

Example #6

Source File: XpathSelectorTest.java From webmagic with Apache License 2.0

4 votes

/**
 * Rough timing comparison: parses {@code html} and selects its anchors 2000 times each
 * with HtmlCleaner, Jsoup, HtmlCleaner again and finally a compiled Xsoup expression,
 * printing the elapsed milliseconds of every loop to standard output.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    final int rounds = 2000;
    long startedAt;

    System.out.println(html.length());

    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode cleanedRoot = cleaner.clean(html);
    Document jsoupDocument = Jsoup.parse(html);

    // HtmlCleaner: parsing
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // HtmlCleaner: XPath selection
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    System.out.println("=============");

    // Jsoup: parsing
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        Jsoup.parse(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // Jsoup: selector
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        jsoupDocument.select("a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    System.out.println("=============");

    // HtmlCleaner once more: parsing
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // HtmlCleaner once more: XPath selection
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    System.out.println("=============");

    // Xsoup: expression compiled once, evaluated repeatedly
    XPathEvaluator anchorEvaluator = Xsoup.compile("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        anchorEvaluator.evaluate(jsoupDocument);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

}

Example #7

Source File: MagnetWServiceModelImp.java From AndroidDownload with Apache License 2.0

4 votes

/**
 * Downloads a search-result page and extracts one {@code MagnetInfo} per result entry.
 *
 * <p>The request URL is built by {@code transformUrl}, the page body is fetched with
 * Jsoup, cleaned with HtmlCleaner and converted to a W3C DOM. {@code group} is evaluated
 * against the document as a node set; the remaining XPath expressions are evaluated
 * relative to each node of that set. Entries whose text is blank, or that have no magnet
 * or name node, are skipped.
 *
 * @param rootUrl value passed to {@code transformDetailUrl} together with the name
 *                node's {@code href}
 * @param url     value passed to {@code transformUrl} to build the request URL
 * @param keyword search keyword, passed to {@code transformUrl}
 * @param sort    sort option, passed to {@code transformUrl}
 * @param page    page number, passed to {@code transformUrl}
 * @param group   XPath selecting the result entries
 * @param magnet  XPath (relative to an entry) of the node holding the magnet link
 * @param name    XPath (relative to an entry) of the node holding the name and
 *                {@code href}
 * @param size    XPath (relative to an entry) of the size node; optional
 * @param count   XPath (relative to an entry) of the count node; optional
 * @param hot     XPath (relative to an entry) of the popularity node; optional
 * @return the parsed entries, possibly empty
 * @throws IOException                  if the page cannot be fetched
 * @throws XPathExpressionException     if one of the XPath expressions cannot be evaluated
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException             declared by the original signature
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();


    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath evaluation with XPathConstants.NODE yields null when nothing matches.
        // Magnet link and name are both required, so an entry lacking either is skipped
        // instead of failing the whole page with a NullPointerException.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));
        // name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // detail URL: only set when the name node actually carries an href attribute
        Node hrefAttr = nameNote.getAttributes() == null
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // size
        Node sizeNote = ((Node) xPath.evaluate(size, node, XPathConstants.NODE));
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);

            info.setSize(transformSize(sizeValue));
        }
        // count (labelled "time" in the original comment)
        Node dateNote=((Node) xPath.evaluate(count, node, XPathConstants.NODE));
        if(dateNote!=null){
            String countValue = dateNote.getTextContent();
            info.setCount(countValue);
        }
        Node hotNote=((Node) xPath.evaluate(hot, node, XPathConstants.NODE));
        if(hotNote!=null){
            String hotValue = hotNote.getTextContent();
            info.setHot(hotValue);
        }
        // additional derived information
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}

Example #8

Source File: MagnetWServiceModelImp.java From AndroidDownload with Apache License 2.0

4 votes

/**
 * Runs a search with the settings carried by {@code rule}.
 *
 * <p>Stores {@code rule} in this instance's {@code rule} field and then delegates to the
 * eleven-argument {@code parser} overload, passing the rule's URL, source, group, magnet,
 * name, size, count and hot values.
 *
 * @param rule    rule supplying the URLs and XPath expressions
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @return the result of the delegated call
 */
public List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    this.rule = rule;

    String rootUrl = rule.getUrl();
    String sourceUrl = rule.getSource();
    String groupPath = rule.getGroup();
    String magnetPath = rule.getMagnet();
    String namePath = rule.getName();
    String sizePath = rule.getSize();
    String countPath = rule.getCount();
    String hotPath = rule.getHot();

    return parser(rootUrl, sourceUrl, keyword, sort, page,
            groupPath, magnetPath, namePath, sizePath, countPath, hotPath);
}

Example #9

Source File: MagnetWServiceModelImp.java From AndroidDownload with Apache License 2.0

4 votes

/**
 * Runs the search described by {@code bean}.
 *
 * <p>Delegates to the four-argument {@code parser} overload with the bean's rule,
 * keyword and sort values, and with its page value converted by {@code transformPage}.
 *
 * @param bean search request
 * @return the result of the delegated call
 */
@Override
public List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException{
    MagnetRule searchRule = bean.getRule();
    String searchKeyword = bean.getKeyword();
    String sortOption = bean.getSort();
    int pageNumber = transformPage(bean.getPage());

    return parser(searchRule, searchKeyword, sortOption, pageNumber);
}

Example #10

Source File: MagnetWServiceModelImp.java From AndroidMagnetSearch with Apache License 2.0

4 votes

/**
 * Downloads a search-result page and extracts one {@code MagnetInfo} per result entry.
 *
 * <p>The request URL is built by {@code transformUrl}, the page body is fetched with
 * Jsoup, cleaned with HtmlCleaner and converted to a W3C DOM. {@code group} is evaluated
 * against the document as a node set; the remaining XPath expressions are evaluated
 * relative to each node of that set. Entries whose text is blank, or that have no magnet
 * or name node, are skipped.
 *
 * @param rootUrl value passed to {@code transformDetailUrl} together with the name
 *                node's {@code href}
 * @param url     value passed to {@code transformUrl} to build the request URL
 * @param keyword search keyword, passed to {@code transformUrl}
 * @param sort    sort option, passed to {@code transformUrl}
 * @param page    page number, passed to {@code transformUrl}
 * @param group   XPath selecting the result entries
 * @param magnet  XPath (relative to an entry) of the node holding the magnet link
 * @param name    XPath (relative to an entry) of the node holding the name and
 *                {@code href}
 * @param size    XPath (relative to an entry) of the size node; optional
 * @param count   XPath (relative to an entry) of the count node; optional
 * @param hot     XPath (relative to an entry) of the popularity node; optional
 * @return the parsed entries, possibly empty
 * @throws IOException                  if the page cannot be fetched
 * @throws XPathExpressionException     if one of the XPath expressions cannot be evaluated
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException             declared by the original signature
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();


    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath evaluation with XPathConstants.NODE yields null when nothing matches.
        // Magnet link and name are both required, so an entry lacking either is skipped
        // instead of failing the whole page with a NullPointerException.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));
        // name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // detail URL: only set when the name node actually carries an href attribute
        Node hrefAttr = nameNote.getAttributes() == null
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // size
        Node sizeNote = ((Node) xPath.evaluate(size, node, XPathConstants.NODE));
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);

            info.setSize(transformSize(sizeValue));
        }
        // count (labelled "time" in the original comment)
        Node dateNote=((Node) xPath.evaluate(count, node, XPathConstants.NODE));
        if(dateNote!=null){
            String countValue = dateNote.getTextContent();
            info.setCount(countValue);
        }
        Node hotNote=((Node) xPath.evaluate(hot, node, XPathConstants.NODE));
        if(hotNote!=null){
            String hotValue = hotNote.getTextContent();
            info.setHot(hotValue);
        }
        // additional derived information
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}

Example #11

Source File: MagnetWServiceModelImp.java From AndroidMagnetSearch with Apache License 2.0

4 votes

/**
 * Runs a search with the settings carried by {@code rule}.
 *
 * <p>Stores {@code rule} in this instance's {@code rule} field and then delegates to the
 * eleven-argument {@code parser} overload, passing the rule's URL, source, group, magnet,
 * name, size, count and hot values.
 *
 * @param rule    rule supplying the URLs and XPath expressions
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @return the result of the delegated call
 */
public List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    this.rule = rule;

    String rootUrl = rule.getUrl();
    String sourceUrl = rule.getSource();
    String groupPath = rule.getGroup();
    String magnetPath = rule.getMagnet();
    String namePath = rule.getName();
    String sizePath = rule.getSize();
    String countPath = rule.getCount();
    String hotPath = rule.getHot();

    return parser(rootUrl, sourceUrl, keyword, sort, page,
            groupPath, magnetPath, namePath, sizePath, countPath, hotPath);
}

Example #12

Source File: MagnetWServiceModelImp.java From AndroidMagnetSearch with Apache License 2.0

4 votes

/**
 * Runs the search described by {@code bean}.
 *
 * <p>Delegates to the four-argument {@code parser} overload with the bean's rule,
 * keyword and sort values, and with its page value converted by {@code transformPage}.
 *
 * @param bean search request
 * @return the result of the delegated call
 */
@Override
public List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException{
    MagnetRule searchRule = bean.getRule();
    String searchKeyword = bean.getKeyword();
    String sortOption = bean.getSort();
    int pageNumber = transformPage(bean.getPage());

    return parser(searchRule, searchKeyword, sortOption, pageNumber);
}

Example #13

Source File: SNHtmlParserImpl.java From ispider with Apache License 2.0

4 votes

/**
 * Parses a Suning page: product pages are handed to {@code parserProduct}; for list
 * pages the product URLs are collected and, once per parser instance state
 * ({@code ifGetAll}), the URLs of the following list pages are queued as well.
 *
 * <p>The URL behind Suning's "next page" button appears to be loaded dynamically, so it
 * cannot be read from the page the way it is done for JD; the list URLs are instead
 * generated from the total page count shown on the page.
 *
 * @param page page whose content is parsed and whose URL list is extended
 */
@Override
public void parser(Page page) {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode rootNode = cleaner.clean(page.getContent());
    long start = System.currentTimeMillis();    // parse start time

    if (page.getUrl().startsWith("https://product.suning.com")) {    // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.suning.com")) {    // list page
        // product URLs found on the current page
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='filter-results']/ul/li/div/div/div/div[1]/div[1]/a");
        // Guard against a null result from the helper before adding.
        if (urls != null) {
            page.getUrls().addAll(urls);
        }
        // queue the URLs of all list pages
        if (!ifGetAll) {
            Integer totalPage = null;
            try {
                // read the total page count
                Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span");
                if (objects != null && objects.length > 0 && objects[0] instanceof TagNode) {
                    String text = ((TagNode) objects[0]).getText().toString(); // "\n\n1\n/100\n"
                    // The total is the number after the slash. Matching the first 2-3 digit
                    // run would pick the current page once it reaches two digits and would
                    // miss totals below 10.
                    Matcher matcher = Pattern.compile("/\\s*([0-9]{1,9})").matcher(text);
                    if (matcher.find()) {
                        totalPage = Integer.valueOf(matcher.group(1));
                    } else {
                        // Fall back to the previous heuristic when no slash is present.
                        matcher = Pattern.compile("[0-9]{2,3}").matcher(text);
                        if (matcher.find()) {
                            totalPage = Integer.valueOf(matcher.group());
                        }
                    }
                }
            } catch (XPatherException e) {
                e.printStackTrace();
            }
            if (totalPage != null) {
                // extract the current page index from the URL
                String[] urlParts = page.getUrl().split("0-20006-");    // url: https://list.suning.com/0-20006-0.html
                if (urlParts.length > 1) {
                    String currentPageStr = urlParts[1].split("\\.")[0];
                    int currentPage = Integer.valueOf(currentPageStr);
                    for (int i = currentPage + 1; i < totalPage; i++) {
                        String url = "https://list.suning.com/0-20006-" + i + ".html";
                        page.getUrls().add(url);
                    }
                }
            }
            ifGetAll = true;    // mark the list pages as collected once the list has been parsed
        }
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    }
}

Example #14

Source File: MagnetWServiceModel.java From AndroidMagnetSearch with Apache License 2.0

votes

/**
 * Parses search results into a list of {@code MagnetInfo}.
 *
 * <p>NOTE(review): only the signature is visible here. The parameter descriptions below
 * are assumptions derived from the parameter names and should be confirmed against the
 * implementing class.
 *
 * @param rootUrl presumably the base URL of the searched site
 * @param url     presumably the search URL (or URL template)
 * @param keyword presumably the search keyword
 * @param sort    presumably the sort option
 * @param page    presumably the result page number
 * @param group   presumably an XPath selecting each result entry
 * @param magnet  presumably an XPath for the magnet link of an entry
 * @param name    presumably an XPath for the name of an entry
 * @param size    presumably an XPath for the size of an entry
 * @param count   presumably an XPath for the count of an entry
 * @param hot     presumably an XPath for the popularity of an entry
 * @return the parsed results
 * @throws IOException                  declared; conditions depend on the implementation
 * @throws XPathExpressionException     declared; conditions depend on the implementation
 * @throws ParserConfigurationException declared; conditions depend on the implementation
 * @throws XPatherException             declared; conditions depend on the implementation
 */
List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException;

Example #15

Source File: MagnetWServiceModel.java From AndroidMagnetSearch with Apache License 2.0

votes

/**
 * Parses search results into a list of {@code MagnetInfo}, using the given rule.
 *
 * <p>NOTE(review): only the signature is visible here. The parameter descriptions below
 * are assumptions derived from the parameter names and should be confirmed against the
 * implementing class.
 *
 * @param rule    presumably the site-specific rule describing where and how to search
 * @param keyword presumably the search keyword
 * @param sort    presumably the sort option
 * @param page    presumably the result page number
 * @return the parsed results
 * @throws IOException                  declared; conditions depend on the implementation
 * @throws XPathExpressionException     declared; conditions depend on the implementation
 * @throws ParserConfigurationException declared; conditions depend on the implementation
 * @throws XPatherException             declared; conditions depend on the implementation
 */
List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException;

Example #16

Source File: MagnetWServiceModel.java From AndroidMagnetSearch with Apache License 2.0

votes

/**
 * Parses search results into a list of {@code MagnetInfo} for the given search request.
 *
 * <p>NOTE(review): only the signature is visible here. What the bean carries should be
 * confirmed against {@code MagnetSearchBean} and the implementing class.
 *
 * @param bean the search request
 * @return the parsed results
 * @throws IOException                  declared; conditions depend on the implementation
 * @throws XPathExpressionException     declared; conditions depend on the implementation
 * @throws ParserConfigurationException declared; conditions depend on the implementation
 * @throws XPatherException             declared; conditions depend on the implementation
 */
List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException;

Example #17

Source File: MagnetWServiceModel.java From AndroidDownload with Apache License 2.0

votes

/**
 * Parses search results into a list of {@code MagnetInfo} for the given search request.
 *
 * <p>NOTE(review): only the signature is visible here. What the bean carries should be
 * confirmed against {@code MagnetSearchBean} and the implementing class.
 *
 * @param bean the search request
 * @return the parsed results
 * @throws IOException                  declared; conditions depend on the implementation
 * @throws XPathExpressionException     declared; conditions depend on the implementation
 * @throws ParserConfigurationException declared; conditions depend on the implementation
 * @throws XPatherException             declared; conditions depend on the implementation
 */
List<MagnetInfo> parser(MagnetSearchBean bean) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException;

Example #18

Source File: MagnetWServiceModel.java From AndroidDownload with Apache License 2.0

votes

/**
 * Parses search results into a list of {@code MagnetInfo}, using the given rule.
 *
 * <p>NOTE(review): only the signature is visible here. The parameter descriptions below
 * are assumptions derived from the parameter names and should be confirmed against the
 * implementing class.
 *
 * @param rule    presumably the site-specific rule describing where and how to search
 * @param keyword presumably the search keyword
 * @param sort    presumably the sort option
 * @param page    presumably the result page number
 * @return the parsed results
 * @throws IOException                  declared; conditions depend on the implementation
 * @throws XPathExpressionException     declared; conditions depend on the implementation
 * @throws ParserConfigurationException declared; conditions depend on the implementation
 * @throws XPatherException             declared; conditions depend on the implementation
 */
List<MagnetInfo> parser(MagnetRule rule, String keyword,String sort, int page) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException;

Example #19

Source File: MagnetWServiceModel.java From AndroidDownload with Apache License 2.0

votes

/**
 * Parses search results into a list of {@code MagnetInfo}.
 *
 * <p>NOTE(review): only the signature is visible here. The parameter descriptions below
 * are assumptions derived from the parameter names and should be confirmed against the
 * implementing class.
 *
 * @param rootUrl presumably the base URL of the searched site
 * @param url     presumably the search URL (or URL template)
 * @param keyword presumably the search keyword
 * @param sort    presumably the sort option
 * @param page    presumably the result page number
 * @param group   presumably an XPath selecting each result entry
 * @param magnet  presumably an XPath for the magnet link of an entry
 * @param name    presumably an XPath for the name of an entry
 * @param size    presumably an XPath for the size of an entry
 * @param count   presumably an XPath for the count of an entry
 * @param hot     presumably an XPath for the popularity of an entry
 * @return the parsed results
 * @throws IOException                  declared; conditions depend on the implementation
 * @throws XPathExpressionException     declared; conditions depend on the implementation
 * @throws ParserConfigurationException declared; conditions depend on the implementation
 * @throws XPatherException             declared; conditions depend on the implementation
 */
List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException;