us.codecraft.webmagic.selector.Html Java Examples

The following examples show how to use us.codecraft.webmagic.selector.Html. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlResolver.java    From blog-hunter with MIT License 6 votes vote down vote up
/**
 * Extracts the configured fields from the page and, when the model is not in
 * single mode, queues the links that match the target-link pattern.
 *
 * <p>Fields are stored when {@code model.isSingle()} is true, or when the title is
 * non-empty, is not the literal string {@code "null"}, and the request URL is not
 * contained in {@code model.getEntryUrls()}.
 *
 * <p>Note: the {@code get*Regex()} values of the model are evaluated as XPath
 * expressions here; only the target-link pattern is applied as a regex.
 *
 * @param page  page being processed
 * @param model configuration that supplies the selector expressions
 */
@Override
public void process(Page page, HunterConfig model) {
    Html pageHtml = page.getHtml();
    String title = pageHtml.xpath(model.getTitleRegex()).get();
    String source = page.getRequest().getUrl();

    // Single mode always extracts; otherwise a usable title on a non-entry URL is required.
    boolean shouldExtract = model.isSingle();
    if (!shouldExtract) {
        boolean hasUsableTitle = !StringUtils.isEmpty(title) && !"null".equals(title);
        shouldExtract = hasUsableTitle && !model.getEntryUrls().contains(source);
    }

    if (shouldExtract) {
        page.putField("title", title);
        page.putField("source", source);
        this.put(page, pageHtml, "releaseDate", model.getReleaseDateRegex());
        this.put(page, pageHtml, "author", model.getAuthorRegex());
        this.put(page, pageHtml, "content", model.getContentRegex());
        this.put(page, pageHtml, "tags", model.getTagRegex());
        this.put(page, pageHtml, "description", model.getDescriptionRegex());
        this.put(page, pageHtml, "keywords", model.getKeywordsRegex());
    }

    // Follow-up links are only queued outside single mode and when a pattern is configured.
    if (!model.isSingle() && StringUtils.isNotEmpty(model.getTargetLinksRegex())) {
        page.addTargetRequests(page.getHtml().links().regex(model.getTargetLinksRegex()).all());
    }
}
 
Example #2
Source File: CommonUtil.java    From blog-hunter with MIT License 6 votes vote down vote up
/**
 * Rewrites each {@code <img>} tag found in the given blog content into a plain
 * {@code <img src="..." title="..." alt="...">} tag whose {@code src} is the value
 * returned by {@code getRealImgUrl} for that tag.
 *
 * <p>A missing {@code title} or {@code alt} value is written as an empty attribute;
 * without this guard {@link String#format} would emit the literal text "null".
 *
 * @param html original blog content
 * @return the content with its image tags rewritten, or {@code null} when
 *         {@code html} is empty
 */
public static String formatHtml(String html) {
    if (StringUtils.isEmpty(html)) {
        return null;
    }
    String lazyloadFormat = "<img src=\"%s\" title=\"%s\" alt=\"%s\">";

    Html pageHtml = getHtml(html);
    List<Selectable> imgSelectables = pageHtml.$("img").nodes();
    for (Selectable imgSelectable : imgSelectables) {
        String oldImg = imgSelectable.get();
        String title = imgSelectable.xpath("//img/@title").get();
        String alt = imgSelectable.xpath("//img/@alt").get();
        String newImg = String.format(lazyloadFormat,
                getRealImgUrl(imgSelectable),
                title == null ? "" : title,
                alt == null ? "" : alt);
        // NOTE(review): the replacement only takes effect when the node's serialized
        // text occurs verbatim in the input; confirm this holds for the expected markup.
        html = html.replace(oldImg, newImg);
    }
    return html;
}
 
Example #3
Source File: CommonUtil.java    From blog-hunter with MIT License 6 votes vote down vote up
/**
 * Collects an {@code ImageLink} for every {@code <img>} tag in the given blog content,
 * using the URL returned by {@code getRealImgUrl} for each tag.
 *
 * @param html original blog content
 * @return the set of image links, or {@code null} when {@code html} is empty
 */
public static Set<ImageLink> getAllImageLink(String html) {
    if (StringUtils.isEmpty(html)) {
        return null;
    }
    Set<ImageLink> collected = new HashSet<>();
    for (Selectable img : getHtml(html).$("img").nodes()) {
        collected.add(new ImageLink(getRealImgUrl(img)));
    }
    return collected;
}
 
Example #4
Source File: ProcessorBenchmark.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Rough benchmark: runs the model page processor 1000 times against the same page,
 * twice in a row, and prints the elapsed milliseconds of each round.
 */
@Ignore
@Test
public void test() {
    ModelPageProcessor processor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);

    Page samplePage = new Page();
    samplePage.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
    samplePage.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
    samplePage.setHtml(new Html(html));

    // Two identical rounds, each timed and reported separately.
    for (int round = 0; round < 2; round++) {
        long startedAt = System.currentTimeMillis();
        for (int run = 0; run < 1000; run++) {
            processor.process(samplePage);
        }
        System.out.println(System.currentTimeMillis() - startedAt);
    }
}
 
Example #5
Source File: ZhihuPageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Queues the question links found on the page and stores the vote count, content and
 * author link of answers whose vote count is at least {@code voteNum}.
 *
 * <p>When several answers qualify, each one overwrites the same three fields, so the
 * last qualifying answer is the one that remains. When none qualifies the page is
 * marked as skipped.
 *
 * <p>Answers whose vote text is missing or not a plain integer are ignored instead of
 * aborting the whole page with a {@link NumberFormatException}.
 *
 * @param page page being processed
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);
    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for every selector below.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // Vote text is not a plain integer; this answer cannot be compared, skip it.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
 
Example #6
Source File: HtmlResolver.java    From blog-hunter with MIT License 5 votes vote down vote up
/**
 * Evaluates the given XPath expression against the page HTML and stores the result
 * under {@code key}. Does nothing when the expression is empty.
 *
 * <p>The {@code "tags"} key stores every match as a list; any other key stores only
 * the first match.
 *
 * @param page     page that receives the field
 * @param pageHtml parsed HTML to select from
 * @param key      result field name
 * @param regex    selector expression (applied as XPath despite its name)
 */
private void put(Page page, Html pageHtml, String key, String regex) {
    if (StringUtils.isEmpty(regex)) {
        return;
    }
    if (!key.equals("tags")) {
        page.putField(key, pageHtml.xpath(regex).get());
    } else {
        page.putField(key, pageHtml.xpath(regex).all());
    }
}
 
Example #7
Source File: AmanzonPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Prints, for every data row of the question grid, the raw row markup followed by its
 * title text, answer count and the text of its third cell.
 *
 * @param page page being processed
 */
public void process(Page page) {
    Html html = page.getHtml();
    List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();

    if (questionList == null || questionList.size() <= 1) {
        return;
    }

    // Row 0 holds the column names, so the data rows start at index 1.
    for (String row : questionList.subList(1, questionList.size())) {
        System.out.println(row);
        // Wrap the bare <tr> in a table so it can be parsed on its own.
        Html rowHtml = Html.create("<table>" + row + "</table>");
        System.out.println(rowHtml.xpath("//td[@class='title']//a/text()").toString());
        System.out.println(rowHtml.xpath("//td[@class='num']/text()").toString());
        System.out.println(rowHtml.xpath("//td[3]/text()").toString());
    }
}
 
Example #8
Source File: HtmlTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the href of the first anchor can be read both from the document and
 * from the node obtained via {@code nodes()}.
 */
@Test
public void testNthNodesGet() {
    Html document = new Html("<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>");

    String hrefFromDocument = document.xpath("//a[1]/@href").get();
    assertThat(hrefFromDocument).isEqualTo("/xx/xx");

    Selectable firstAnchor = document.xpath("//a[1]").nodes().get(0);
    String hrefFromNode = firstAnchor.xpath("/a/@href").get();
    assertThat(hrefFromNode).isEqualTo("/xx/xx");
}
 
Example #9
Source File: HtmlTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that with {@code Html.DISABLE_HTML_ENTITY_ESCAPE} set, the {@code &} in the
 * source text is matched unescaped.
 *
 * <p>The flag is a static, so its previous value is restored afterwards to keep the
 * setting from leaking into other tests.
 */
@Ignore("not work in jsoup 1.8.x")
@Test
public void testDisableJsoupHtmlEntityEscape() throws Exception {
    boolean previous = Html.DISABLE_HTML_ENTITY_ESCAPE;
    Html.DISABLE_HTML_ENTITY_ESCAPE = true;
    try {
        Html html = new Html("aaaaaaa&b");
        assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
    } finally {
        Html.DISABLE_HTML_ENTITY_ESCAPE = previous;
    }
}
 
Example #10
Source File: HtmlTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that a regex selection followed by {@code replace} rewrites every match of
 * the replacement pattern.
 */
@Test
public void testRegexSelector() {
    Html source = new Html("aaaaaaab");
    String rewritten = source.regex("(a+b)").replace("aa(a)", "$1bb").toString();
    assertThat(rewritten).isEqualTo("abbabbab");
}
 
Example #11
Source File: HtmlTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that the {@code &} in the source text is matched in its escaped form
 * ({@code &amp;}) when the text is wrapped in {@code Html} as done here.
 */
@Test
public void testEnableJsoupHtmlEntityEscape() throws Exception {
    Html escaped = new Html("aaaaaaa&b");
    String matched = escaped.regex("(aaaaaaa&amp;b)").toString();
    assertThat(matched).isEqualTo("aaaaaaa&amp;b");
}
 
Example #12
Source File: HtmlTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that {@code links()} extracts the anchor's href even when another attribute
 * of the tag contains {@code $} characters.
 */
@Test
public void testAHrefExtract() {
    Html anchor = new Html("<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>");
    Selectable links = anchor.links();
    assertThat(links.all()).contains("/xx/xx");
}
 
Example #13
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads a live page through {@code HttpClientDownloader} and checks that the
 * returned HTML has a non-empty first source text. Requires network access.
 */
@Test
public void testDownloader() {
    Html fetched = new HttpClientDownloader().download("https://www.baidu.com/");
    boolean sourceIsEmpty = fetched.getFirstSourceText().isEmpty();
    assertTrue(!sourceIsEmpty);
}
 
Example #14
Source File: CommonUtil.java    From blog-hunter with MIT License 4 votes vote down vote up
/**
 * Wraps a raw HTML string in a throwaway {@code Page} (with an empty-URL request) and
 * returns the {@code Html} that the page produces for it.
 *
 * @param html raw HTML text
 * @return the page's {@code Html} view of the text
 */
private static Html getHtml(String html) {
    Page holder = new Page();
    Request emptyRequest = new Request("");
    holder.setRequest(emptyRequest);
    holder.setRawText(html);
    return holder.getHtml();
}
 
Example #15
Source File: AbstractDownloader.java    From webmagic with Apache License 2.0 2 votes vote down vote up
/**
 * Convenience method that downloads a single URL with a default site configured
 * only with the given charset.
 *
 * @param url     address to download
 * @param charset charset passed to the site configuration
 * @return the HTML of the downloaded page
 */
public Html download(String url, String charset) {
    Request request = new Request(url);
    Site site = Site.me().setCharset(charset);
    Page downloaded = download(request, site.toTask());
    return (Html) downloaded.getHtml();
}
 
Example #16
Source File: AbstractDownloader.java    From webmagic with Apache License 2.0 2 votes vote down vote up
/**
 * Convenience method that downloads a single URL without specifying a charset.
 *
 * @param url address to download
 * @return the HTML of the downloaded page
 */
public Html download(String url) {
    // No explicit charset: delegate with null.
    final String unspecifiedCharset = null;
    return download(url, unspecifiedCharset);
}
 
Example #17
Source File: Page.java    From webmagic with Apache License 2.0 2 votes vote down vote up
/**
 * Sets the html of this page directly.
 *
 * @param html html
 * @deprecated since 0.4.0. The html is parsed only on the first call to
 * {@link #getHtml()}, so use {@link #setRawText(String)} instead.
 */
@Deprecated
public void setHtml(Html html) {
    this.html = html;
}