Java Code Examples for us.codecraft.webmagic.Page#setSkip()

The following examples show how to use us.codecraft.webmagic.Page#setSkip(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BlogProcesser.java    From mogu_blog_v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the blog title and content from the page and stores them under the
 * {@code "title"} and {@code "content"} fields. When no title can be extracted,
 * nothing is stored and {@code page.setSkip(true)} is called instead.
 *
 * @param page the downloaded page to extract from
 */
private void saveBlogInfo(Page page) {
    // Step 2: pull out the pieces we need, the title and the content.
    final String articleTitle = page.getHtml()
            .xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()")
            .toString();
    final String articleBody = page.getHtml()
            .xpath("//*[@id=\"article_content\"]")
            .toString();

    if (articleTitle == null) {
        // Skip crawling this page.
        page.setSkip(true);
        return;
    }

    page.putField("title", articleTitle);
    page.putField("content", articleBody);
}
 
Example 2
Source File: ZhihuPageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Queues the question links found on the page, then records the vote, content and
 * user id of answers whose vote count is at least {@code voteNum}. Calls
 * {@code page.setSkip(true)} when no answer qualifies.
 *
 * <p>Answers whose vote text is missing or is not a plain integer are ignored
 * rather than aborting the whole page with a {@link NumberFormatException}.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);
    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for every extraction below.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            // No vote counter in this fragment; it cannot qualify.
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote);
        } catch (NumberFormatException ignored) {
            // Vote text is not a plain integer; treat the answer as non-qualifying.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
 
Example 3
Source File: ConfigurablePageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Applies every configured extract rule to the page. A rule's result is stored in
 * the page's result items under the rule's field name, unless the rule is flagged
 * not-null and produced nothing, in which case {@code page.setSkip(true)} is called.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        if (rule.isMulti()) {
            // Multi-valued rule: an empty list counts as "nothing extracted".
            final List<String> matches = page.getHtml().selectDocumentForList(rule.getSelector());
            if (!rule.isNotNull() || !matches.isEmpty()) {
                page.getResultItems().put(rule.getFieldName(), matches);
            } else {
                page.setSkip(true);
            }
            continue;
        }

        // Single-valued rule: null counts as "nothing extracted".
        final String match = page.getHtml().selectDocument(rule.getSelector());
        if (!rule.isNotNull() || match != null) {
            page.getResultItems().put(rule.getFieldName(), match);
        } else {
            page.setSkip(true);
        }
    }
}
 
Example 4
Source File: GithubRepoPageProcessor.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
/**
 * Core crawler callback where the extraction logic lives: stores the author, name
 * and readme of the repository page, calls {@code page.setSkip(true)} when no name
 * was extracted, and queues the repository links found on the page.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    // Part 2: define how to extract information from the page and store it.
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    page.putField("author", author);
    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);

    if (page.getResultItems().get("name") == null) {
        // skip this page
        page.setSkip(true);
    }
    // The readme selection is stored directly; the former unused local copy of it
    // (a second evaluation of the same XPath) has been removed.
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    // Part 3: discover follow-up URLs on the page to crawl next.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
}
 
Example 5
Source File: GithubRepoPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues repository and owner links found on the page, stores the author, name and
 * readme fields, and calls {@code page.setSkip(true)} when no name was extracted.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
    // The owner pattern needs the '+' quantifier: without it the capture group stops
    // after a single character and truncated URLs are queued.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
    page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        // skip this page
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}
 
Example 6
Source File: ZhihuPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues answer links found on the page, stores the title, question and answer
 * fields, and calls {@code page.setSkip(true)} when the stored title is null.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml()
                    .links()
                    .regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*")
                    .all());

    final String titleText = page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString();
    page.putField("title", titleText);

    final String questionText = page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString();
    page.putField("question", questionText);

    final String answerText = page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString();
    page.putField("answer", answerText);

    // A page without a title is not worth keeping.
    final boolean titleMissing = page.getResultItems().get("title") == null;
    if (titleMissing) {
        page.setSkip(true);
    }
}
 
Example 7
Source File: MamacnPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a text summary ("img:" / "title:" lines) from the thumbnail list items,
 * stores it under the empty field key, calls {@code page.setSkip(true)} when the
 * summary is empty, and queues the photo page links found on the page.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    final StringBuilder summary = new StringBuilder();
    for (Selectable item : page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes()) {
        // Two lines per list item: the link target, then the image alt text.
        summary.append("img:").append(item.xpath("//a/@href").get()).append("\n");
        summary.append("title:").append(item.xpath("//img/@alt").get()).append("\n");
    }

    final String summaryText = summary.toString();
    page.putField("", summaryText);
    if (summaryText.isEmpty()) {
        page.setSkip(true);
    }

    page.addTargetRequests(
            page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
}
 
Example 8
Source File: GithubRepoPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues repository and owner links found on the page, fills a {@code GithubRepo}
 * from the page, and stores it under {@code "repo"}; when the repo has no name,
 * {@code page.setSkip(true)} is called instead.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    final GithubRepo repo = new GithubRepo();
    repo.setAuthor(
            page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    repo.setName(
            page.getHtml()
                    .xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()")
                    .toString());
    repo.setReadme(
            page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());

    if (repo.getName() != null) {
        page.putField("repo", repo);
        return;
    }
    // skip this page
    page.setSkip(true);
}
 
Example 9
Source File: AlexanderMcqueenGoodsProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Processes a goods page or a listing page.
 *
 * <p>When the page URL matches {@code URL_POST}, the goods fields (name, currency,
 * price, description, material, code, size, colors) are stored on the page, and
 * {@code page.setSkip(true)} is called when the goods-name selection matches
 * nothing. Otherwise the links matching {@code URL_POST} and {@code URL_LIST} are
 * queued with the numeric arguments 1000 and 1 respectively.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    if (page.getUrl().regex(URL_POST).match()) {
        String goodsNameXpath = "//div[@id='description']/h1/tidyText()";
        page.putField("goodsName", page.getHtml().xpath(goodsNameXpath));
        // The stored field is the object returned by xpath(...), not the extracted
        // text, so comparing the stored field with null does not reveal a missing
        // name. Ask the selection itself whether it matched anything.
        if (!page.getHtml().xpath(goodsNameXpath).match()) {
            page.setSkip(true);
        }
        page.putField("currency", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='currency']/tidyText()"));
        page.putField("goodsPrice", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='priceValue']/tidyText()"));
        page.putField("description", page.getHtml()
                .xpath("//div[@id='tabbedDescription']//div[@class='tabbedDescription']//ul[@id='tabs']//li[@id='tab_description']/div[@id='description_pane']/tidyText()"));
        page.putField("material", page.getHtml()
                .xpath("//div[@id='tabbedDescription']" +
                        "//div[@class='tabbedDescription']" +
                        "//ul[@id='tabs']" +
                        "//li[@id='tab_description']" +
                        "//div[@class='productProperty']" +
                        "//div[@class='productPropertyRow']/span[2]/tidyText()"));
        page.putField("goodsCode", page.getHtml()
                .xpath("//div[@id='tabbedDescription']" +
                        "//div[@class='tabbedDescription']" +
                        "//ul[@id='tabs']" +
                        "//li[@id='tab_description']" +
                        "//div[@class='productProperty']" +
                        "//div[@class='productPropertyRow']//span[@id='modelFabricColorContainer']/tidyText()"));
        page.putField("goodsSize", page.getHtml()
                .xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
        page.putField("goodsColors", page.getHtml()
                .xpath("//div[@id='colors']/ul/html()"));
    } else {
        // Not a goods page: queue goods links and listing links found on it.
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
    }
}
 
Example 10
Source File: GithubRepoPageMapper.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues repository and owner links found on the page, then maps the page to a
 * {@code GithubRepo} via {@code githubRepoPageMapper}. A non-null result is stored
 * under {@code "repo"}; otherwise {@code page.setSkip(true)} is called.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    final GithubRepo mappedRepo = githubRepoPageMapper.get(page);
    if (mappedRepo != null) {
        page.putField("repo", mappedRepo);
        return;
    }
    page.setSkip(true);
}