Java Code Examples for us.codecraft.webmagic.Page#addTargetRequests()

The following examples show how to use us.codecraft.webmagic.Page#addTargetRequests(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: HtmlResolver.java    From blog-hunter with MIT License 6 votes vote down vote up
/**
 * Extracts the configured fields from the page and, unless the configuration is in
 * single-page mode, queues the links that match the configured target-link expression.
 *
 * <p>Fields are stored when the configuration is single-page, or when the title is
 * non-empty, is not the literal string {@code "null"}, and the page URL is not one of the
 * configured entry URLs.
 *
 * @param page  the page being processed
 * @param model the configuration supplying the extraction expressions
 */
@Override
public void process(Page page, HunterConfig model) {
    Html pageHtml = page.getHtml();
    String title = pageHtml.xpath(model.getTitleRegex()).get();
    String source = page.getRequest().getUrl();

    boolean storeFields = model.isSingle()
            || !(StringUtils.isEmpty(title)
                    || "null".equals(title)
                    || model.getEntryUrls().contains(source));
    if (storeFields) {
        page.putField("title", title);
        page.putField("source", source);
        put(page, pageHtml, "releaseDate", model.getReleaseDateRegex());
        put(page, pageHtml, "author", model.getAuthorRegex());
        put(page, pageHtml, "content", model.getContentRegex());
        put(page, pageHtml, "tags", model.getTagRegex());
        put(page, pageHtml, "description", model.getDescriptionRegex());
        put(page, pageHtml, "keywords", model.getKeywordsRegex());
    }

    // Single-page mode never follows links.
    if (model.isSingle()) {
        return;
    }
    String targetLinksRegex = model.getTargetLinksRegex();
    if (StringUtils.isNotEmpty(targetLinksRegex)) {
        page.addTargetRequests(page.getHtml().links().regex(targetLinksRegex).all());
    }
}
 
Example 2
Source File: ContentImageProcessor.java    From javabase with Apache License 2.0 6 votes vote down vote up
/**
 * Gathers the converted image URLs of the page and either stores them as JSON in
 * {@code map} or publishes a "no image" message; on the first invocation it also queues
 * one request per entry of {@code pageNumberList}.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    List<String> rawImageUrls = page.getHtml().$(".BDE_Image", "src").all();
    String pageId = page.getUrl().toString().replace(tieBaConfiguration.getTiebaContentPageUrl(), "");

    List<String> convertedUrls = new ArrayList<>();
    for (String rawImageUrl : rawImageUrls) {
        if (!rawImageUrl.startsWith(tieBaConfiguration.getTiebaImageUrl())) {
            continue;
        }
        String convertedUrl = convertImageUrl(rawImageUrl);
        if (convertedUrl != null) {
            convertedUrls.add(convertedUrl);
        }
    }

    if (convertedUrls.isEmpty()) {
        // Nothing usable on this page: notify listeners through the "no image" topic.
        redisTemplate.convertAndSend(tieBaConfiguration.getTiebaContentNoImageIdTopic(), JSONObject.toJSONString(new ContentBean(pageId,tiebaName)));
    } else {
        map.put(WebmagicService.getByte(TieBaImageIdMessageListener.TIEBA_CONTENT_IMAGE_KEY+pageId), WebmagicService.getByte(JSONObject.toJSONString(convertedUrls)));
    }

    // Follow-up requests are queued only once per processor instance.
    if (isAddTarget) {
        return;
    }
    for (String id : pageNumberList) {
        String target = new StringBuilder().append(url).append(id).toString();
        page.addTargetRequests(Arrays.asList(target));
    }
    isAddTarget = true;
}
 
Example 3
Source File: ZhihuPageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Queues the question links found on the page and stores the answers whose vote count is
 * at least {@code voteNum}; the page is skipped when no answer qualifies.
 *
 * <p>An answer whose vote count is missing or not a plain integer is treated as not
 * qualifying instead of aborting the whole page with an exception.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for every extraction below.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (!reachesVoteThreshold(vote)) {
            continue;
        }
        page.putField("vote", vote);
        page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
        page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
        exist = true;
    }
    if (!exist) {
        page.setSkip(true);
    }
}

/**
 * Tells whether the extracted vote text is an integer that is at least {@code voteNum}.
 *
 * @param vote the raw vote text, possibly {@code null}
 * @return {@code true} only when {@code vote} parses as an integer {@code >= voteNum}
 */
private boolean reachesVoteThreshold(String vote) {
    if (vote == null) {
        return false;
    }
    try {
        return Integer.parseInt(vote.trim()) >= voteNum;
    } catch (NumberFormatException ignored) {
        // Text that is not a plain integer cannot be compared; count it as below the threshold.
        return false;
    }
}
 
Example 4
Source File: ContentIdProcessor.java    From javabase with Apache License 2.0 6 votes vote down vote up
/**
 * Adds one {@code ContentBean} to {@code pageNumberList} for every thread row whose
 * {@code data-field} JSON contains an {@code id}, and on the first invocation queues the
 * list pages 2..{@code endNum}.
 *
 * <p>Rows without a {@code data-field} attribute, or whose JSON yields no object, are skipped.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    for (int i = 1; i <= pageSize; i++) {
        String json = page.getHtml().xpath("//ul[@id='thread_list']/li[@class='j_thread_list clearfix'][" + (i) + "]/@data-field").toString();
        if (json == null) {
            continue;
        }
        // Parse once and reuse the result for both the key check and the field reads.
        JSONObject jsonObject = JSONObject.parseObject(json);
        if (jsonObject == null || !jsonObject.containsKey("id")) {
            continue;
        }
        String pageId = jsonObject.getString("id");
        String authorName = jsonObject.getString("author_name");
        String date = praseDate(page, i);
        String title = page.getHtml().xpath("a[@href='" + tieBaConfiguration.getTiebaContentPageUrl() + pageId + "']/@title").toString();

        pageNumberList.add(new ContentBean(pageId, date, tiebaName, authorName, title));
    }

    // Follow-up list pages are queued only once per processor instance.
    if (!isAddTarget) {
        for (int i = 2; i <= endNum; i++) {
            StringBuilder sb = new StringBuilder();
            sb.append(tiebaUrl).append("&pn=").append(i * pageSize);
            page.addTargetRequests(Arrays.asList(sb.toString()));
        }
        isAddTarget = true;
    }
}
 
Example 5
Source File: GovNewsPageProcesser.java    From elasticsearch-jest-example with MIT License 5 votes vote down vote up
/**
 * Handles both page kinds: list pages contribute further requests, every other page is
 * treated as a detail page whose fields are extracted, turned into an article and indexed.
 *
 * @param page the page being processed
 */
public void process(Page page) {
    // List page: queue the detail-page links first, then the further list-page links.
    if (page.getUrl().regex(URL_LIST).match()) {
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all());
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
        return;
    }

    // Detail page: extract each field and store it in the result items.
    String rawTitle = page.getHtml().xpath("//div[@class='pages-title']").toString();
    page.putField("title", Utils.replaceHTML(rawTitle));

    String rawContent = page.getHtml().xpath("//div[@class='article-colum']/div[@class='pages_content']/table[@id='printContent']/tbody/tr/td").toString();
    page.putField("content", Utils.replaceHTML(rawContent));

    String rawSource = page.getHtml().xpath("//div[@class='article-colum']/div[@class='pages-date']/span[@class='font'][2]").toString().replace("来源: ", "");
    page.putField("source", Utils.replaceHTML(rawSource));

    String rawAuthor = page.getHtml().xpath("//div[@class='article-colum']/div[@class='pages_content']/div[@class='editor']").toString().replace("责任编辑: ", "");
    page.putField("author", Utils.replaceHTML(rawAuthor));

    String rawCreate = page.getHtml().xpath("//div[@class='article-colum']/div[@class='pages-date']").toString();
    page.putField("create", Utils.replaceHTML(rawCreate));

    page.putField("url", page.getUrl().get());

    // Read the stored values back to build the article.
    String title = (String) page.getResultItems().get("title");
    String content = (String) page.getResultItems().get("content");
    String create = (String) page.getResultItems().get("create");
    String source = (String) page.getResultItems().get("source");
    String url = (String) page.getResultItems().get("url");
    String author = (String) page.getResultItems().get("author");

    // Create the article and index it.
    Utils.index(Utils.createArticle(title, content, source, author, url, create));
}
 
Example 6
Source File: HuxiuProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues every link matching {@code .*article.*} and stores the page's title and content.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    final String articleLinkPattern = ".*article.*";
    page.addTargetRequests(page.getHtml().links().regex(articleLinkPattern).all());

    final String titlePath = "//div[@class='clearfix neirong']//h1/text()";
    page.putField("title", page.getHtml().xpath(titlePath));

    final String contentPath = "//div[@id='neirong_box']/tidyText()";
    page.putField("content", page.getHtml().xpath(contentPath));
}
 
Example 7
Source File: NjuBBSProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the {@code bbstcon?board=Pictures&file=...} targets captured from anchor tags and
 * stores the page's title and smart-extracted content.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    final String postHrefPattern = "<a[^<>]*href=(bbstcon\\?board=Pictures&file=[^>]*)";
    page.addTargetRequests(page.getHtml().regex(postHrefPattern).all());

    page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
    page.putField("content", page.getHtml().smartContent());
}
 
Example 8
Source File: QzoneBlogProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the {@code http://17dujingdian.com/post/...} URLs captured from quoted anchor
 * hrefs and stores the page's title and smart-extracted content.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    // Reference URLs noted alongside this processor:
    //   http://progressdaily.diandian.com/post/2013-01-24/40046867275
    //   http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
    //   &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
    final String postHrefPattern = "<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}";
    page.addTargetRequests(page.getHtml().regex(postHrefPattern).all());

    page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
    page.putField("content", page.getHtml().smartContent());
}
 
Example 9
Source File: MamacnPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an "img:/title:" text summary from the thumbnail list items, stores it under the
 * empty field key, skips the page when the summary is empty, and queues the photo links.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    List<Selectable> thumbItems = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();

    StringBuilder summary = new StringBuilder();
    for (Selectable thumbItem : thumbItems) {
        String href = thumbItem.xpath("//a/@href").get();
        summary.append("img:").append(href).append("\n");
        String alt = thumbItem.xpath("//img/@alt").get();
        summary.append("title:").append(alt).append("\n");
    }

    String summaryText = summary.toString();
    page.putField("", summaryText);
    if (summaryText.isEmpty()) {
        page.setSkip(true);
    }

    // Links are queued even when the page itself is skipped.
    page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
}
 
Example 10
Source File: SimplePageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the links matching {@code urlPattern} and stores the page's title, raw HTML and
 * smart-extracted content.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    // Queue the URLs to fetch next.
    page.addTargetRequests(page.getHtml().links().regex(urlPattern).all());

    // Title via XPath.
    page.putField("title", page.getHtml().xpath("//title"));

    // Full HTML as text.
    String rawHtml = page.getHtml().toString();
    page.putField("html", rawHtml);

    // Main content via smartContent().
    page.putField("content", page.getHtml().smartContent());
}
 
Example 11
Source File: GithubRepoPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the GitHub repository and owner links found on the page, then stores the
 * repository's author, name and readme; the page is skipped when no name is found.
 *
 * <p>The owner-link pattern uses {@code [\w\-]+} so that the captured URL contains the
 * whole owner name rather than only its first character, and the author pattern accepts
 * hyphens so it agrees with the link patterns.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    // Repository links: https://github.com/<owner>/<repo>
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
    // Owner links: https://github.com/<owner>
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());

    page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        // No repository name on this page: skip it.
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}
 
Example 12
Source File: HuabanProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues every {@code http://huaban.com/} link and stores the image source for URLs
 * containing "pins"; result items of any other page are marked as skipped.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());

    boolean isPinPage = page.getUrl().toString().contains("pins");
    if (!isPinPage) {
        page.getResultItems().setSkip(true);
        return;
    }
    String imageSrc = page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString();
    page.putField("img", imageSrc);
}
 
Example 13
Source File: BlogProcesser.java    From mogu_blog_v2 with Apache License 2.0 5 votes vote down vote up
/**
    * Processes the pages we are interested in: saves the blog information of the current
    * page and queues every CSDN article URL found in its HTML.
    *
    * @param page the page being processed
    */
   @Override
   public void process(Page page) {
       List<String> articleUrls = page.getHtml()
               .regex("https://blog.csdn.net/[a-zA-Z0-9_]+/article/details/[0-9]{9}")
               .all();
       // The current page is saved before its outgoing article links are queued.
       saveBlogInfo(page);
       page.addTargetRequests(articleUrls);
   }
 
Example 14
Source File: NeteaseNewsPageProcesser.java    From elasticsearch-jest-example with MIT License 5 votes vote down vote up
/**
 * Handles both page kinds: list pages contribute further requests, every other page is
 * treated as an article whose fields are extracted, turned into an article and indexed.
 *
 * @param page the page being processed
 */
public void process(Page page) {
    // List page: matches URL_LIST or one of the two fixed section URLs.
    boolean isListPage = page.getUrl().regex(URL_LIST).match()
            || page.getUrl().regex("http://news\\.163\\.com/domestic").match()
            || page.getUrl().regex("http://news\\.163\\.com/shehui").match();
    if (isListPage) {
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all());
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
        return;
    }

    // Article page: extract each field and store it in the result items.
    String rawTitle = page.getHtml().xpath("//h1[@id='h1title']").toString();
    page.putField("title", Utils.replaceHTML(rawTitle));

    String rawContent = page.getHtml().xpath("//div[@id='endText']").toString();
    page.putField("content", Utils.replaceHTML(rawContent));

    String rawCreate = page.getHtml().xpath("//div[@class=\"ep-time-soure cDGray\"]").toString();
    page.putField("create", Utils.replaceHTML(rawCreate));

    String rawSource = page.getHtml().xpath("//a[@id=\"ne_article_source\"]/text()").toString();
    page.putField("source", Utils.replaceHTML(rawSource));

    page.putField("url", page.getUrl().get());

    // Read the stored values back to build the article; no author is extracted here.
    String title = (String) page.getResultItems().get("title");
    String content = (String) page.getResultItems().get("content");
    String create = (String) page.getResultItems().get("create");
    String source = (String) page.getResultItems().get("source");
    String url = (String) page.getResultItems().get("url");
    String author = "";

    // Create the article and index it.
    Utils.index(Utils.createArticle(title, content, source, author, url, create));
}
 
Example 15
Source File: GithubRepoPageProcessor.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
@Override
    /**
     * Core crawler hook: extracts the repository's author, name and readme from the page,
     * skips the page when no name is found, and queues further GitHub repository links.
     *
     * @param page the page being processed
     */
    public void process(Page page) {
        // Part two: define how page information is extracted and stored.
        String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
        page.putField("author", author);
        String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
        page.putField("name", name);

        if (name == null) {
            // No repository name on this page: skip it.
            page.setSkip(true);
        }
        // The readme XPath is evaluated once, directly for the stored field.
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

        // Part three: discover follow-up URLs on the page to crawl next.
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    }
 
Example 16
Source File: AlexanderMcqueenGoodsProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the goods fields from pages whose URL matches {@code URL_POST}; for every other
 * page it queues the {@code URL_POST} links with priority 1000 and the {@code URL_LIST}
 * links with priority 1.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    if (!page.getUrl().regex(URL_POST).match()) {
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
        return;
    }

    // Shared XPath prefixes; each concatenation yields the same expression as before.
    String descriptionBox = "//div[@id='description']";
    String priceBox = descriptionBox + "//div[@class='itemBoxPrice']/span";
    String descriptionTab = "//div[@id='tabbedDescription']"
            + "//div[@class='tabbedDescription']"
            + "//ul[@id='tabs']"
            + "//li[@id='tab_description']";
    String propertyRow = descriptionTab
            + "//div[@class='productProperty']"
            + "//div[@class='productPropertyRow']";

    page.putField("goodsName", page.getHtml().xpath(descriptionBox + "/h1/tidyText()"));
    if (page.getResultItems().get("goodsName") == null) {
        page.setSkip(true);
    }
    page.putField("currency", page.getHtml().xpath(priceBox + "//span[@class='currency']/tidyText()"));
    page.putField("goodsPrice", page.getHtml().xpath(priceBox + "//span[@class='priceValue']/tidyText()"));
    page.putField("description", page.getHtml().xpath(descriptionTab + "/div[@id='description_pane']/tidyText()"));
    page.putField("material", page.getHtml().xpath(propertyRow + "/span[2]/tidyText()"));
    page.putField("goodsCode", page.getHtml().xpath(propertyRow + "//span[@id='modelFabricColorContainer']/tidyText()"));
    page.putField("goodsSize", page.getHtml().xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
    page.putField("goodsColors", page.getHtml().xpath("//div[@id='colors']/ul/html()"));
}
 
Example 17
Source File: IteyeBlogProcessor.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Queues the {@code yanghaoli.iteye.com/blog/<digits>} links and stores the page's title
 * and smart-extracted content as strings.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    final String blogLinkPattern = ".*yanghaoli\\.iteye\\.com/blog/\\d+";
    page.addTargetRequests(page.getHtml().links().regex(blogLinkPattern).all());

    String title = page.getHtml().xpath("//title").toString();
    page.putField("title", title);

    String content = page.getHtml().smartContent().toString();
    page.putField("content", content);
}
 
Example 18
Source File: TopProcessor.java    From javabase with Apache License 2.0 4 votes vote down vote up
@Override
// process is the core hook for custom crawler logic; the extraction logic lives here.
// It reads up to 20 ranking rows into map and, on the first invocation, queues the
// remaining ranking pages derived from the total shown on the page.
public void process(Page page) {
    for (int i = 1; i <= 20; i++) {
        // Rows are addressed at position i + 1 among the 'drl_list_item' rows.
        String rowPath = "//tbody/tr[@class='drl_list_item'][" + (i + 1) + "]";

        String index;
        if (tiebaUrl.equals(page.getUrl().toString()) && i <= 3) {
            // On the start URL the first three ranks are taken from the loop counter.
            index = i + "";
        } else {
            index = page.getHtml().xpath(rowPath + "/td[@class='drl_item_index']/p/text()").toString();
        }
        if (StringUtils.isEmpty(index) || !StringUtils.isNumeric(index)) {
            continue;
        }

        String name = page.getHtml().xpath(rowPath + "/td[@class='drl_item_name']/div/a/text()").toString();
        // NOTE(review): 'div@class' has no '/' before '@' - confirm this XPath is intended.
        String level = page.getHtml().xpath(rowPath + "/td[@class='drl_item_title']/div/div@class")
                .toString().replace("bg_lv", "");
        String experience = page.getHtml().xpath(rowPath + "/td[@class='drl_item_exp']/span/text()").toString();

        map.put(index, new TopBean(index, name, level, experience, tiebaName));
    }

    // Follow-up pages are queued only once per processor instance.
    if (isAddTarget) {
        return;
    }
    String total = page.getHtml().xpath("//span[@class='drl_info_txt_gray'][1]/text()").toString();
    if (StringUtils.isNumeric(total)) {
        int lastPage = Integer.parseInt(total) / 20 + 1;
        for (int pageNo = 2; pageNo <= lastPage; pageNo++) {
            String target = new StringBuilder().append(tiebaUrl).append("&pn=").append(pageNo).toString();
            page.addTargetRequests(Arrays.asList(target));
        }
    }
    isAddTarget = true;
}
 
Example 19
Source File: CsdnBlogPageProcessor.java    From webmagic-csdnblog with MIT License 4 votes vote down vote up
@Override
// process is the core hook for custom crawler logic; the extraction logic lives here.
// Article pages are parsed into a CsdnBlog and stored; every other page is treated as a
// list page that only contributes further requests.
public void process(Page page) {
	String articleUrlBase = "http://blog\\.csdn\\.net/" + username + "/article/details/";

	if (page.getUrl().regex(articleUrlBase + "\\d+").match()) {
		// Article page.
		size++;// one more article seen
		// CsdnBlog carries the scraped data so it can be written to the database.
		CsdnBlog csdnBlog = new CsdnBlog();

		// Id: the digits at the end of the article URL.
		int id = Integer.parseInt(page.getUrl().regex(articleUrlBase + "(\\d+)").get());
		csdnBlog.setId(id);

		// Title.
		String title = page.getHtml()
				.xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get();
		csdnBlog.setTitle(title);

		// Date.
		String date = page.getHtml()
				.xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get();
		csdnBlog.setDate(date);

		// Tags (there may be several; joined by listToString).
		csdnBlog.setTags(listToString(page.getHtml()
				.xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));

		// Categories (there may be several; joined by listToString).
		csdnBlog.setCategory(listToString(page.getHtml()
				.xpath("//div[@class='category_r']/label/span/text()").all()));

		// View count.
		int views = Integer.parseInt(page.getHtml()
				.xpath("//div[@class='article_r']/span[@class='link_view']")
				.regex("(\\d+)人阅读").get());
		csdnBlog.setView(views);

		// Comment count.
		int comments = Integer.parseInt(page.getHtml()
				.xpath("//div[@class='article_r']/span[@class='link_comments']")
				.regex("\\((\\d+)\\)").get());
		csdnBlog.setComments(comments);

		// Copyright flag: 1 when the page HTML matches "bog_copyright", otherwise 0.
		int copyright = page.getHtml().regex("bog_copyright").match() ? 1 : 0;
		csdnBlog.setCopyright(copyright);

		// Store the object in the database.
		new CsdnBlogDao().add(csdnBlog);
		// Print the object to the console.
		System.out.println(csdnBlog);
		return;
	}

	// List page. The replace() calls turn the relative URLs into absolute ones.
	String relativePrefix = "/" + username + "/";
	String absolutePrefix = "http://blog.csdn.net/" + username + "/";

	// Queue all article pages, restricted to the article-list area.
	page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links()
			.regex("/" + username + "/article/details/\\d+")
			.replace(relativePrefix, absolutePrefix)
			.all());

	// Queue the other list pages, restricted to the pagination area.
	page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links()
			.regex("/" + username + "/article/list/\\d+")
			.replace(relativePrefix, absolutePrefix)
			.all());
}
 
Example 20
Source File: TomProcessor.java    From tom-crawler with Apache License 2.0 4 votes vote down vote up
/**
 * Logs the user agent configured on {@code site} and queues every link found on the page.
 *
 * @param page the page being processed
 */
public void process(Page page) {
    // Log the site's configured user agent.
    String userAgent = site.getUserAgent();
    logger.info("site info:{}", userAgent);

    page.addTargetRequests(page.getHtml().links().all());
}