Java Code Examples for us.codecraft.webmagic.Page#putField()

The following examples show how to use us.codecraft.webmagic.Page#putField(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BlogProcesser.java    From mogu_blog_v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the blog title and article body from the page and stores them as
 * result fields. When no title is found, the page is flagged to be skipped.
 *
 * @param page the downloaded page to extract from
 */
private void saveBlogInfo(Page page) {
        // Step 2: pull out the pieces we need, the title and the content.
        final String articleTitle = page.getHtml()
                .xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()")
                .toString();
        final String articleBody = page.getHtml()
                .xpath("//*[@id=\"article_content\"]")
                .toString();

        if (articleTitle == null) {
            // No title: skip this page.
            page.setSkip(true);
            return;
        }

        page.putField("title", articleTitle);
        page.putField("content", articleBody);
    }
 
Example 2
Source File: HtmlResolver.java    From blog-hunter with MIT License 6 votes vote down vote up
/**
 * Extracts article fields from an HTML page using the selector expressions in
 * {@code model}, then queues follow-up links unless the model is in single-page mode.
 *
 * @param page  the downloaded page
 * @param model the configuration supplying the selector expressions
 */
@Override
public void process(Page page, HunterConfig model) {
    final Html document = page.getHtml();
    final String title = document.xpath(model.getTitleRegex()).get();
    final String source = page.getRequest().getUrl();

    // Single-page mode always extracts; otherwise a usable title is required
    // and the URL must not be one of the configured entry URLs.
    final boolean shouldExtract = model.isSingle()
            || (!StringUtils.isEmpty(title)
                    && !"null".equals(title)
                    && !model.getEntryUrls().contains(source));

    if (shouldExtract) {
        page.putField("title", title);
        page.putField("source", source);
        this.put(page, document, "releaseDate", model.getReleaseDateRegex());
        this.put(page, document, "author", model.getAuthorRegex());
        this.put(page, document, "content", model.getContentRegex());
        this.put(page, document, "tags", model.getTagRegex());
        this.put(page, document, "description", model.getDescriptionRegex());
        this.put(page, document, "keywords", model.getKeywordsRegex());
    }

    // Follow links only when crawling more than one page and a link pattern is set.
    if (!model.isSingle() && StringUtils.isNotEmpty(model.getTargetLinksRegex())) {
        page.addTargetRequests(page.getHtml().links().regex(model.getTargetLinksRegex()).all());
    }
}
 
Example 3
Source File: DiandianBlogProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Queues every link that looks like a post and records the title, content,
 * date and id extracted from the current page.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // getHtml() returns an Html object that supports chained calls: links()
    // extracts all links, regex() filters them, and all() returns every match
    // (toString() would return a single result instead).
    final List<String> postLinks = page.getHtml().links().regex("(.*/post/.*)").all();
    // addTargetRequests() adds the links to the queue of pages to crawl.
    page.addTargetRequests(postLinks);

    // putField(key, value) adds an extracted value to the result map.
    // The title is selected with XPath and then narrowed with a regex.
    final String title = page.getHtml().xpath("//title").regex("(.*?)\\|").toString();
    page.putField("title", title);

    // smartContent() extracts the main text with a readability-style technique;
    // it works reasonably well on regularly structured pages.
    page.putField("content", page.getHtml().smartContent());

    // The date and the id are both taken from the page URL.
    page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
    page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
}
 
Example 4
Source File: SinaBlogProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Handles both page types of the blog: list pages contribute further links to
 * crawl, every other page is treated as an article and has its fields extracted.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    final boolean isListPage = page.getUrl().regex(URL_LIST).match();

    if (!isListPage) {
        // Article page
        page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2"));
        page.putField("content",
                page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']"));
        page.putField("date",
                page.getHtml()
                        .xpath("//div[@id='articlebody']//span[@class='time SG_txtc']")
                        .regex("\\((.*)\\)"));
        return;
    }

    // List page: queue the article links first, then the other list pages.
    page.addTargetRequests(
            page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all());
    page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
}
 
Example 5
Source File: F58PageProcesser.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the links matching the "yewu" path pattern and records the page
 * title and the selected body nodes.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Follow-up links to crawl.
    page.addTargetRequests(page.getHtml().links().regex(".*/yewu/.*").all());

    // Extracted fields.
    page.putField("title", page.getHtml().regex("<title>(.*)</title>"));
    page.putField("body", page.getHtml().xpath("//dd"));
}
 
Example 6
Source File: AngularJSProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Processes a JSON response. A list response yields one follow-up request per
 * article id; any other response is treated as an article and has its title
 * and content recorded.
 *
 * @param page the downloaded page whose raw text is JSON
 */
@Override
public void process(Page page) {
    if (!page.getUrl().regex(LIST_URL).match()) {
        // Article response: read both fields from the same raw text.
        final String json = page.getRawText();
        page.putField("title", new JsonPathSelector("$.data.title").select(json));
        page.putField("content", new JsonPathSelector("$.data.content").select(json));
        return;
    }

    // List response: request each article by id.
    final List<String> articleIds =
            new JsonPathSelector("$.data[*]._id").selectList(page.getRawText());
    if (CollectionUtils.isNotEmpty(articleIds)) {
        for (String articleId : articleIds) {
            page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
        }
    }
}
 
Example 7
Source File: MamacnPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the link and image caption of each thumbnail list item into one
 * text blob stored as a result field, skips the page when nothing was
 * collected, and queues further photo pages.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    final StringBuilder summary = new StringBuilder();
    for (Selectable item : page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes()) {
        final String href = item.xpath("//a/@href").get();
        final String caption = item.xpath("//img/@alt").get();
        summary.append("img:").append(href).append("\n");
        summary.append("title:").append(caption).append("\n");
    }

    // The summary is stored under the empty key, exactly as before.
    page.putField("", summary.toString());
    if (summary.length() == 0) {
        // Nothing was collected from this page.
        page.setSkip(true);
    }

    page.addTargetRequests(
            page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());
}
 
Example 8
Source File: KaichibaProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Walks shop pages sequentially: reads the numeric shop id from the current
 * URL, queues the next id, and records the page title and food items.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // NOTE(review): a URL without "shop/<digits>" presumably yields no match here
    // and the parse below then fails with NumberFormatException - confirm.
    final String shopId = page.getUrl().regex("shop/(\\d+)").toString();
    final int nextShopId = Integer.parseInt(shopId) + 1;
    page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);

    page.putField("title", page.getHtml().xpath("//Title"));
    // Strip leading whitespace, trailing whitespace and <span> elements from each item.
    page.putField("items",
            page.getHtml()
                    .xpath("//li[@class=\"foodTitle\"]")
                    .replace("^\\s+", "")
                    .replace("\\s+$", "")
                    .replace("<span>.*?</span>", ""));
}
 
Example 9
Source File: ZipCodePageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Processes a district page: records a tab-separated line of province,
 * district and zip code, then queues matching links with the same province
 * and district attached as request extras.
 *
 * @param page the downloaded district page; its request must carry the
 *             "province" and "district" extras
 */
private void processDistrict(Page page) {
    final Request current = page.getRequest();
    final String province = current.getExtra("province").toString();
    final String district = current.getExtra("district").toString();
    final String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    final String[] columns = new String[]{province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    // Carry province and district along to the follow-up requests.
    for (String link : page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all()) {
        final Request next = new Request(link)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(next);
    }
}
 
Example 10
Source File: GithubRepoPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues repository and user links, then builds a {@code GithubRepo} from the
 * current page. The page is skipped when no repository name is found;
 * otherwise the repo is stored as a result field.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    final GithubRepo repo = new GithubRepo();
    repo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    repo.setName(page.getHtml()
            .xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()")
            .toString());
    repo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());

    if (repo.getName() == null) {
        // No repository name: skip this page.
        page.setSkip(true);
        return;
    }
    page.putField("repo", repo);
}
 
Example 11
Source File: HuabanProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues every huaban.com link and, for URLs containing "pins", records the
 * image source. Results of all other pages are skipped.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());

    final boolean isPinPage = page.getUrl().toString().contains("pins");
    if (!isPinPage) {
        page.getResultItems().setSkip(true);
        return;
    }

    final String imageSrc =
            page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString();
    page.putField("img", imageSrc);
}
 
Example 12
Source File: TianyaPageProcesser.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the post links found in anchor tags and records the post title and
 * the automatically extracted main content.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Relative post links are read from the href attribute of anchor tags.
    page.addTargetRequests(
            page.getHtml().regex("<a[^<>]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all());

    page.putField("title",
            page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b"));
    page.putField("body", page.getHtml().smartContent());
}
 
Example 13
Source File: DiaoyuwengProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the user's thread-list pages and the individual thread pages, and
 * records title, content, date and id when the current URL contains "thread".
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Paginated thread-list pages of the user.
    List<String> requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
    page.addTargetRequests(requests);

    // Individual thread pages. The dot before "html" is escaped so that it only
    // matches a literal '.', consistent with the other dots in these patterns.
    requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
    page.addTargetRequests(requests);

    // NOTE(review): the list-page URL above also contains "thread" (do=thread),
    // so this branch runs on list pages as well - confirm whether that is intended.
    if (page.getUrl().toString().contains("thread")) {
        page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
        page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
        page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
        // The id is the thread number from the URL, prefixed with "1000".
        page.putField("id", new PlainText("1000" + page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html").toString()));
    }
}
 
Example 14
Source File: GithubRepoPageProcessor.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
/**
 * Core hook for the crawler's custom logic: extracts the repository author,
 * name and readme from the page, flags the page to be skipped when no name is
 * found, and queues further repository links.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Part two: define how page information is extracted and stored.
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    page.putField("author", author);
    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);

    if (name == null) {
        // Skip this page; link discovery below still runs.
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    // Part three: discover follow-up URLs on the page to crawl next.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
}
 
Example 15
Source File: QzoneBlogProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues the post links found in anchor tags and records the post title and
 * the automatically extracted main content.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Absolute post links are read from the href attribute of anchor tags.
    page.addTargetRequests(
            page.getHtml()
                    .regex("<a[^<>]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}")
                    .all());

    page.putField("title", page.getHtml().xpath("//div[@id='content']//h2/a"));
    page.putField("content", page.getHtml().smartContent());
}
 
Example 16
Source File: JsonResolver.java    From blog-hunter with MIT License 5 votes vote down vote up
/**
 * Extracts article fields from a JSON response using the JSONPath expressions
 * in {@code model}, then queues follow-up links when a target-link pattern is
 * configured.
 *
 * @param page  the downloaded page whose raw text is JSON
 * @param model the configuration supplying the JSONPath expressions and link pattern
 */
@Override
public void process(Page page, HunterConfig model) {
    String rawText = page.getRawText();
    String title = new JsonPathSelector(model.getTitleRegex()).select(rawText);
    // Only record fields when a real title was found; the literal "null" is
    // treated the same as a missing title.
    if (!StringUtils.isEmpty(title) && !"null".equals(title)) {
        page.putField("title", title);
        page.putField("releaseDate", new JsonPathSelector(model.getReleaseDateRegex()).select(rawText));
        page.putField("author", new JsonPathSelector(model.getAuthorRegex()).select(rawText));
        page.putField("content", new JsonPathSelector(model.getContentRegex()).select(rawText));
        page.putField("source", page.getRequest().getUrl());
    }

    // Skip link discovery when no target-link pattern is configured, instead of
    // handing a null or empty pattern to regex().
    String targetLinksRegex = model.getTargetLinksRegex();
    if (!StringUtils.isEmpty(targetLinksRegex)) {
        page.addTargetRequests(page.getHtml().links().regex(targetLinksRegex).all());
    }
}
 
Example 17
Source File: ChaoXingTest.java    From SmartEducation with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts profession names, course counts, links and profession types from a
 * listing page, stores each list as a result field, and saves one
 * {@code SpiderProfession} per listed entry.
 *
 * @param page the downloaded listing page
 * @throws NumberFormatException if a course-count text contains no digits
 */
@Override
public void process(Page page) {
    // Markup of one listing entry that the selectors below are written against:
    // <li>
    //   <a href="/courselist?professionid=100087&xuekeid=6" target="_self"><img class="fl" src="/zy/2288.jpg" onerror="src='/zy/no_course.jpg'" width="136" height="80"></a>
    //   <ul>
    //     <li class="name"><a target="_self" href="/courselist?professionid=100087xuekeid=6&amp;schoolid=0">软件工程专业</a></li>
    //     <li class="info">
    //       <span class="num"><b class="icons"></b>共有42门课程</span><span class="relation" title="所属学科:工学"><b class="icons"></b>工学</span>
    //     </li>
    //   </ul>
    // </li>

    // Profession names.
    List<String> professionNameList = page.getHtml().xpath("//ul/li[@class='name']/a/html()").all();
    page.putField("professionName", professionNameList);

    // Course counts (text such as "共有42门课程").
    List<String> courseCountList = page.getHtml().xpath("//ul/li[@class='info']/span[@class='num']/text()").all();
    page.putField("courseCount", courseCountList);

    // Links. Selecting via links() would also work, but each entry contains
    // two anchors, so that would produce duplicates.
    List<String> urlList = page.getHtml().xpath("//li[@class='name']/a/@href").all();
    page.putField("url", urlList);

    // Profession types.
    List<String> professionTypeList = page.getHtml().xpath("//span[@class='relation']/text()").all();
    page.putField("professionType", professionTypeList);

    // The four lists are read in parallel, so only iterate as far as the
    // shortest one to avoid indexing past the end of any of them.
    int rowCount = Math.min(
            Math.min(professionNameList.size(), courseCountList.size()),
            Math.min(urlList.size(), professionTypeList.size()));

    // Compiled once outside the loop; strips everything except digits.
    Pattern nonDigits = Pattern.compile("[^0-9]");

    // Save every extracted entry to the database.
    for (int i = 0; i < rowCount; i++) {
        Integer count = Integer.parseInt(nonDigits.matcher(courseCountList.get(i)).replaceAll(""));
        SpiderProfession spiderProfession = new SpiderProfession(
                professionNameList.get(i), count, urlList.get(i), professionTypeList.get(i));
        spiderProfessionService.save(spiderProfession);
    }
}
 
Example 18
Source File: BaiduBaikePageProcessor.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Records the entry name and its summary text from the page.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    final String entryName = page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString();
    page.putField("name", entryName);

    page.putField("description",
            page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
}
 
Example 19
Source File: IteyeBlogProcessor.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Queues the blog-post links of the target blog and records the page title
 * and the automatically extracted main content as strings.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all());

    final String title = page.getHtml().xpath("//title").toString();
    page.putField("title", title);

    final String content = page.getHtml().smartContent().toString();
    page.putField("content", content);
}
 
Example 20
Source File: PhantomJSPageProcessor.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the page's raw text as the "html" result field when raw text is present.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    final String rawText = page.getRawText();
    if (rawText != null) {
        page.putField("html", rawText);
    }
}