Java Code Examples for us.codecraft.webmagic.Page#addTargetRequest()

The following examples show how to use us.codecraft.webmagic.Page#addTargetRequest(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ModelPageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Queues a follow-up request for every link on the page that matches one of the
 * supplied URL patterns.
 *
 * @param page              page whose links are inspected and which receives the new requests
 * @param urlRegionSelector selector restricting where links are collected from; when
 *                          {@code null}, links are taken from the whole page HTML
 * @param urlPatterns       patterns tried against each link; the matched portion of the link
 *                          becomes the URL of the queued request
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    final List<String> candidates = (urlRegionSelector != null)
            ? page.getHtml().selectList(urlRegionSelector).links().all()
            : page.getHtml().links().all();

    for (String candidate : candidates) {
        for (Pattern pattern : urlPatterns) {
            Matcher hit = pattern.matcher(candidate);
            if (!hit.find()) {
                continue;
            }
            // Only the matched region (group 0) is queued, not the full link text.
            page.addTargetRequest(new Request(hit.group()));
        }
    }
}
 
Example 2
Source File: AngularJSProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Handles two kinds of pages: list pages (URL matches {@code LIST_URL}) queue one
 * article request per id found in the JSON body; any other page has its title and
 * content extracted from the JSON body into result fields.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    boolean isListPage = page.getUrl().regex(LIST_URL).match();

    if (!isListPage) {
        JsonPathSelector titleSelector = new JsonPathSelector("$.data.title");
        page.putField("title", titleSelector.select(page.getRawText()));
        JsonPathSelector contentSelector = new JsonPathSelector("$.data.content");
        page.putField("content", contentSelector.select(page.getRawText()));
        return;
    }

    JsonPathSelector idSelector = new JsonPathSelector("$.data[*]._id");
    List<String> articleIds = idSelector.selectList(page.getRawText());
    if (!CollectionUtils.isNotEmpty(articleIds)) {
        return;
    }
    for (String articleId : articleIds) {
        page.addTargetRequest("http://angularjs.cn/api/article/" + articleId);
    }
}
 
Example 3
Source File: ZipCodePageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the table cells under the {@code newAlexa} element and queues one priority-0
 * request per cell, using the cell's href as the URL and its text as the
 * {@code "province"} extra.
 *
 * @param page the country-level page to process
 */
private void processCountry(Page page) {
    List<String> cells = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
    for (int i = 0; i < cells.size(); i++) {
        String cell = cells.get(i);
        String href = xpath("//@href").select(cell);
        String provinceName = xpath("/text()").select(cell);
        page.addTargetRequest(
                new Request(href).setPriority(0).putExtra("province", provinceName));
    }
}
 
Example 4
Source File: ZipCodePageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts district name/link pairs from the white-background table rows and queues a
 * priority-1 request for each, carrying over the {@code "province"} extra of the current
 * request and adding the district name as the {@code "district"} extra.
 *
 * @param page the province-level page to process
 */
private void processProvince(Page page) {
    // XPath alone cannot pinpoint the entries precisely, so a regex is used as a filter;
    // anything that does not match the regex is dropped.
    List<String> rows = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
    Pattern districtPattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"",Pattern.DOTALL);
    for (String row : rows) {
        for (Matcher m = districtPattern.matcher(row); m.find(); ) {
            String districtName = m.group(1);
            String href = m.group(2);
            Request next = new Request(href)
                    .setPriority(1)
                    .putExtra("province", page.getRequest().getExtra("province"))
                    .putExtra("district", districtName);
            page.addTargetRequest(next);
        }
    }
}
 
Example 5
Source File: ZipCodePageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Emits a tab-separated {@code "result"} field made of province, district and the zip code
 * found on the page, then queues priority-2 requests for every link that looks like an
 * ip138 six-digit zip code URL, forwarding the province and district extras.
 *
 * @param page the district-level page to process
 */
private void processDistrict(Page page) {
    Request origin = page.getRequest();
    String province = origin.getExtra("province").toString();
    String district = origin.getExtra("district").toString();
    String zipCode = page.getHtml().regex("<h2>邮编:(\\d+)</h2>").toString();

    String[] columns = new String[]{province, district, zipCode};
    page.putField("result", StringUtils.join(columns, "\t"));

    List<String> zipLinks = page.getHtml().links().regex("http://www\\.ip138\\.com/\\d{6}[/]?$").all();
    for (String zipLink : zipLinks) {
        Request followUp = new Request(zipLink)
                .setPriority(2)
                .putExtra("province", province)
                .putExtra("district", district);
        page.addTargetRequest(followUp);
    }
}
 
Example 6
Source File: KaichibaProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Processes a kaichiba.com shop page: queues the page for the next shop id and extracts
 * the page title and the food item titles.
 *
 * <p>When the current URL contains no {@code shop/<digits>} segment, no follow-up request
 * is queued, but the title and items are still extracted.
 *
 * @param page the downloaded shop page
 */
@Override
public void process(Page page) {
    // The regex selection yields null when the URL has no numeric shop id; parsing it
    // blindly would throw NumberFormatException and skip the field extraction below.
    String shopId = page.getUrl().regex("shop/(\\d+)").toString();
    if (shopId != null) {
        // parseInt avoids the needless Integer boxing of Integer.valueOf(...) + 1.
        int nextShopId = Integer.parseInt(shopId) + 1;
        page.addTargetRequest("http://kaichiba.com/shop/" + nextShopId);
    }
    page.putField("title",page.getHtml().xpath("//Title"));
    // Strip leading/trailing whitespace and embedded <span> elements from each item title.
    page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace("<span>.*?</span>", ""));
}
 
Example 7
Source File: CourseSpider.java    From SmartEducation with Apache License 2.0 4 votes vote down vote up
/**
	 * Crawls a course listing page: saves one {@code SpiderCourse} per listed course,
	 * queues a priority-1 request for each course URL (carrying the saved model as the
	 * {@code "courseModel"} extra), and queues priority-0 requests for profession type
	 * listing pages.
	 *
	 * @param page the downloaded listing page; its request is read for the
	 *             {@code "professionTypeModel"} extra
	 */
	public void crawerCourse(Page page) {
		// Profession type entity handed down by the parent request.
		SpiderProfessionType professionTypeModel = (SpiderProfessionType) page.getRequest().getExtra(
				"professionTypeModel");

		// Profession type name, taken from markup such as: <div class="label">哲学</div>
		String professionTypeName = page.getHtml()
				.xpath("//div[@class='label']/text()").toString();

		// Each course is rendered as a list item shaped like:
		//   <li class="ans-slow-anim">
		//     <div class="picArea ans-slow-anim"><a href="/course/198413.html"><img src="..."></a></div>
		//     <div class="introArea"><a href="/course/198413.html" title="...">course name</a></div>
		//     <div class="introArea2" title="teachers and school">...</div>
		//   </li>
		// Course names.
		List<String> courseNameList = page.getHtml()
				.xpath("//div[@class='introArea']/a/html()").all();
		// Course URLs.
		List<String> courseUrlList = page.getHtml()
				.xpath("//div[@class='introArea']/a/@href").all();
		// Course info (title attribute of the introArea2 block).
		List<String> infoList = page.getHtml()
				.xpath("//div[@class='introArea2']/@title").all();
		// Course image URLs.
		List<String> courseImgUrlList = page.getHtml()
				.xpath("//div[@class='picArea ans-slow-anim']/a/img/@src").all();

		// The four lists are selected independently, so bound the loop by the shortest one;
		// indexing all of them by the name list's size would throw if any came back shorter.
		int courseCount = Math.min(
				Math.min(courseNameList.size(), courseUrlList.size()),
				Math.min(infoList.size(), courseImgUrlList.size()));
		for (int i = 0; i < courseCount; i++) {
			SpiderCourse model = new SpiderCourse(courseNameList.get(i).trim(),
					courseUrlList.get(i).trim(), infoList.get(i), professionTypeName,
					courseImgUrlList.get(i), professionTypeModel, 0);
			spiderCourseService.save(model);

			// Course detail pages get priority 1.
			page.addTargetRequest(new Request(courseUrlList.get(i))
					.setPriority(1).putExtra("courseModel", model));
		}

		// Queue the listing page of every stored profession type with priority 0.
		// NOTE(review): iteration starts at index 2, skipping the first two entries;
		// the reason is not visible here — confirm this is intentional.
		List<SpiderProfessionType> list = spiderProfessionTypeService.findAll();
		for (int j = 2; j < list.size(); j++) {
			SpiderProfessionType professionType = list.get(j);
			page.addTargetRequest(new Request(professionType.getUrl() + "/0/1400").setPriority(0)
					.putExtra("professionTypeModel", professionType));
		}
	}