Java Code Examples for us.codecraft.webmagic.Page

The following examples show how to use us.codecraft.webmagic.Page. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: mogu_blog_v2   Source File: BlogProcesser.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the blog title and article content from the page and stores them
 * as result fields. When no title is found, the page is flagged via
 * {@code setSkip(true)} and nothing is stored.
 *
 * @param page the downloaded page to extract from
 */
private void saveBlogInfo(Page page) {
    // Step 2: pull out the two pieces we need, the title and the content.
    String title = page.getHtml()
            .xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()")
            .toString();
    String content = page.getHtml()
            .xpath("//*[@id=\"article_content\"]")
            .toString();

    if (title == null) {
        // No title: skip this page.
        page.setSkip(true);
        return;
    }

    page.putField("title", title);
    page.putField("content", content);
}
 
Example 2
Source Project: plumemo   Source File: HttpClientDownloader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Converts an HTTP response into a {@code Page}.
 *
 * The response body is always stored as raw bytes. Unless the request is
 * flagged as binary content, the body is also decoded to text using
 * {@code charset}, or the value returned by {@code getHtmlCharset} when
 * {@code charset} is null.
 *
 * @param request      the request that produced the response
 * @param charset      charset to decode with, or null to detect it
 * @param httpResponse the response to convert
 * @param task         the task the request belongs to (not read here)
 * @return the populated page, marked as successfully downloaded
 * @throws IOException if reading the response body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType = "";
    if (httpResponse.getEntity().getContentType() != null) {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page page = new Page();
    page.setBytes(body);

    boolean textual = !request.isBinaryContent();
    if (textual) {
        // Fall back to detection only when the caller did not supply a charset.
        String effectiveCharset = (charset != null) ? charset : getHtmlCharset(contentType, body);
        page.setCharset(effectiveCharset);
        page.setRawText(new String(body, effectiveCharset));
    }

    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);

    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
 
Example 3
/**
 * Parses the page's raw text as JSON, walks the {@code "imgs"} array and
 * collects the {@code "objURL"} / {@code "fromPageTitleEnc"} pair of every
 * entry that has a non-null URL. The collected lists are handed to
 * {@code setUrls} and {@code setNames}.
 *
 * @param page the downloaded page whose raw text is the JSON payload
 */
@Override
public void process(Page page) {
    List<String> urls = new ArrayList<>();
    List<String> names = new ArrayList<>();

    JSONObject root = (JSONObject) JSONObject.parse(page.getRawText());
    JSONArray images = (JSONArray) root.get("imgs");

    for (int idx = 0; idx < images.size(); idx++) {
        JSONObject image = images.getJSONObject(idx);
        String imageUrl = (String) image.get("objURL");
        String imageName = (String) image.get("fromPageTitleEnc");
        if (imageUrl == null) {
            // Entries without a URL are dropped together with their name.
            continue;
        }
        urls.add(imageUrl);
        names.add(imageName);
    }

    setUrls(urls);
    setNames(names);
}
 
Example 4
Source Project: spider   Source File: CasperjsDownloader.java    License: GNU General Public License v3.0 6 votes vote down vote up
/**
 * Downloads the page for {@code request} by asking casperjs for its HTML.
 *
 * On failure, the request is re-queued through {@code addToCycleRetry}
 * when a site is available and its cycle-retry count is positive; otherwise
 * the exception is attached to the request under {@code "EXCEPTION"},
 * {@code onError} is invoked and {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task; may be null, in which case no site is available
 * @return the downloaded page, the result of {@code addToCycleRetry}, or null on error
 */
@Override
public Page download(Request request, Task task) {
    Site site = (task != null) ? task.getSite() : null;

    String html;
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task was supplied (or the task has no site);
        // guard the dereference so the failure is reported instead of
        // being masked by a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }

    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
 
Example 5
Source Project: javabase   Source File: ContentImageProcessor.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Collects the {@code src} values of all {@code .BDE_Image} elements whose
 * URL starts with the configured image prefix, converts each one with
 * {@code convertImageUrl}, and stores the non-null results under a key
 * derived from the page id. When nothing was collected, a
 * {@code ContentBean} for the page is published to the "no image" topic
 * instead. On the first call only, one target request per entry of
 * {@code pageNumberList} is queued.
 *
 * @param page the downloaded content page
 */
@Override
public void process(Page page) {
    List<String> candidates = page.getHtml().$(".BDE_Image", "src").all();
    String pageId = page.getUrl().toString().replace(tieBaConfiguration.getTiebaContentPageUrl(), "");

    List<String> converted = new ArrayList<>();
    for (String candidate : candidates) {
        if (!candidate.startsWith(tieBaConfiguration.getTiebaImageUrl())) {
            continue;
        }
        String target = convertImageUrl(candidate);
        if (target != null) {
            converted.add(target);
        }
    }

    if (converted.isEmpty()) {
        redisTemplate.convertAndSend(tieBaConfiguration.getTiebaContentNoImageIdTopic(), JSONObject.toJSONString(new ContentBean(pageId, tiebaName)));
    } else {
        map.put(WebmagicService.getByte(TieBaImageIdMessageListener.TIEBA_CONTENT_IMAGE_KEY + pageId), WebmagicService.getByte(JSONObject.toJSONString(converted)));
    }

    if (isAddTarget) {
        return;
    }
    // First invocation: queue every page id once.
    for (String id : pageNumberList) {
        String target = new StringBuilder().append(url).append(id).toString();
        page.addTargetRequests(Arrays.asList(target));
    }
    isAddTarget = true;
}
 
Example 6
Source Project: webmagic   Source File: ConfigurablePageProcessor.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Applies every configured extract rule to the page.
 *
 * Multi-valued rules select a list, single-valued rules select one string.
 * When a rule is marked not-null and its selection is empty (list) or
 * null (string), the page is flagged with {@code setSkip(true)};
 * otherwise the selection is stored in the result items under the
 * rule's field name.
 *
 * @param page the downloaded page to extract from
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        if (rule.isMulti()) {
            List<String> values = page.getHtml().selectDocumentForList(rule.getSelector());
            boolean requiredButMissing = rule.isNotNull() && values.size() == 0;
            if (requiredButMissing) {
                page.setSkip(true);
                continue;
            }
            page.getResultItems().put(rule.getFieldName(), values);
            continue;
        }

        String value = page.getHtml().selectDocument(rule.getSelector());
        boolean requiredButMissing = rule.isNotNull() && value == null;
        if (requiredButMissing) {
            page.setSkip(true);
        } else {
            page.getResultItems().put(rule.getFieldName(), value);
        }
    }
}
 
Example 7
Source Project: javabase   Source File: ContentIdProcessor.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads the {@code data-field} JSON of each thread list item (positions
 * 1..{@code pageSize}) and, for every item whose JSON contains an
 * {@code "id"}, appends a {@code ContentBean} to {@code pageNumberList}.
 * On the first call only, the follow-up list pages 2..{@code endNum} are
 * queued as target requests.
 *
 * @param page the downloaded thread list page
 */
@Override
public void process(Page page) {
    for (int position = 1; position <= pageSize; position++) {
        String json = page.getHtml().xpath("//ul[@id='thread_list']/li[@class='j_thread_list clearfix'][" + (position) + "]/@data-field").toString();
        if (json == null) {
            continue;
        }
        JSONObject fields = JSONObject.parseObject(json);
        if (!fields.containsKey("id")) {
            continue;
        }

        String pageId = fields.getString("id");
        String authorName = fields.getString("author_name");
        String date = praseDate(page, position);
        String title = page.getHtml().xpath("a[@href='" + tieBaConfiguration.getTiebaContentPageUrl() + pageId + "']/@title").toString();

        pageNumberList.add(new ContentBean(pageId, date, tiebaName, authorName, title));
    }

    if (isAddTarget) {
        return;
    }
    // First invocation: queue the remaining list pages once.
    for (int pageNo = 2; pageNo <= endNum; pageNo++) {
        String next = new StringBuilder().append(tiebaUrl).append("&pn=" + pageNo * pageSize).toString();
        page.addTargetRequests(Arrays.asList(next));
    }
    isAddTarget = true;
}
 
Example 8
Source Project: SmartEducation   Source File: CourseSpider.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Dispatches the page by URL shape: category listing pages go to
 * {@code crawerCourse}, course detail pages go to {@code crawCourseInfo}.
 * Pages matching neither pattern are ignored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Format: http://mooc.chaoxing.com/category/01/0/1000
    String categoryMatch = page.getUrl()
            .regex("http://mooc\\.chaoxing\\.com/category/\\d+/\\d/\\d+")
            .toString();
    if (categoryMatch != null) {
        System.out.println("第一层");
        crawerCourse(page);
        return;
    }

    // Format: http://mooc.chaoxing.com/course/55672.html
    String courseMatch = page.getUrl()
            .regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html")
            .toString();
    if (courseMatch != null) {
        System.out.println("第二层");
        crawCourseInfo(page);
    }
}
 
Example 9
Source Project: SmartEducation   Source File: ProfessionTypeSpider.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the profession category names and their links from the
 * {@code ul.category} list, stores both lists as result fields and saves
 * one {@code SpiderProfessionType} per name/URL pair.
 *
 * Example of the markup being matched:
 * {@code <li><a href="/category/01">...</a></li>}
 *
 * @param page the downloaded category page
 */
@Override
public void process(Page page) {
    // Category names (inner HTML of each link).
    List<String> professionTypeNameList = page.getHtml()
            .xpath("//ul[@class='category']/li/a/html()").all();
    page.putField("professionName", professionTypeNameList);

    // Category URLs. Stored under their own key: reusing "professionName"
    // here would overwrite the names stored just above.
    List<String> professionTypeUrlList = page.getHtml()
            .xpath("//ul[@class='category']/li/a/@href").all();
    page.putField("professionUrl", professionTypeUrlList);

    // Pair names with URLs by position; stop at the shorter list so a link
    // without an href cannot cause an IndexOutOfBoundsException.
    int pairCount = Math.min(professionTypeNameList.size(), professionTypeUrlList.size());
    for (int i = 0; i < pairCount; i++) {
        SpiderProfessionType model = new SpiderProfessionType(professionTypeNameList.get(i), professionTypeUrlList.get(i));
        spiderProfessionTypeService.save(model);
    }
}
 
Example 10
Source Project: webmagic   Source File: DiandianBlogProcessor.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Queues every link containing {@code /post/} for crawling and extracts
 * the title, content, date and id of the current page into result fields.
 *
 * @param page the downloaded blog page
 */
@Override
public void process(Page page) {
    // getHtml() returns an Html object that supports chained calls;
    // links() selects all links, regex() filters/extracts with a regular
    // expression, toString() takes a single result and all() takes every result.
    // addTargetRequests() adds the links to the crawl queue.
    page.addTargetRequests(
            page.getHtml().links().regex("(.*/post/.*)").all());

    // putField(key, value) adds an extracted value to the result map.
    page.putField(
            "title",
            page.getHtml().xpath("//title").regex("(.*?)\\|").toString());

    // smartContent() extracts the main text directly (readability-style,
    // per the original author's note; works best on regular article pages).
    page.putField("content", page.getHtml().smartContent());

    page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
    page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
}
 
Example 11
Source Project: webmagic   Source File: SeleniumDownloaderTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Downloads http://huaban.com/ one hundred times through the Selenium
 * downloader, printing the "pins" links found in {@code #waterfall} each
 * time and the total elapsed milliseconds at the end.
 * Ignored by default because it needs a chrome driver.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startedAt = System.currentTimeMillis();
    for (int round = 0; round < 100; round++) {
        // Minimal task: fixed UUID and a default site.
        Task task = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page page = downloader.download(new Request("http://huaban.com/"), task);
        System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
    }
    long elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);
}
 
Example 12
Source Project: webmagic   Source File: SeleniumDownloaderTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Downloads a Baidu Wenku document through the Selenium downloader (with a
 * 10000 sleep time configured on the downloader) and prints the text of
 * every {@code div.inner} element with HTML tags and non-breaking-space
 * entities stripped.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // Strip tags first, then the &nbsp; entity. The entity was previously
    // misspelled as "&nsbp;", so that replacement never matched anything.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
 
Example 13
Source Project: webmagic   Source File: HttpClientDownloader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code Page} out of an HTTP response.
 *
 * The body bytes are always attached. For non-binary requests the body is
 * additionally decoded into raw text, using the supplied {@code charset}
 * or, when that is null, the result of {@code getHtmlCharset}.
 *
 * @param request      the originating request
 * @param charset      charset to decode the body with; null triggers detection
 * @param httpResponse the HTTP response to read
 * @param task         the owning task (not read here)
 * @return a page carrying bytes, optional text, URL, request, status code
 *         and, when {@code responseHeader} is set, the response headers
 * @throws IOException if the response body cannot be read
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] payload = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    boolean hasContentType = httpResponse.getEntity().getContentType() != null;
    String contentType = hasContentType ? httpResponse.getEntity().getContentType().getValue() : "";

    Page result = new Page();
    result.setBytes(payload);

    if (!request.isBinaryContent()) {
        String textCharset = charset;
        if (textCharset == null) {
            // No charset given by the caller: derive it from the response.
            textCharset = getHtmlCharset(contentType, payload);
        }
        result.setCharset(textCharset);
        result.setRawText(new String(payload, textCharset));
    }

    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);
    if (responseHeader) {
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}
 
Example 14
Source Project: webmagic   Source File: ProcessorBenchmark.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Micro-benchmark: runs the model page processor 1000 times over a fixed
 * page, twice in a row, printing the elapsed milliseconds of each round.
 */
@Ignore
@Test
public void test() {
    ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);

    Page page = new Page();
    page.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
    page.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
    page.setHtml(new Html(html));

    // Two identical timed rounds.
    for (int round = 0; round < 2; round++) {
        long startedAt = System.currentTimeMillis();
        for (int i = 0; i < 1000; i++) {
            modelPageProcessor.process(page);
        }
        System.out.println(System.currentTimeMillis() - startedAt);
    }
}
 
Example 15
Source Project: webmagic   Source File: HttpClientDownloaderTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a cookie added to the request is sent to the server: the
 * stub server only answers "ok" when the {@code cookie} cookie equals
 * {@code cookie-webmagic}.
 */
@Test
public void test_set_request_cookie() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");

    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();

            Request cookieRequest = new Request("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie", "cookie-webmagic");

            Page downloaded = downloader.download(cookieRequest, Site.me().toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example 16
Source Project: webmagic   Source File: ModelPageProcessor.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Adds a target request for every link that matches one of the URL
 * patterns. Links are taken from the whole page, or only from the region
 * chosen by {@code urlRegionSelector} when one is given. The request URL
 * is the matched portion of the link, not necessarily the whole link.
 *
 * @param page              the page to read links from and add requests to
 * @param urlRegionSelector selector limiting where links are read; may be null
 * @param urlPatterns       patterns a link must match to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : page.getHtml().selectList(urlRegionSelector).links().all();

    for (String candidate : candidates) {
        for (Pattern pattern : urlPatterns) {
            Matcher hit = pattern.matcher(candidate);
            if (!hit.find()) {
                continue;
            }
            // A link matching several patterns is queued once per pattern.
            page.addTargetRequest(new Request(hit.group()));
        }
    }
}
 
Example 17
Source Project: webmagic   Source File: HttpClientDownloaderTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a cookie configured on the site is sent to the server: the
 * stub server only answers "ok" when the {@code cookie} cookie equals
 * {@code cookie-webmagic}.
 */
@Test
public void test_set_site_cookie() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");

    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();

            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");

            // The cookie lives on the site, scoped to the local domain.
            Site cookieSite = Site.me()
                    .addCookie("cookie", "cookie-webmagic")
                    .setDomain("127.0.0.1");

            Page downloaded = downloader.download(plainRequest, cookieSite.toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example 18
Source Project: webmagic   Source File: HttpClientDownloaderTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a request flagged as binary content yields a page with no
 * raw text and with the response body available as bytes.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer server = httpServer(13423);
    server.response("binary");

    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            final HttpClientDownloader downloader = new HttpClientDownloader();

            Request binaryRequest = new Request("http://127.0.0.1:13423/");
            binaryRequest.setBinaryContent(true);

            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());
            assertThat(downloaded.getRawText()).isNull();
            assertThat(downloaded.getBytes()).isEqualTo("binary".getBytes());
        }
    });
}
 
Example 19
Source Project: webmagic   Source File: CompositePageProcessor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Offers the page to each sub-processor whose {@code match} accepts the
 * page's request, in order. Processing stops after the first matching
 * sub-processor that does not return {@code MatchOther.YES}.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    for (SubPageProcessor candidate : subPageProcessors) {
        if (!candidate.match(page.getRequest())) {
            continue;
        }
        SubPageProcessor.MatchOther outcome = candidate.processPage(page);
        // A null outcome is also "not YES", so it stops the chain as well.
        if (outcome != SubPageProcessor.MatchOther.YES) {
            return;
        }
    }
}
 
Example 20
Source Project: blog-hunter   Source File: HunterProcessor.java    License: MIT License 5 votes vote down vote up
/**
 * Delegates page processing to a resolver chosen from the configuration:
 * a {@code JsonResolver} when the config reports an ajax request, an
 * {@code HtmlResolver} otherwise.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    Resolver resolver;
    if (config.getAjaxRequest()) {
        resolver = new JsonResolver();
    } else {
        resolver = new HtmlResolver();
    }
    resolver.process(page, config);
}
 
Example 21
Source Project: webmagic   Source File: DiaoyuwengProcessor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Queues the paginated thread-list links and the thread links found on the
 * page. When the page URL itself contains "thread", the title, content,
 * date and id fields are extracted as well.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    List<String> listingLinks = page.getHtml().links()
            .regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)")
            .all();
    page.addTargetRequests(listingLinks);

    List<String> threadLinks = page.getHtml().links()
            .regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)")
            .all();
    page.addTargetRequests(threadLinks);

    boolean isThreadPage = page.getUrl().toString().contains("thread");
    if (!isThreadPage) {
        return;
    }

    page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
    page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
    page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));

    // The id is the thread number from the URL, prefixed with "1000".
    String threadNumber = page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString();
    page.putField("id", new PlainText("1000" + threadNumber));
}
 
Example 22
Source Project: webmagic   Source File: ModelPageProcessorTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Processes the mock page with a model processor built for
 * {@code MockModel} and checks that exactly the four expected target
 * requests were extracted, in order.
 */
@Test
public void testExtractLinks() throws Exception {
    ModelPageProcessor processor = ModelPageProcessor.create(null, MockModel.class);
    Page mockPage = pageMocker.getMockPage();

    processor.process(mockPage);

    assertThat(mockPage.getTargetRequests()).containsExactly(
            new Request("http://webmagic.io/bar/3"),
            new Request("http://webmagic.io/bar/4"),
            new Request("http://webmagic.io/foo/3"),
            new Request("http://webmagic.io/foo/4"));
}
 
Example 23
Source Project: blog-hunter   Source File: JsonResolver.java    License: MIT License 5 votes vote down vote up
/**
 * Extracts article fields from a JSON page using the JSON-path expressions
 * held in {@code model}. Fields are stored only when a title is found
 * (non-empty and not the literal string "null"). Target links matching the
 * model's link regex are queued in either case.
 *
 * @param page  the downloaded page whose raw text is JSON
 * @param model the extraction configuration
 */
@Override
public void process(Page page, HunterConfig model) {
    String json = page.getRawText();
    String title = new JsonPathSelector(model.getTitleRegex()).select(json);

    boolean titleMissing = StringUtils.isEmpty(title) || "null".equals(title);
    if (!titleMissing) {
        page.putField("title", title);
        page.putField("releaseDate", new JsonPathSelector(model.getReleaseDateRegex()).select(json));
        page.putField("author", new JsonPathSelector(model.getAuthorRegex()).select(json));
        page.putField("content", new JsonPathSelector(model.getContentRegex()).select(json));
        page.putField("source", page.getRequest().getUrl());
    }

    page.addTargetRequests(
            page.getHtml().links().regex(model.getTargetLinksRegex()).all());
}
 
Example 24
Source Project: webmagic   Source File: ZipCodePageProcessor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Queues one request per district found on a province page. Each request
 * gets priority 1, inherits the {@code "province"} extra from the current
 * request and carries the district name as the {@code "district"} extra.
 *
 * @param page the downloaded province page
 */
private void processProvince(Page page) {
    // XPath alone cannot pin the rows down precisely, so a regex is used as
    // a filter: rows that do not match it are dropped.
    List<String> rows = page.getHtml().xpath("//body/table/tbody/tr[@bgcolor=\"#ffffff\"]").all();
    Pattern districtPattern = Pattern.compile("<td>([^<>]+)</td>.*?href=\"(.*?)\"", Pattern.DOTALL);

    for (String row : rows) {
        Matcher hit = districtPattern.matcher(row);
        while (hit.find()) {
            String districtName = hit.group(1);
            String districtLink = hit.group(2);
            Object province = page.getRequest().getExtra("province");
            page.addTargetRequest(
                    new Request(districtLink)
                            .setPriority(1)
                            .putExtra("province", province)
                            .putExtra("district", districtName));
        }
    }
}
 
Example 25
Source Project: blog-hunter   Source File: HtmlResolver.java    License: MIT License 5 votes vote down vote up
/**
 * Stores the result of evaluating {@code regex} (used here as an XPath
 * expression) against {@code pageHtml} under {@code key}. The
 * {@code "tags"} key stores every match; any other key stores a single
 * match. Nothing is stored when {@code regex} is empty.
 *
 * @param page     the page receiving the field
 * @param pageHtml the HTML to select from
 * @param key      the result field name
 * @param regex    the XPath expression to evaluate
 */
private void put(Page page, Html pageHtml, String key, String regex) {
    if (!StringUtils.isNotEmpty(regex)) {
        return;
    }
    if (key.equals("tags")) {
        page.putField(key, pageHtml.xpath(regex).all());
    } else {
        page.putField(key, pageHtml.xpath(regex).get());
    }
}
 
Example 26
Source Project: webmagic   Source File: HuxiuProcessor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Queues every link containing "article" and extracts the title and
 * content of the current page into result fields.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex(".*article.*").all());

    page.putField(
            "title",
            page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
    page.putField(
            "content",
            page.getHtml().xpath("//div[@id='neirong_box']/tidyText()"));
}
 
Example 27
Source Project: plumemo   Source File: HttpClientDownloader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Executes the request over HTTP and converts the response into a page.
 *
 * Starts from {@code Page.fail()}; on success the page is replaced by the
 * result of {@code handleResponse}. An {@link IOException} is logged,
 * reported through {@code onError}, and the failed page is returned. In
 * all cases the response entity is consumed and a borrowed proxy is
 * returned to the provider.
 *
 * @param request the request to download
 * @param task    the owning task; it and its site must not be null
 * @return the downloaded page, or the failed page on I/O error
 * @throws NullPointerException if {@code task} or its site is null
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }

    CloseableHttpResponse httpResponse = null;
    CloseableHttpClient httpClient = getHttpClient(task.getSite());
    Proxy proxy = null;
    if (proxyProvider != null) {
        proxy = proxyProvider.getProxy(task);
    }
    HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);

    Page page = Page.fail();
    try {
        httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
        // The request's own charset wins over the site-wide one.
        String charset = request.getCharset();
        if (charset == null) {
            charset = task.getSite().getCharset();
        }
        page = handleResponse(request, charset, httpResponse, task);
        onSuccess(request);
        logger.info("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
    } finally {
        if (httpResponse != null) {
            // ensure the connection is released back to pool
            EntityUtils.consumeQuietly(httpResponse.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, page, task);
        }
    }
    return page;
}
 
Example 28
Source Project: webmagic   Source File: KaichibaProcessor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the shop number from the page URL, queues the next shop
 * ({@code number + 1}) and extracts the title and food items of the
 * current page into result fields.
 *
 * @param page the downloaded shop page
 */
@Override
public void process(Page page) {
    String shopNumber = page.getUrl().regex("shop/(\\d+)").toString();
    int nextShop = Integer.parseInt(shopNumber) + 1;
    page.addTargetRequest("http://kaichiba.com/shop/" + nextShop);

    page.putField("title", page.getHtml().xpath("//Title"));
    // Trim leading/trailing whitespace and drop <span> fragments from each item.
    page.putField(
            "items",
            page.getHtml()
                    .xpath("//li[@class=\"foodTitle\"]")
                    .replace("^\\s+", "")
                    .replace("\\s+$", "")
                    .replace("<span>.*?</span>", ""));
}
 
Example 29
Source Project: Gather-Platform   Source File: CommonSpider.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Hands the page to the configured consumer and, unless the page's result
 * items are marked as skipped, increments the task's counter.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    spiderInfoPageConsumer.accept(page, info, task);
    if (page.getResultItems().isSkip()) {
        return;
    }
    // Only count pages that were processed normally.
    task.increaseCount();
}
 
Example 30
/**
 * Delegates to the parent implementation. An
 * {@link IllegalArgumentException} is recorded via
 * {@code writeExceptionLog}, reported through {@code onError}, logged as a
 * warning, and then rethrown.
 *
 * @param request      the originating request
 * @param charset      charset passed through to the parent
 * @param httpResponse the HTTP response to convert
 * @param task         the owning task
 * @return the page produced by the parent implementation
 * @throws IOException if the parent implementation fails to read the response
 */
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    try {
        return super.handleResponse(request, charset, httpResponse, task);
    } catch (IllegalArgumentException e) {
        writeExceptionLog(e, request);
        onError(request);
        LOG.warn("URL为:{} ,{}", request.getUrl(), e.getLocalizedMessage());
        throw e;
    }
}