us.codecraft.webmagic.Page Java Examples

The following examples show how to use us.codecraft.webmagic.Page. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a request flagged as binary content exposes the response
 * body through {@code getBytes()} and leaves the raw text unset.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer binaryServer = httpServer(13423);
    binaryServer.response("binary");
    Runner.running(binaryServer, new Runnable() {
        @Override
        public void run() throws Exception {
            // Build the binary request against the stub server.
            Request binaryRequest = new Request();
            binaryRequest.setBinaryContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");

            final HttpClientDownloader downloader = new HttpClientDownloader();
            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());

            byte[] expectedBytes = "binary".getBytes();
            assertThat(downloaded.getRawText()).isNull();
            assertThat(downloaded.getBytes()).isEqualTo(expectedBytes);
        }
    });
}
 
Example #2
Source File: SeleniumDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Manual check (ignored by default): downloads a Baidu Wenku document
 * through the Selenium downloader and prints the text of every
 * {@code div.inner} element with markup and non-breaking-space entities
 * removed.
 */
@Ignore
@Test
public void testBaiduWenku() {
	SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
	seleniumDownloader.setSleepTime(10000);
	Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
		@Override
		public String getUUID() {
			return "huaban.com";
		}

		@Override
		public Site getSite() {
			return Site.me();
		}
	});
	// Strip tags first, then the "&nbsp;" entities. The entity was previously
	// misspelled "&nsbp;", so it never matched anything.
	System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nbsp;","").all());
}
 
Example #3
Source File: ProcessorBenchmark.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Manual benchmark (ignored by default): runs the model page processor
 * 1000 times over the same pre-built page, twice, and prints the elapsed
 * milliseconds of each pass.
 */
@Ignore
@Test
public void test() {
    ModelPageProcessor processor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);

    // One in-memory page is reused for every iteration.
    Page blogPage = new Page();
    blogPage.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
    blogPage.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
    blogPage.setHtml(new Html(html));

    // First timed pass.
    long startedAt = System.currentTimeMillis();
    for (int remaining = 1000; remaining > 0; remaining--) {
        processor.process(blogPage);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // Second timed pass.
    startedAt = System.currentTimeMillis();
    for (int remaining = 1000; remaining > 0; remaining--) {
        processor.process(blogPage);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
}
 
Example #4
Source File: BlogProcesser.java    From mogu_blog_v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the blog title and article body from the page and stores them
 * as result fields; marks the page as skipped when no title is found.
 *
 * @param page the downloaded page to extract from
 */
private void saveBlogInfo(Page page) {
    // Step 2: pull out the content we need - title and content.
    final String blogTitle = page.getHtml().xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString();
    final String blogContent = page.getHtml().xpath("//*[@id=\"article_content\"]").toString();

    if (blogTitle == null) {
        // No title: skip this page.
        page.setSkip(true);
        return;
    }

    page.putField("title", blogTitle);
    page.putField("content", blogContent);
}
 
Example #5
Source File: HttpClientDownloader.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Converts an HTTP response into a {@link Page}.
 *
 * <p>The response body is always stored as bytes. Unless the request is
 * flagged as binary content, it is also decoded into raw text using the
 * given charset, or the one returned by {@code getHtmlCharset} when the
 * given charset is {@code null}.
 *
 * @param request      the request that produced the response
 * @param charset      charset to decode with, or {@code null} to detect it
 * @param httpResponse the response to convert
 * @param task         the running task (not used by this method)
 * @return a page marked as successfully downloaded
 * @throws IOException if the body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType;
    if (httpResponse.getEntity().getContentType() == null) {
        contentType = "";
    } else {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page result = new Page();
    result.setBytes(body);

    boolean textual = !request.isBinaryContent();
    if (textual) {
        String effectiveCharset = charset != null ? charset : getHtmlCharset(contentType, body);
        result.setCharset(effectiveCharset);
        result.setRawText(new String(body, effectiveCharset));
    }

    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);

    // Response headers are copied only when the responseHeader flag is set.
    if (responseHeader) {
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}
 
Example #6
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a cookie added to the request is sent to the server: the
 * stub only answers "ok" when the expected cookie value arrives.
 */
@Test
public void test_set_request_cookie() throws Exception {
    HttpServer cookieServer = httpServer(13423);
    cookieServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(cookieServer, new Runnable() {
        @Override
        public void run() throws Exception {
            // The cookie is attached to the request itself.
            Request cookieRequest = new Request();
            cookieRequest.setUrl("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie","cookie-webmagic");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page response = downloader.download(cookieRequest, Site.me().toTask());

            assertThat(response.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #7
Source File: HttpClientDownloader.java    From plumemo with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@link Page} from an HTTP response.
 *
 * <p>The body bytes are always kept. For non-binary requests the bytes
 * are additionally decoded to raw text, using {@code charset} when it is
 * given and the result of {@code getHtmlCharset} otherwise.
 *
 * @param request      the originating request
 * @param charset      decoding charset, or {@code null} to detect it
 * @param httpResponse the HTTP response to read
 * @param task         the running task (not used by this method)
 * @return the populated page, flagged as a successful download
 * @throws IOException if reading or decoding the body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final byte[] content = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    // A missing Content-Type header is treated as an empty string.
    String contentType = "";
    if (httpResponse.getEntity().getContentType() != null) {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    final Page page = new Page();
    page.setBytes(content);
    if (!request.isBinaryContent()) {
        final String resolvedCharset = (charset == null) ? getHtmlCharset(contentType, content) : charset;
        page.setCharset(resolvedCharset);
        page.setRawText(new String(content, resolvedCharset));
    }

    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);

    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
 
Example #8
Source File: SeleniumDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Manual check (ignored: needs a chrome driver): downloads the Huaban
 * home page 100 times through the Selenium downloader, printing the
 * "pins" links found under {@code #waterfall} each time and the total
 * elapsed milliseconds at the end.
 */
@Ignore("need chrome driver")
@Test
public void test() {
	SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
	long startMillis = System.currentTimeMillis();
	int round = 0;
	while (round < 100) {
		Request homeRequest = new Request("http://huaban.com/");
		Task huabanTask = new Task() {
			@Override
			public String getUUID() {
				return "huaban.com";
			}

			@Override
			public Site getSite() {
				return Site.me();
			}
		};
		Page fetched = downloader.download(homeRequest, huabanTask);
		System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
		round++;
	}
	System.out.println(System.currentTimeMillis() - startMillis);
}
 
Example #9
Source File: DownloadPicture.java    From Gather-Platform with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Parses the page's raw text as JSON and collects, for every entry of the
 * {@code imgs} array that has an {@code objURL}, that URL together with
 * its {@code fromPageTitleEnc} value. The two lists are handed to
 * {@code setUrls} and {@code setNames}.
 *
 * <p>A payload that is null or lacks an {@code imgs} array yields empty
 * lists instead of a {@link NullPointerException}.
 *
 * @param page the downloaded page whose raw text is a JSON document
 */
@Override
public void process(Page page) {
    List<String> urls = new ArrayList<>();
    List<String> names = new ArrayList<>();

    JSONObject payload = (JSONObject) JSONObject.parse(page.getRawText());
    JSONArray images = payload == null ? null : (JSONArray) payload.get("imgs");

    if (images != null) {
        for (int i = 0; i < images.size(); i++) {
            // Look the element up once instead of once per field.
            JSONObject image = images.getJSONObject(i);
            if (image == null) {
                continue;
            }
            String url = (String) image.get("objURL");
            if (url != null) {
                urls.add(url);
                // Names stay index-aligned with urls, so a null name is kept.
                names.add((String) image.get("fromPageTitleEnc"));
            }
        }
    }

    setUrls(urls);
    setNames(names);
}
 
Example #10
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a cookie configured on the {@code Site} is sent to the
 * server: the stub only answers "ok" when the expected cookie arrives.
 */
@Test
public void test_set_site_cookie() throws Exception {
    HttpServer cookieServer = httpServer(13423);
    cookieServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(cookieServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();

            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");

            // The cookie lives on the site configuration, not on the request.
            Site cookieSite = Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1");

            Page response = downloader.download(plainRequest, cookieSite.toTask());
            assertThat(response.getRawText()).isEqualTo("ok");
        }
    });
}
 
Example #11
Source File: ProfessionTypeSpider.java    From SmartEducation with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts profession-type names and URLs from the category list and
 * saves one {@code SpiderProfessionType} per name/URL pair.
 *
 * <p>Names are stored under the result key {@code "professionName"} and
 * URLs under {@code "professionUrl"}. Both were previously written to
 * {@code "professionName"}, so the URLs overwrote the names.
 *
 * @param page the downloaded category page
 */
@Override
public void process(Page page) {
	// Sample markup: <li><a href="/category/01">Philosophy</a></li>
	// Select the names.
	List<String> professionTypeNameList = page.getHtml()
			.xpath("//ul[@class='category']/li/a/html()").all();
	page.putField("professionName", professionTypeNameList);
	// Select the URLs.
	List<String> professionTypeUrlList = page.getHtml().xpath("//ul[@class='category']/li/a/@href").all();
	page.putField("professionUrl", professionTypeUrlList);

	// Pair by index, but never read past the shorter list (an <a> without
	// an href would otherwise cause an IndexOutOfBoundsException).
	int pairCount = Math.min(professionTypeNameList.size(), professionTypeUrlList.size());
	for (int i = 0; i < pairCount; i++) {
		SpiderProfessionType model = new SpiderProfessionType(professionTypeNameList.get(i), professionTypeUrlList.get(i));
		spiderProfessionTypeService.save(model);
	}
}
 
Example #12
Source File: CasperjsDownloader.java    From spider with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Downloads a page by rendering the request URL through CasperJS.
 *
 * <p>On failure the request is put back for a cycle retry when the site
 * allows it. Otherwise the exception is attached to the request under
 * {@code "EXCEPTION"}, {@code onError} is invoked and {@code null} is
 * returned.
 *
 * @param request the request to download
 * @param task    the running task; may be {@code null}
 * @return the downloaded page, the cycle-retry result, or {@code null}
 *         on an unrecoverable error
 */
@Override
public Page download(Request request, Task task) {
    String html = null;
    Site site = task != null ? task.getSite() : null;
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task was supplied; guard it so the original
        // failure is reported instead of a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
 
Example #13
Source File: ContentImageProcessor.java    From javabase with Apache License 2.0 6 votes vote down vote up
/**
 * Collects the converted image URLs of a content page and either stores
 * them in {@code map} or publishes a "no image" message; on the first
 * call it also queues one target request per entry of
 * {@code pageNumberList}.
 *
 * @param page the downloaded content page
 */
@Override
public void process(Page page) {
    List<String> rawImageUrls = page.getHtml().$(".BDE_Image", "src").all();
    String pageId = page.getUrl().toString().replace(tieBaConfiguration.getTiebaContentPageUrl(),"");

    // Keep only URLs under the configured image prefix that convert to non-null.
    List<String> convertedUrls = new ArrayList<>();
    for (String rawUrl : rawImageUrls) {
        if (!rawUrl.startsWith(tieBaConfiguration.getTiebaImageUrl())) {
            continue;
        }
        String convertedUrl = convertImageUrl(rawUrl);
        if (convertedUrl != null) {
            convertedUrls.add(convertedUrl);
        }
    }

    if (convertedUrls.isEmpty()) {
        redisTemplate.convertAndSend(tieBaConfiguration.getTiebaContentNoImageIdTopic(), JSONObject.toJSONString(new ContentBean(pageId,tiebaName)));
    } else {
        map.put(WebmagicService.getByte(TieBaImageIdMessageListener.TIEBA_CONTENT_IMAGE_KEY+pageId), WebmagicService.getByte(JSONObject.toJSONString(convertedUrls)));
    }

    // Target requests are queued only once per processor instance.
    if (isAddTarget) {
        return;
    }
    for (String id : pageNumberList) {
        String target = new StringBuilder().append(url).append(id).toString();
        page.addTargetRequests(Arrays.asList(target));
    }
    isAddTarget = true;
}
 
Example #14
Source File: ConfigurablePageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Applies every configured extract rule to the page.
 *
 * <p>Multi-valued rules store a list and single-valued rules store a
 * string. When a rule is marked not-null and yields nothing, the page is
 * flagged as skipped and no value is stored for that rule.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        if (!rule.isMulti()) {
            // Single-valued rule.
            String value = page.getHtml().selectDocument(rule.getSelector());
            boolean missing = rule.isNotNull() && value == null;
            if (missing) {
                page.setSkip(true);
            } else {
                page.getResultItems().put(rule.getFieldName(), value);
            }
            continue;
        }

        // Multi-valued rule.
        List<String> values = page.getHtml().selectDocumentForList(rule.getSelector());
        boolean missing = rule.isNotNull() && values.isEmpty();
        if (missing) {
            page.setSkip(true);
        } else {
            page.getResultItems().put(rule.getFieldName(), values);
        }
    }
}
 
Example #15
Source File: ContentIdProcessor.java    From javabase with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the {@code data-field} JSON of each thread row on the list page
 * and records a {@code ContentBean} for every row that carries an
 * {@code id}; on the first call it also queues the follow-up list pages
 * (2 to {@code endNum}).
 *
 * <p>Each row's JSON is parsed once, and a null parse result is skipped
 * rather than dereferenced.
 *
 * @param page the downloaded thread-list page
 */
@Override
public void process(Page page) {
    for (int i = 1; i <= pageSize; i++) {
        String json = page.getHtml().xpath("//ul[@id='thread_list']/li[@class='j_thread_list clearfix'][" + (i) + "]/@data-field").toString();
        if (json == null) {
            continue;
        }
        // Parse once; the original parsed the same string twice.
        JSONObject jsonObject = JSONObject.parseObject(json);
        if (jsonObject == null || !jsonObject.containsKey("id")) {
            continue;
        }
        String pageId = jsonObject.getString("id");
        String authorName = jsonObject.getString("author_name");
        String date = praseDate(page, i);
        String title = page.getHtml().xpath("a[@href='"+tieBaConfiguration.getTiebaContentPageUrl()+pageId+"']/@title").toString();

        pageNumberList.add(new ContentBean(pageId, date, tiebaName, authorName, title));
    }

    // Follow-up list pages are queued only once per processor instance.
    if (!isAddTarget) {
        for (int i = 2; i <= endNum; i++) {
            StringBuilder sb = new StringBuilder();
            sb.append(tiebaUrl).append("&pn=").append(i * pageSize);
            page.addTargetRequests(Arrays.asList(sb.toString()));
        }
        isAddTarget = true;
    }
}
 
Example #16
Source File: CourseSpider.java    From SmartEducation with Apache License 2.0 6 votes vote down vote up
/**
 * Dispatches the page by URL shape: category listing pages go to
 * {@code crawerCourse}, course detail pages go to
 * {@code crawCourseInfo}; any other URL is ignored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
	// Format: http://mooc.chaoxing.com/category/01/0/1000
	boolean isCategoryPage = page.getUrl()
			.regex("http://mooc\\.chaoxing\\.com/category/\\d+/\\d/\\d+")
			.toString() != null;
	if (isCategoryPage) {
		System.out.println("第一层");
		crawerCourse(page);
		return;
	}

	// Format: http://mooc.chaoxing.com/course/55672.html
	boolean isCoursePage = page.getUrl()
			.regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html")
			.toString() != null;
	if (isCoursePage) {
		System.out.println("第二层");
		crawCourseInfo(page);
	}
}
 
Example #17
Source File: DiandianBlogProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Queues every link containing "/post/" for crawling and stores the
 * title, content, date and id of the current page as result fields.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // getHtml() returns the Html object and supports chained calls:
    // links() extracts all links, regex() filters/extracts by regular
    // expression, all() returns every result and toString() a single one.
    // addTargetRequests() puts the links to crawl on the queue.
    page.addTargetRequests(page.getHtml().links().regex("(.*/post/.*)").all());

    // putField(key, value) adds an extracted value to the result map;
    // xpath() extracts with an XPath expression.
    String title = page.getHtml().xpath("//title").regex("(.*?)\\|").toString();
    page.putField("title", title);

    // Per the original author's note, smartContent() extracts the main
    // text with a readability-style technique and works well on regular pages.
    page.putField("content", page.getHtml().smartContent());

    // Date and id are both taken from the page URL.
    page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
    page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
}
 
Example #18
Source File: ModelPageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a target request for every link that matches one of the given URL
 * patterns. Links come from the selected region, or from the whole page
 * when no region selector is given. A link matching several patterns is
 * added once per matching pattern.
 *
 * @param page              the page whose links are examined
 * @param urlRegionSelector selector limiting where links are taken from,
 *                          or {@code null} for the whole page
 * @param urlPatterns       patterns a link must match to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = (urlRegionSelector != null)
            ? page.getHtml().selectList(urlRegionSelector).links().all()
            : page.getHtml().links().all();

    for (String candidate : candidates) {
        for (Pattern pattern : urlPatterns) {
            Matcher match = pattern.matcher(candidate);
            if (!match.find()) {
                continue;
            }
            // Only the matched portion of the link is queued.
            page.addTargetRequest(new Request(match.group(0)));
        }
    }
}
 
Example #19
Source File: NeteaseNewsPageProcesser.java    From elasticsearch-jest-example with MIT License 5 votes vote down vote up
/**
 * Processes a NetEase news page. List pages contribute further post and
 * list links to the crawl queue; any other page is treated as an article
 * whose fields are extracted, stored as result fields and indexed.
 *
 * @param page the downloaded page
 */
public void process(Page page) {
    // List pages
    boolean isListPage = page.getUrl().regex(URL_LIST).match()
            || page.getUrl().regex("http://news\\.163\\.com/domestic").match()
            || page.getUrl().regex("http://news\\.163\\.com/shehui").match();
    if (isListPage) {
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all());
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
        return;
    }

    // Article page: extract and store the fields.
    page.putField("title", Utils.replaceHTML(page.getHtml().xpath("//h1[@id='h1title']").toString()));
    page.putField("content", Utils.replaceHTML(page.getHtml().xpath("//div[@id='endText']").toString()));
    page.putField("create", Utils.replaceHTML(page.getHtml().xpath("//div[@class=\"ep-time-soure cDGray\"]").toString()));
    page.putField("source", Utils.replaceHTML(page.getHtml().xpath("//a[@id=\"ne_article_source\"]/text()").toString()));
    page.putField("url", page.getUrl().get());

    // Read the stored values back for indexing.
    String articleTitle = (String) page.getResultItems().get("title");
    String articleContent = (String) page.getResultItems().get("content");
    String createdAt = (String) page.getResultItems().get("create");
    String articleSource = (String) page.getResultItems().get("source");
    String articleUrl = (String) page.getResultItems().get("url");

    // Create the article (the author is always empty) and index it.
    Article article = Utils.createArticle(articleTitle, articleContent, articleSource, "", articleUrl, createdAt);
    Utils.index(article);
}
 
Example #20
Source File: GithubRepoPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues GitHub repository and user links found on the page and, when
 * the page carries a repository name, stores a {@code GithubRepo} under
 * the {@code "repo"} field; pages without a name are skipped.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Follow both repository links and user/organisation links.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    GithubRepo repo = new GithubRepo();
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    repo.setAuthor(author);
    String repoName = page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString();
    repo.setName(repoName);
    String readme = page.getHtml().xpath("//div[@id='readme']/tidyText()").toString();
    repo.setReadme(readme);

    if (repo.getName() != null) {
        page.putField("repo", repo);
    } else {
        // No repository name: skip this page.
        page.setSkip(true);
    }
}
 
Example #21
Source File: ZhihuPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues Zhihu answer links found on the page and stores the question
 * title, question body and answer text as result fields; the page is
 * skipped when no title was stored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());

    String title = page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString();
    page.putField("title", title);

    String question = page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString();
    page.putField("question", question);

    String answer = page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString();
    page.putField("answer", answer);

    boolean titleMissing = page.getResultItems().get("title") == null;
    if (titleMissing) {
        // No title: skip this page.
        page.setSkip(true);
    }
}
 
Example #22
Source File: AmanzonPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Prints, for each data row of the community question table, the raw row
 * markup followed by its title text, answer count and third-column text.
 *
 * @param page the downloaded page
 */
public void process(Page page) {
    Html html = page.getHtml();
    List<String> rows = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();

    if (rows == null || rows.size() <= 1) {
        return;
    }

    // Row 0 holds the column names, so data rows start at index 1.
    for (String row : rows.subList(1, rows.size())) {
        System.out.println(row);

        // Wrap the bare <tr> in a table so it can be queried on its own.
        Html rowHtml = Html.create("<table>"+row+"</table>");

        String comment = rowHtml.xpath("//td[@class='title']//a/text()").toString();
        System.out.println(comment);

        String answerNum = rowHtml.xpath("//td[@class='num']/text()").toString();
        System.out.println(answerNum);

        String createTime = rowHtml.xpath("//td[3]/text()").toString();
        System.out.println(createTime);
    }
}
 
Example #23
Source File: GithubRepoPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues GitHub repository and user links found on the page and stores
 * the author, repository name and readme as result fields; the page is
 * skipped when no repository name is found.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
    // The user-link pattern needs the '+' quantifier; without it only one
    // character after the host was captured, e.g. "https://github.com/c".
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
    // Accept hyphens in the author segment, matching the link patterns above.
    page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
    if (page.getResultItems().get("name")==null){
        //skip this page
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}
 
Example #24
Source File: SSLCompatibilityTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that an HTTPS page can be downloaded successfully (the test
 * name indicates the target is meant to exercise TLS 1.2).
 */
@Test
public void test_tls12() throws Exception {
    Request httpsRequest = new Request("https://juejin.im/");
    Task retryingTask = Site.me().setCycleRetryTimes(5).toTask();

    Page downloaded = new HttpClientDownloader().download(httpsRequest, retryingTask);

    assertThat(downloaded.isDownloadSuccess()).isTrue();
}
 
Example #25
Source File: ZipCodePageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues one request per table cell of the country page. Each request
 * targets the cell's link at priority 0 and carries the cell text in the
 * {@code "province"} extra.
 *
 * @param page the downloaded country page
 */
private void processCountry(Page page) {
    List<String> provinceCells = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
    for (String cell : provinceCells) {
        String href = xpath("//@href").select(cell);
        String provinceName = xpath("/text()").select(cell);
        page.addTargetRequest(new Request(href).setPriority(0).putExtra("province", provinceName));
    }
}
 
Example #26
Source File: MockGithubDownloader.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a canned page instead of performing a real download: the raw
 * text is read from the classpath resource {@code /html/mock-github.html}
 * and the request and URL are fixed to the webmagic GitHub repository.
 *
 * @param request ignored
 * @param task    ignored
 * @return the mock page; its raw text is left unset if the resource
 *         cannot be read
 */
@Override
public Page download(Request request, Task task) {
    Page page = new Page();
    InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html");
    try {
        page.setRawText(IOUtils.toString(resourceAsStream));
    } catch (IOException e) {
        // Best effort for a mock: report the problem and return the page without text.
        e.printStackTrace();
    } finally {
        // The stream was previously never closed.
        IOUtils.closeQuietly(resourceAsStream);
    }
    page.setRequest(new Request("https://github.com/code4craft/webmagic"));
    page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
    return page;
}
 
Example #27
Source File: GithubRepoPageProcessor.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
/**
 * Core hook for the crawler's extraction logic: stores the author,
 * repository name and readme of the current page as result fields, skips
 * the page when no repository name is found, and queues further
 * repository links.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Part two: define how page information is extracted and stored.
    String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
    page.putField("author", author);
    String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
    page.putField("name", name);

    if (name == null) {
        //skip this page
        page.setSkip(true);
    }
    // The readme XPath is evaluated once; the previous extra evaluation
    // into an unused local has been removed.
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

    // Part three: discover follow-up URLs on the page to crawl.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
}
 
Example #28
Source File: HttpClientDownloader.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads the request with the HTTP client configured for the task's
 * site.
 *
 * <p>The returned page starts as {@code Page.fail()} and is replaced by
 * the handled response on success. On an {@link IOException} the failure
 * page is returned after {@code onError} has been invoked. In every case
 * the response entity is consumed and a borrowed proxy is handed back to
 * the provider.
 *
 * @param request the request to download
 * @param task    the running task; it and its site must not be {@code null}
 * @return the downloaded page, or the failure page on an I/O error
 * @throws NullPointerException if the task or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }

    CloseableHttpResponse response = null;
    CloseableHttpClient client = getHttpClient(task.getSite());
    Proxy proxy = (proxyProvider == null) ? null : proxyProvider.getProxy(task);
    HttpClientRequestContext context = httpUriRequestConverter.convert(request, task.getSite(), proxy);
    Page page = Page.fail();

    try {
        response = client.execute(context.getHttpUriRequest(), context.getHttpClientContext());

        // A charset set on the request wins over the site's charset.
        String charset = request.getCharset();
        if (charset == null) {
            charset = task.getSite().getCharset();
        }

        page = handleResponse(request, charset, response, task);
        onSuccess(request);
        logger.info("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
    } finally {
        if (response != null) {
            //ensure the connection is released back to pool
            EntityUtils.consumeQuietly(response.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, page, task);
        }
    }
    return page;
}
 
Example #29
Source File: HuabanProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Queues every huaban.com link found on the page. Pages whose URL
 * contains "pins" store their image source under {@code "img"}; all
 * other pages have their result items marked as skipped.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());

    boolean isPinPage = page.getUrl().toString().contains("pins");
    if (!isPinPage) {
        page.getResultItems().setSkip(true);
        return;
    }

    String imageSrc = page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString();
    page.putField("img", imageSrc);
}
 
Example #30
Source File: DocumentTest.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
/**
 * Hands course detail pages (URLs of the form
 * {@code http://mooc.chaoxing.com/course/<digits>.html}) to
 * {@code crawCourseInfo}; any other URL is ignored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
	boolean isCoursePage = page.getUrl()
			.regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html")
			.toString() != null;
	if (!isCoursePage) {
		return;
	}
	System.out.println("第二层");
	crawCourseInfo(page);
}