Java Code Examples for us.codecraft.webmagic.Task

The following examples show how to use us.codecraft.webmagic.Task. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: blog-hunter   Source File: HttpClientDownloader.java    License: MIT License 6 votes vote down vote up
/**
 * Converts a raw HTTP response into a {@link Page}.
 *
 * <p>The body is always stored as bytes. Unless the request is flagged as binary content,
 * the body is also decoded to text using {@code charset} when one is supplied, otherwise
 * the value returned by {@code getHtmlCharset(contentType, bytes)}.
 *
 * @param request      request that produced the response
 * @param charset      charset name to decode with, or {@code null} to have it detected
 * @param httpResponse response whose entity, status line and headers are read
 * @param task         not used by this implementation
 * @return a page flagged as successfully downloaded
 * @throws IOException if the body cannot be read or the charset name is unsupported
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    String contentType = "";
    if (httpResponse.getEntity().getContentType() != null) {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page result = new Page();
    result.setBytes(body);
    boolean textual = !request.isBinaryContent();
    if (textual) {
        // An explicit charset wins; detection is only a fallback.
        String effectiveCharset = (charset != null) ? charset : getHtmlCharset(contentType, body);
        result.setCharset(effectiveCharset);
        result.setRawText(new String(body, effectiveCharset));
    }
    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);
    if (responseHeader) {
        // Header capture is opt-in through the responseHeader flag.
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}
 
Example 2
Source Project: webmagic   Source File: RedisPriorityScheduler.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Pops the next URL for {@code task} from Redis.
 *
 * <p>Lookup order: the head of the "plus priority" sorted set, then the head of the
 * no-priority list, then the head of the "minus priority" sorted set.
 *
 * @param jedis connection used for all Redis calls
 * @param task  task whose keys are queried
 * @return the popped URL; when nothing is found, whatever {@code lpop} returned
 */
private String getRequest(Jedis jedis, Task task)
{
    Set<String> highest = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
    if(!highest.isEmpty())
    {
        String head = highest.iterator().next();
        jedis.zrem(getZsetPlusPriorityKey(task), head);
        return head;
    }

    String queued = jedis.lpop(getQueueNoPriorityKey(task));
    if(!StringUtils.isBlank(queued))
    {
        return queued;
    }

    Set<String> lowest = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
    if(lowest.isEmpty())
    {
        // Nothing anywhere: hand back the blank/null value from the list pop.
        return queued;
    }
    String head = lowest.iterator().next();
    jedis.zrem(getZsetMinusPriorityKey(task), head);
    return head;
}
 
Example 3
Source Project: webmagic   Source File: RedisPriorityScheduler.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Takes the next request for {@code task} from Redis.
 *
 * @param task task whose queues are polled
 * @return the next request, or {@code null} when no URL is available
 */
@Override
public synchronized Request poll(Task task)
{
    Jedis connection = pool.getResource();
    try
    {
        String nextUrl = getRequest(connection, task);
        return StringUtils.isBlank(nextUrl) ? null : getExtrasInItem(connection, nextUrl, task);
    }
    finally
    {
        // Always hand the connection back, whether or not a URL was found.
        pool.returnResource(connection);
    }
}
 
Example 4
Source Project: webmagic   Source File: SeleniumDownloaderTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Manual smoke test: downloads a Baidu Wenku document through {@link SeleniumDownloader}
 * and prints the text of every {@code div.inner} element with tags and non-breaking-space
 * entities stripped.
 *
 * <p>Marked {@code @Ignore}; presumably because it needs a local Chrome driver and network
 * access (confirm before enabling).
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // First strip markup, then drop "&nbsp;" entities (the original literal was misspelled
    // "&nsbp;" and therefore never matched anything).
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nbsp;","").all());
}
 
Example 5
Source Project: webmagic   Source File: ScriptConsole.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a script-driven spider from the command-line parameters and runs it.
 *
 * <p>If no start URL was supplied, prints a usage message and exits with status -1.
 *
 * @param params parsed command-line options (language, script file, threads, sleep time, URLs)
 */
private static void startSpider(Params params) {
    ScriptProcessor processor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    processor.getSite().setSleepTime(params.getSleepTime());
    processor.getSite().setRetryTimes(3);
    processor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));

    // Replace the default pipelines with one that discards every result.
    Pipeline discardResults = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {

        }
    };
    Spider crawler = Spider.create(processor).thread(params.getThread());
    crawler.clearPipeline().addPipeline(discardResults);

    boolean noUrls = params.getUrls() == null || params.getUrls().isEmpty();
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }
    for (String startUrl : params.getUrls()) {
        crawler.addUrl(startUrl);
    }
    crawler.run();
}
 
Example 6
Source Project: webmagic   Source File: ModelPipeline.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Dispatches extracted model objects to the pipeline registered for their class.
 *
 * <p>For each registered class, the result stored under the class's canonical name is
 * looked up. A class annotated with {@code @ExtractBy(multi = true)} is treated as holding
 * a list whose elements are processed one by one; otherwise the object is processed as is.
 *
 * @param resultItems extraction results keyed by canonical class name
 * @param task        task passed through to each model pipeline
 */
@Override
public void process(ResultItems resultItems, Task task) {
    for (Map.Entry<Class, PageModelPipeline> registration : pageModelPipelines.entrySet()) {
        Class modelClass = registration.getKey();
        Object extracted = resultItems.get(modelClass.getCanonicalName());
        if (extracted == null) {
            continue;
        }
        PageModelPipeline modelPipeline = registration.getValue();
        Annotation extractBy = modelClass.getAnnotation(ExtractBy.class);
        boolean multi = extractBy != null && ((ExtractBy) extractBy).multi();
        if (multi) {
            for (Object element : (List<Object>) extracted) {
                modelPipeline.process(element, task);
            }
        } else {
            modelPipeline.process(extracted, task);
        }
    }
}
 
Example 7
Source Project: webmagic   Source File: OneFilePipeline.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Appends one page's results to the shared writer and flushes it.
 *
 * <p>Writes the request URL first, then every result field. Iterable values are written
 * as a header line followed by one element per line; other values go on a single line.
 *
 * @param resultItems results to write
 * @param task        not used by this implementation
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String name = field.getKey();
        Object value = field.getValue();
        if (!(value instanceof Iterable)) {
            printWriter.println(name + ":\t" + value);
            continue;
        }
        printWriter.println(name + ":");
        for (Object element : (Iterable) value) {
            printWriter.println(element);
        }
    }
    printWriter.flush();
}
 
Example 8
Source Project: webmagic   Source File: HttpClientDownloader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Turns an HTTP response into a {@link Page} carrying the body bytes, status code and,
 * for non-binary requests, the decoded text.
 *
 * @param request      originating request; also decides whether the body is binary
 * @param charset      charset name for decoding, or {@code null} to fall back to
 *                     {@code getHtmlCharset(contentType, bytes)}
 * @param httpResponse response to read the entity, status line and headers from
 * @param task         not used by this implementation
 * @return the populated page, flagged as a successful download
 * @throws IOException if reading the entity fails or the charset name is unsupported
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final byte[] content = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    final String contentType;
    if (httpResponse.getEntity().getContentType() == null) {
        contentType = "";
    } else {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    final Page downloaded = new Page();
    downloaded.setBytes(content);
    if (!request.isBinaryContent()) {
        final String decodeWith;
        if (charset == null) {
            decodeWith = getHtmlCharset(contentType, content);
        } else {
            decodeWith = charset;
        }
        downloaded.setCharset(decodeWith);
        downloaded.setRawText(new String(content, decodeWith));
    }
    downloaded.setUrl(new PlainText(request.getUrl()));
    downloaded.setRequest(request);
    downloaded.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    downloaded.setDownloadSuccess(true);
    // Headers are only copied when the responseHeader flag is enabled.
    if (responseHeader) {
        downloaded.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return downloaded;
}
 
Example 9
Source Project: webmagic   Source File: FilePageModelPipeline.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes the reflective string form of {@code o} to an {@code .html} file under
 * {@code <path>/<task UUID>/}.
 *
 * <p>The file name is the object's key when it implements {@link HasKey}, otherwise the
 * MD5 hex digest of its string form. I/O failures are logged and not propagated.
 *
 * @param o    object to persist
 * @param task task whose UUID names the sub-directory
 */
@Override
public void process(Object o, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    // Render once: the text is the file content and, for keyless objects, also the
    // input of the file-name digest.
    String content = ToStringBuilder.reflectionToString(o);
    String filename;
    if (o instanceof HasKey) {
        filename = path + ((HasKey) o).key() + ".html";
    } else {
        filename = path + DigestUtils.md5Hex(content) + ".html";
    }
    // try-with-resources guarantees the writer is closed even if writing fails.
    try (PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)))) {
        printWriter.write(content);
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
 
Example 10
Source Project: webmagic   Source File: FilePipelineTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares the shared fixtures: a {@code ResultItems} holding one "content" field and a
 * request, and a {@code Task} stub that returns a fresh random UUID on every call and has
 * no site.
 */
@BeforeClass
public static void before() {
    task = new Task() {
        @Override
        public String getUUID() {
            return UUID.randomUUID().toString();
        }

        @Override
        public Site getSite() {
            return null;
        }
    };

    resultItems = new ResultItems();
    resultItems.setRequest(new Request("http://www.baidu.com"));
    resultItems.put("content", "webmagic 爬虫工具");
}
 
Example 11
Source Project: spring-boot-demo   Source File: MyPipeline.java    License: MIT License 5 votes vote down vote up
/**
 * Logs the page URL followed by every extracted field as {@code key:<TAB>value}.
 *
 * @param resultItems results of the page that was just processed
 * @param task        not used by this implementation
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String pageUrl = resultItems.getRequest().getUrl();
    log.info("get page: " + pageUrl);
    resultItems.getAll().forEach((key, value) -> log.info(key + ":\t" + value));
}
 
Example 12
Source Project: feiqu-opensource   Source File: TopicInfoPipeline.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Stores a crawled V2EX topic and its replies.
 *
 * <p>Topics without a title are ignored. When at least 50 topics were created in the last
 * five hours the spider is asked to stop. A topic with the same title and author that is
 * already stored is skipped. Replies longer than 500 characters are cut to 480.
 *
 * <p>NOTE(review): after {@code stop()} is called the current topic is still processed;
 * confirm that this is intended.
 *
 * @param v2exDTO extracted topic
 * @param task    running task; cast to {@code OOSpider} when the spider must be stopped
 */
@Override
public void process(V2exDTO v2exDTO, Task task) {
    if (StringUtils.isEmpty(v2exDTO.getTitle())) {
        return;
    }

    Date now = new Date();
    FqTopicExample example = new FqTopicExample();

    // Throttle: count topics created within the last five hours.
    example.createCriteria().andGmtCreateGreaterThan(DateUtil.offsetHour(now, -5));
    long recentTopics = fqTopicMapper.countByExample(example);
    if (recentTopics >= 50) {
        ((OOSpider) task).stop();
    }

    // De-duplicate on title + author.
    example.clear();
    example.createCriteria().andTitleEqualTo(v2exDTO.getTitle()).andAuthorEqualTo(v2exDTO.getAuthor());
    long existingTopics = fqTopicMapper.countByExample(example);
    if (existingTopics > 0) {
        return;
    }

    FqTopic topic = DTO2DO(v2exDTO);
    topic.setContent(EmojiUtils.toAliases(topic.getContent()));
    fqTopicMapper.insert(topic);

    if (!CollectionUtil.isNotEmpty(v2exDTO.getReply())) {
        return;
    }
    for (String reply : v2exDTO.getReply()) {
        if (StringUtils.isEmpty(reply)) {
            continue;
        }
        String text = reply.length() > 500 ? reply.substring(0, 480) : reply;
        text = EmojiUtils.toAliases(text);

        FqTopicReply replyRow = new FqTopicReply();
        replyRow.setGmtCreate(now);
        replyRow.setContent(text);
        replyRow.setTopicId(topic.getId());
        fqTopicReplyMapper.insert(replyRow);
    }
}
 
Example 13
Source Project: mogu_blog_v2   Source File: BlogPipeline.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Persists a crawled article as a {@code BlogSpider} record.
 *
 * <p>The title and content are printed to standard output; nothing is stored unless both
 * are non-empty. Any exception raised while building or inserting the record has its
 * stack trace printed and is otherwise ignored.
 *
 * @param res  extraction results; reads the "title" and "content" fields
 * @param task not used by this implementation
 */
@Override
public void process(ResultItems res, Task task) {
    // Fetch the title and content
    String title = res.get("title");
    String content = res.get("content");
    System.out.println("title: " + title);
    System.out.println("content: " + content);

    if (StringUtils.isEmpty(title) || StringUtils.isEmpty(content)) {
        return;
    }

    try {
        BlogSpider record = new BlogSpider();
        record.setUid(idWorker.nextId() + "");
        record.setTitle(title);
        // The title doubles as the summary.
        record.setSummary(title);
        record.setContent(content);
        record.setTagUid("5c4c541e600ff422ccb371ee788f59d6");
        record.setClickCount(0);
        record.setCollectCount(0);
        record.setStatus(EStatus.ENABLE);
        record.setAdminUid("1f01cd1d2f474743b241d74008b12333");
        record.setAuthor("陌溪");
        record.setArticlesPart("蘑菇博客");
        record.setBlogSortUid("6a1c7a50c0e7b8e8657949bf02d5d0ca");
        record.setLevel(0);
        record.setIsPublish(EPublish.PUBLISH);
        record.setSort(0);
        record.insert();

        // Download to local disk (disabled)
        //DownloadUtil.download("http://pic.netbian.com"+fileUrl,fileName,SAVE_PATH);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example 14
Source Project: blog-hunter   Source File: HttpClientDownloader.java    License: MIT License 5 votes vote down vote up
/**
 * Downloads {@code request} with the HTTP client configured for the task's site.
 *
 * <p>An {@link IOException} during the download is logged and reported through
 * {@code onError}; the failed page from {@code Page.fail()} is returned in that case.
 * The response entity is always consumed and a borrowed proxy is always returned.
 *
 * @param request request to download
 * @param task    task supplying the site configuration
 * @return the downloaded page, or the failed page when an I/O error occurred
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }
    CloseableHttpClient client = getHttpClient(task.getSite());
    Proxy proxy = null;
    if (proxyProvider != null) {
        proxy = proxyProvider.getProxy(task);
    }
    HttpClientRequestContext context = httpUriRequestConverter.convert(request, task.getSite(), proxy);

    CloseableHttpResponse response = null;
    Page page = Page.fail();
    try {
        response = client.execute(context.getHttpUriRequest(), context.getHttpClientContext());
        // A charset set on the request overrides the site-wide one.
        String charset = request.getCharset();
        if (charset == null) {
            charset = task.getSite().getCharset();
        }
        page = handleResponse(request, charset, response, task);
        onSuccess(request);
        logger.debug("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
    } finally {
        if (response != null) {
            // Consuming the entity releases the connection back to the pool.
            EntityUtils.consumeQuietly(response.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, page, task);
        }
    }
    return page;
}
 
Example 15
Source Project: blog-hunter   Source File: BlockingQueueScheduler.java    License: MIT License 5 votes vote down vote up
/**
 * Queues a non-duplicate request, honouring the remaining URL budget.
 *
 * <p>A budget of {@code -1} means the exit mode is not URL_COUNT and every request is
 * queued. Otherwise each queued request consumes one unit of the budget, and once the
 * budget is zero or below no further requests are accepted.
 *
 * @param request request to enqueue
 * @param task    not used by this implementation
 */
@Override
public void pushWhenNoDuplicate(Request request, Task task) {
    boolean budgeted = realUrlCount != -1;
    if (budgeted) {
        if (realUrlCount <= 0) {
            // Budget exhausted: this scheduler no longer accepts new URLs.
            return;
        }
        realUrlCount--;
    }
    this.queue.add(request);
}
 
Example 16
Source Project: webmagic   Source File: DelayQueueScheduler.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Queues {@code request} unless its URL has been pushed before.
 *
 * @param request request to enqueue
 * @param task    not used by this implementation
 */
@Override
public synchronized void push(Request request, Task task) {
    boolean firstTimeSeen = urls.add(request.getUrl());
    if (!firstTimeSeen) {
        return;
    }
    queue.add(new RequestWrapper(request));
}
 
Example 17
Source Project: webmagic   Source File: FileCacheQueueScheduler.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next queued request, initialising the scheduler on first use.
 *
 * <p>Each call increments the cursor and writes the new value to the cursor file before
 * the queue is polled.
 *
 * <p>NOTE(review): the cursor advances even when the queue is empty and {@code null} is
 * returned; confirm that this is intended.
 *
 * @param task task used for initialisation
 * @return the next request, or {@code null} if the queue is empty
 */
@Override
public synchronized Request poll(Task task) {
    boolean initialised = inited.get();
    if (!initialised) {
        init(task);
    }
    int position = cursor.incrementAndGet();
    fileCursorWriter.println(position);
    return queue.poll();
}
 
Example 18
/**
 * Appends the page, serialised as one JSON line, to
 * {@code gather_platform_data/<spider UUID>.json}.
 *
 * <p>The file is written as UTF-8 regardless of the platform default charset. I/O errors
 * are logged together with their stack trace and not propagated.
 *
 * @param resultItems results converted to a {@code Webpage} before serialisation
 * @param task        not used by this implementation
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    File target = new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json");
    try {
        // Explicit encoding: the three-argument overload uses the platform default
        // charset, which makes the output differ between machines.
        FileUtils.writeStringToFile(target, gson.toJson(webpage) + "\n", "UTF-8", true);
    } catch (IOException e) {
        // The trailing throwable keeps the stack trace in the log.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
 
Example 19
Source Project: webmagic   Source File: GithubRepoTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs the {@code GithubRepo} model against a mocked download of the webmagic repository
 * page and checks the extracted star and fork counts.
 */
@Test
public void test() {
    final String repoUrl = "https://github.com/code4craft/webmagic";
    PageModelPipeline<GithubRepo> assertingPipeline = new PageModelPipeline<GithubRepo>() {
        @Override
        public void process(GithubRepo o, Task task) {
            assertThat(o.getStar()).isEqualTo(86);
            assertThat(o.getFork()).isEqualTo(70);
        }
    };
    OOSpider.create(Site.me().setSleepTime(0), assertingPipeline, GithubRepo.class)
            .addUrl(repoUrl)
            .setDownloader(new MockGithubDownloader())
            .test(repoUrl);
}
 
Example 20
Source Project: webmagic   Source File: DuplicateRemovedSchedulerTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that pushing a GET request consults the duplicate remover exactly once.
 */
@Test
public void test_duplicate_removed_for_get_request() throws Exception {
    // Arrange: scheduler backed by a mocked remover.
    DuplicateRemover remover = Mockito.mock(DuplicateRemover.class);
    duplicateRemovedScheduler.setDuplicateRemover(remover);
    Request getRequest = new Request("https://www.google.com/");
    getRequest.setMethod(HttpConstant.Method.GET);

    // Act
    duplicateRemovedScheduler.push(getRequest, null);

    // Assert
    verify(remover, times(1)).isDuplicate(any(Request.class), any(Task.class));
}
 
Example 21
Source Project: Gather-Platform   Source File: ESPipeline.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Indexes all result fields as one JSON document.
 *
 * <p>When the results carry a non-blank "id" field it is used as the document id;
 * otherwise the id is left to the index. A response whose result is not {@code CREATED}
 * is logged as an error. I/O errors while building the document are logged with their
 * stack trace and not propagated.
 *
 * @param resultItems fields to index
 * @param task        not used by this implementation
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        XContentBuilder xContentBuilder = jsonBuilder().startObject();
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            xContentBuilder.field(entry.getKey(), entry.getValue());
        }
        String json = xContentBuilder.endObject().string();

        // Read the id once and reuse it for both the check and the request.
        String id = resultItems.get("id");
        IndexResponse response;
        if (StringUtils.isNotBlank(id)) {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME, id)
                    .setSource(json).get();
        } else {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setSource(json).get();
        }
        if (response.getResult() != IndexResponse.Result.CREATED) {
            LOG.error("索引失败,可能重复创建,resultItem:" + resultItems);
        }
    } catch (IOException e) {
        // Log through the logger with the cause attached instead of printStackTrace().
        LOG.error("索引出错," + e.getLocalizedMessage(), e);
    }
}
 
Example 22
/**
 * Delegates to the parent implementation and records {@link IllegalArgumentException}s.
 *
 * <p>Such an exception is written to the exception log, reported through {@code onError},
 * logged as a warning with the request URL, and then rethrown unchanged.
 *
 * @return the page produced by the parent implementation
 * @throws IOException as thrown by the parent implementation
 */
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    try {
        return super.handleResponse(request, charset, httpResponse, task);
    } catch (IllegalArgumentException e) {
        writeExceptionLog(e, request);
        onError(request);
        LOG.warn("URL为:{} ,{}", request.getUrl(), e.getLocalizedMessage());
        throw e;
    }
}
 
Example 23
Source Project: webmagic   Source File: Kr36NewsModel.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Benchmark entry point: crawls 36kr with 20 threads, discards every extracted model,
 * and registers the spider with the {@code SpiderMonitor}.
 */
public static void main(String[] args) throws IOException, JMException {
    // Just for benchmark: extracted objects are thrown away.
    PageModelPipeline discardResults = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {

        }
    };
    Spider benchmarkSpider = OOSpider.create(Site.me().setSleepTime(0), discardResults, Kr36NewsModel.class)
            .thread(20)
            .addUrl("http://www.36kr.com/");
    benchmarkSpider.start();
    SpiderMonitor.instance().register(benchmarkSpider);
}
 
Example 24
Source Project: webmagic   Source File: PageModelCollectorPipeline.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Forwards the extracted model object(s) for {@code clazz} to the class pipeline.
 *
 * <p>The result is looked up under the class's canonical name. If the class is annotated
 * with {@code @ExtractBy(multi = true)} the result is treated as a list and each element
 * is forwarded separately; otherwise the object is forwarded as is.
 *
 * @param resultItems extraction results keyed by canonical class name
 * @param task        task passed through to the class pipeline
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    Object extracted = resultItems.get(clazz.getCanonicalName());
    if (extracted == null) {
        return;
    }
    Annotation extractBy = clazz.getAnnotation(ExtractBy.class);
    boolean multi = extractBy != null && ((ExtractBy) extractBy).multi();
    if (!multi) {
        classPipeline.process((T) extracted, task);
        return;
    }
    for (Object element : (List<Object>) extracted) {
        classPipeline.process((T) element, task);
    }
}
 
Example 25
Source Project: spider   Source File: JsonFilePipeline.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Serialises the page to JSON and appends it as one line to
 * {@code gather_platform_data/<spider UUID>.json}.
 *
 * <p>Output is always encoded as UTF-8. A failed write is logged, including the stack
 * trace, and does not interrupt the crawl.
 *
 * @param resultItems results converted to a {@code Webpage} before serialisation
 * @param task        not used by this implementation
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    String line = gson.toJson(webpage) + "\n";
    try {
        // Pass the encoding explicitly; without it the platform default charset is used.
        FileUtils.writeStringToFile(
                new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json"),
                line,
                "UTF-8",
                true);
    } catch (IOException e) {
        // Supplying the exception as the last argument preserves its stack trace.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
 
Example 26
Source Project: webmagic   Source File: RedisPriorityScheduler.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Stores the JSON form of {@code request} in the task's item hash, keyed by the SHA hex
 * digest of its URL. Requests without extras are not stored.
 *
 * @param jedis   connection used for the write
 * @param request request to serialise
 * @param task    task whose item key is used
 */
private void setExtrasInItem(Jedis jedis,Request request, Task task)
{
    if(request.getExtras() == null)
    {
        return;
    }
    String urlDigest = DigestUtils.shaHex(request.getUrl());
    String serialized = JSON.toJSONString(request);
    jedis.hset(getItemKey(task), urlDigest, serialized);
}
 
Example 27
Source Project: spider   Source File: CommonWebpagePipeline.java    License: GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tells whether {@code request} has already been crawled.
 *
 * <p>A per-task in-memory set of URLs is consulted first. A URL already in that set is a
 * duplicate. Otherwise the URL is added to the set and the index is queried for a
 * document whose id is the MD5 hash of the URL.
 *
 * @param request request to check
 * @param task    task whose UUID selects the in-memory URL set
 * @return {@code true} if the URL was seen in this run or already exists in the index
 */
@Override
public boolean isDuplicate(Request request, Task task) {
    // Lazily create the cache of URLs collected for this task.
    Set<String> seenThisRun = urls.computeIfAbsent(task.getUUID(), k -> Sets.newConcurrentHashSet());
    if (!seenThisRun.add(request.getUrl())) {
        // Already fetched during the current lifecycle: duplicate, no index lookup needed.
        return true;
    }
    // Not seen in the current lifecycle: check the index as well.
    String documentId = Hashing.md5().hashString(request.getUrl(), Charset.forName("utf-8")).toString();
    GetResponse response = client.prepareGet(INDEX_NAME, TYPE_NAME, documentId).get();
    return response.isExists();
}
 
Example 28
/**
 * Wraps the parent's response handling with reporting for invalid-argument failures.
 *
 * <p>An {@link IllegalArgumentException} from the parent is written to the exception log,
 * signalled through {@code onError}, logged as a warning, and then propagated.
 *
 * @return the page built by the parent implementation
 * @throws IOException as thrown by the parent implementation
 */
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final Page handled;
    try {
        handled = super.handleResponse(request, charset, httpResponse, task);
    } catch (IllegalArgumentException invalid) {
        writeExceptionLog(invalid, request);
        onError(request);
        LOG.warn("URL为:{} ,{}", request.getUrl(), invalid.getLocalizedMessage());
        // Reporting only: the caller still sees the original exception.
        throw invalid;
    }
    return handled;
}
 
Example 29
Source Project: webmagic   Source File: HttpClientDownloader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Fetches {@code request} using the HTTP client for the task's site, optionally through
 * a proxy obtained from the proxy provider.
 *
 * <p>On an {@link IOException} the error is logged, {@code onError} is invoked, and the
 * page created by {@code Page.fail()} is returned. In every case the response entity is
 * consumed and the proxy, if one was taken, is handed back with the resulting page.
 *
 * @param request request to download
 * @param task    task supplying the site configuration
 * @return the downloaded page, or the failed page after an I/O error
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }
    CloseableHttpClient client = getHttpClient(task.getSite());
    Proxy proxy;
    if (proxyProvider == null) {
        proxy = null;
    } else {
        proxy = proxyProvider.getProxy(task);
    }
    HttpClientRequestContext context = httpUriRequestConverter.convert(request, task.getSite(), proxy);
    CloseableHttpResponse response = null;
    Page result = Page.fail();
    try {
        response = client.execute(context.getHttpUriRequest(), context.getHttpClientContext());
        // Prefer the request-level charset; fall back to the site default.
        String charset = request.getCharset() == null ? task.getSite().getCharset() : request.getCharset();
        result = handleResponse(request, charset, response, task);
        onSuccess(request);
        logger.info("downloading page success {}", request.getUrl());
        return result;
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
        return result;
    } finally {
        if (response != null) {
            // Make sure the connection goes back to the pool.
            EntityUtils.consumeQuietly(response.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, result, task);
        }
    }
}
 
Example 30
Source Project: webmagic   Source File: OschinaBlog.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Crawls an OSChina blog with 10 threads using the {@code OschinaBlog} model and a
 * pipeline that discards every extracted object.
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
            .setSleepTime(0)
            .setRetryTimes(3);
    // Extracted objects are intentionally ignored.
    PageModelPipeline discardResults = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {

        }
    };
    OOSpider.create(site, discardResults, OschinaBlog.class)
            .thread(10)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .run();
}