us.codecraft.webmagic.Task Java Examples

The following examples show how to use us.codecraft.webmagic.Task. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: RedisPriorityScheduler.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Pops the next URL for the given task, consulting three Redis structures in
 * order: the positive-priority sorted set, the no-priority list, and finally
 * the negative-priority sorted set.
 *
 * @param jedis connection used for every Redis call
 * @param task  task whose keys are consulted
 * @return the popped URL; when nothing is queued, whatever {@code lpop}
 *         returned (null or a blank string)
 */
private String getRequest(Jedis jedis, Task task)
{
    // The top-ranked member of the positive-priority set wins outright.
    Set<String> plusCandidates = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
    if(!plusCandidates.isEmpty())
    {
        String head = plusCandidates.iterator().next();
        jedis.zrem(getZsetPlusPriorityKey(task), head);
        return head;
    }

    // Next, the plain FIFO list of requests without a priority.
    String queued = jedis.lpop(getQueueNoPriorityKey(task));
    if(!StringUtils.isBlank(queued))
    {
        return queued;
    }

    // Last resort: the top-ranked member of the negative-priority set.
    Set<String> minusCandidates = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
    if(minusCandidates.isEmpty())
    {
        return queued;
    }
    String tail = minusCandidates.iterator().next();
    jedis.zrem(getZsetMinusPriorityKey(task), tail);
    return tail;
}
 
Example #2
Source File: RedisPriorityScheduler.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Takes the next request for the task from Redis.
 *
 * @param task task whose queues are polled
 * @return the result of {@code getExtrasInItem} for the popped URL, or
 *         {@code null} when no non-blank URL was available
 */
@Override
public synchronized Request poll(Task task)
{
    Jedis jedis = pool.getResource();
    try
    {
        String nextUrl = getRequest(jedis, task);
        return StringUtils.isBlank(nextUrl) ? null : getExtrasInItem(jedis, nextUrl, task);
    }
    finally
    {
        // Always hand the connection back, even when a Redis call fails.
        pool.returnResource(jedis);
    }
}
 
Example #3
Source File: SeleniumDownloaderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Manual (ignored) check: downloads a Baidu Wenku page through
 * {@code SeleniumDownloader} and prints the text of every {@code div.inner}
 * element with markup tags and {@code &nbsp;} entities removed.
 */
@Ignore
@Test
public void testBaiduWenku() {
	SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
	seleniumDownloader.setSleepTime(10000);
	Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
		@Override
		public String getUUID() {
			return "huaban.com";
		}

		@Override
		public Site getSite() {
			return Site.me();
		}
	});
	// The entity name was previously misspelled as "&nsbp;", so non-breaking
	// spaces were never stripped from the printed text.
	System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nbsp;","").all());
}
 
Example #4
Source File: HttpClientDownloader.java    From blog-hunter with MIT License 6 votes vote down vote up
/**
 * Converts an HTTP response into a {@code Page}.
 *
 * <p>The body bytes are always stored on the page. Unless the request is
 * flagged as binary content, the bytes are also decoded to text using
 * {@code charset}, or a charset detected via {@code getHtmlCharset} when
 * {@code charset} is {@code null}.
 *
 * @param request      request that produced the response
 * @param charset      charset to decode with, or {@code null} to detect it
 * @param httpResponse response to convert
 * @param task         task the request belongs to (not used here)
 * @return a page marked as successfully downloaded
 * @throws IOException if the body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // getEntity() returns null when the response carries no body; treat that as
    // an empty payload instead of failing with a NullPointerException.
    boolean hasEntity = httpResponse.getEntity() != null;
    byte[] bytes = hasEntity ? IOUtils.toByteArray(httpResponse.getEntity().getContent()) : new byte[0];
    String contentType = !hasEntity || httpResponse.getEntity().getContentType() == null
            ? "" : httpResponse.getEntity().getContentType().getValue();
    Page page = new Page();
    page.setBytes(bytes);
    if (!request.isBinaryContent()){
        if (charset == null) {
            charset = getHtmlCharset(contentType, bytes);
        }
        page.setCharset(charset);
        page.setRawText(new String(bytes, charset));
    }
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
 
Example #5
Source File: ScriptConsole.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a script-driven spider from the console parameters and runs it.
 * Exits the JVM with status -1 when no start URL was supplied.
 *
 * @param params parsed command-line parameters
 */
private static void startSpider(Params params) {
    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));

    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    // Replace the default pipelines with one that discards every result.
    Pipeline discardingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            // intentionally empty
        }
    };
    spider.clearPipeline().addPipeline(discardingPipeline);

    boolean noUrls = params.getUrls() == null || params.getUrls().isEmpty();
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }
    for (String startUrl : params.getUrls()) {
        spider.addUrl(startUrl);
    }
    spider.run();
}
 
Example #6
Source File: FilePipelineTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares the shared fixtures: a {@code ResultItems} holding one "content"
 * entry and a request, plus a {@code Task} stub whose UUID is random on every
 * call and whose site is {@code null}.
 */
@BeforeClass
public static void before() {
    resultItems = new ResultItems();
    resultItems.put("content", "webmagic 爬虫工具");
    resultItems.setRequest(new Request("http://www.baidu.com"));

    task = new Task() {
        @Override
        public Site getSite() {
            return null;
        }

        @Override
        public String getUUID() {
            return UUID.randomUUID().toString();
        }
    };
}
 
Example #7
Source File: ModelPipeline.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Dispatches extracted model objects to their registered pipelines.
 *
 * <p>For each registered model class, the value stored in {@code resultItems}
 * under the class's canonical name is looked up. If the class carries an
 * {@code ExtractBy} annotation with {@code multi()} set, the value is treated
 * as a list and each element is processed; otherwise the value is processed
 * as a single object.
 *
 * @param resultItems extraction results keyed by canonical class name
 * @param task        task passed through to the model pipelines
 */
@Override
public void process(ResultItems resultItems, Task task) {
    for (Map.Entry<Class, PageModelPipeline> registration : pageModelPipelines.entrySet()) {
        Class modelClass = registration.getKey();
        Object extracted = resultItems.get(modelClass.getCanonicalName());
        if (extracted == null) {
            continue;
        }
        Annotation annotation = modelClass.getAnnotation(ExtractBy.class);
        boolean multiple = annotation != null && ((ExtractBy) annotation).multi();
        if (multiple) {
            for (Object element : (List<Object>) extracted) {
                registration.getValue().process(element, task);
            }
        } else {
            registration.getValue().process(extracted, task);
        }
    }
}
 
Example #8
Source File: FilePageModelPipeline.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Writes the reflective string form of {@code o} to an {@code .html} file
 * under {@code <path>/<task UUID>/}.
 *
 * <p>The file name is the object's {@code key()} when it implements
 * {@code HasKey}, otherwise the MD5 hex digest of its reflective string form.
 * I/O failures are logged as warnings and not propagated.
 *
 * @param o    object to persist
 * @param task task whose UUID selects the output directory
 */
@Override
public void process(Object o, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    // Computed once: it serves as the file content and, for non-HasKey objects, the name source.
    String content = ToStringBuilder.reflectionToString(o);
    try {
        String filename;
        if (o instanceof HasKey) {
            filename = path + ((HasKey) o).key() + ".html";
        } else {
            filename = path + DigestUtils.md5Hex(content) + ".html";
        }
        PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)));
        try {
            printWriter.write(content);
            // PrintWriter never throws on write; checkError() flushes and reports
            // whether any write failed so the failure is not lost silently.
            if (printWriter.checkError()) {
                logger.warn("write file error: " + filename);
            }
        } finally {
            // Release the file handle even if writing throws.
            printWriter.close();
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
 
Example #9
Source File: OneFilePipeline.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Appends one record to the shared writer: the request URL followed by every
 * result entry. Iterable values are printed one element per line under a
 * "key:" header; other values are printed as "key:\tvalue". The writer is
 * flushed after each record.
 *
 * @param resultItems results to print
 * @param task        task the results belong to (not used here)
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String key = field.getKey();
        Object value = field.getValue();
        if (!(value instanceof Iterable)) {
            printWriter.println(key + ":\t" + value);
            continue;
        }
        printWriter.println(key + ":");
        for (Object element : (Iterable<?>) value) {
            printWriter.println(element);
        }
    }
    printWriter.flush();
}
 
Example #10
Source File: HttpClientDownloader.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Converts an HTTP response into a {@code Page}.
 *
 * <p>The body bytes are always stored on the page. Unless the request is
 * flagged as binary content, the bytes are also decoded to text using
 * {@code charset}, or a charset detected via {@code getHtmlCharset} when
 * {@code charset} is {@code null}.
 *
 * @param request      request that produced the response
 * @param charset      charset to decode with, or {@code null} to detect it
 * @param httpResponse response to convert
 * @param task         task the request belongs to (not used here)
 * @return a page marked as successfully downloaded
 * @throws IOException if the body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // getEntity() returns null when the response carries no body; treat that as
    // an empty payload instead of failing with a NullPointerException.
    boolean hasEntity = httpResponse.getEntity() != null;
    byte[] bytes = hasEntity ? IOUtils.toByteArray(httpResponse.getEntity().getContent()) : new byte[0];
    String contentType = !hasEntity || httpResponse.getEntity().getContentType() == null
            ? "" : httpResponse.getEntity().getContentType().getValue();
    Page page = new Page();
    page.setBytes(bytes);
    if (!request.isBinaryContent()){
        if (charset == null) {
            charset = getHtmlCharset(contentType, bytes);
        }
        page.setCharset(charset);
        page.setRawText(new String(bytes, charset));
    }
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
 
Example #11
Source File: JsonFilePipeline.java    From spider with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Appends the crawled page, serialized as one JSON line, to
 * {@code gather_platform_data/<spiderUUID>.json}.
 *
 * <p>I/O failures are logged and not propagated.
 *
 * @param resultItems results converted to a {@code Webpage}
 * @param task        task the results belong to (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    try {
        // Encode explicitly as UTF-8 so the JSON does not depend on the
        // platform default charset.
        FileUtils.writeStringToFile(
                new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json"),
                gson.toJson(webpage) + "\n",
                "UTF-8",
                true);
    } catch (IOException e) {
        // Pass the exception as the last argument so the stack trace is logged too.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
 
Example #12
Source File: CommonWebpagePipeline.java    From spider with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tells whether a request's URL has already been handled.
 *
 * <p>A per-task in-memory set is consulted first. Only when the URL is new to
 * that set is the index queried, using the MD5 hash of the URL as document id.
 *
 * @param request request whose URL is checked
 * @param task    task whose UUID selects the in-memory set
 * @return {@code true} if the URL was already in the in-memory set or a
 *         matching document exists in the index
 */
@Override
public boolean isDuplicate(Request request, Task task) {
    // Lazily create the cache of URLs collected for this task.
    Set<String> seenUrls = urls.computeIfAbsent(task.getUUID(), k -> Sets.newConcurrentHashSet());
    String url = request.getUrl();
    if (!seenUrls.add(url)) {
        // Already fetched during the current lifecycle: report as duplicate right away.
        return true;
    }
    // Not fetched in the current lifecycle, so check further against ES.
    String documentId = Hashing.md5().hashString(url, Charset.forName("utf-8")).toString();
    GetResponse response = client.prepareGet(INDEX_NAME, TYPE_NAME, documentId).get();
    return response.isExists();
}
 
Example #13
Source File: RedisPriorityScheduler.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Stores the JSON form of a request in the task's item hash, keyed by the
 * SHA hex digest of its URL. Requests whose extras are {@code null} are
 * not stored.
 *
 * @param jedis   connection used for the write
 * @param request request to serialize
 * @param task    task whose item hash receives the entry
 */
private void setExtrasInItem(Jedis jedis,Request request, Task task)
{
    if(request.getExtras() == null)
    {
        return;
    }
    String urlDigest = DigestUtils.shaHex(request.getUrl());
    String serialized = JSON.toJSONString(request);
    jedis.hset(getItemKey(task), urlDigest, serialized);
}
 
Example #14
Source File: DelayQueueScheduler.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Enqueues a request unless its URL has been seen before.
 *
 * @param request request to enqueue
 * @param task    task the request belongs to (not used here)
 */
@Override
public synchronized void push(Request request, Task task) {
    boolean firstSeen = urls.add(request.getUrl());
    if (!firstSeen) {
        return;
    }
    queue.add(new RequestWrapper(request));
}
 
Example #15
Source File: PageModelCollectorPipeline.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Forwards the extracted model stored under {@code clazz}'s canonical name to
 * {@code classPipeline}. When {@code clazz} carries an {@code ExtractBy}
 * annotation with {@code multi()} set, the value is treated as a list and each
 * element is forwarded; otherwise the value is forwarded as a single object.
 *
 * @param resultItems extraction results keyed by canonical class name
 * @param task        task passed through to the class pipeline
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    Object extracted = resultItems.get(clazz.getCanonicalName());
    if (extracted == null) {
        return;
    }
    Annotation annotation = clazz.getAnnotation(ExtractBy.class);
    boolean multiple = annotation != null && ((ExtractBy) annotation).multi();
    if (multiple) {
        for (Object element : (List<Object>) extracted) {
            classPipeline.process((T) element, task);
        }
    } else {
        classPipeline.process((T) extracted, task);
    }
}
 
Example #16
Source File: Kr36NewsModel.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Benchmark entry point: crawls 36kr with 20 threads, discards every
 * extracted model, and registers the spider with the {@code SpiderMonitor}.
 */
public static void main(String[] args) throws IOException, JMException {
    //Just for benchmark
    Site site = Site.me().setSleepTime(0);
    PageModelPipeline discardingPipeline = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
            // intentionally empty
        }
    };
    Spider spider = OOSpider.create(site, discardingPipeline, Kr36NewsModel.class)
            .thread(20)
            .addUrl("http://www.36kr.com/");
    spider.start();
    SpiderMonitor.instance().register(spider);
}
 
Example #17
Source File: ContentLengthLimitHttpClientDownloader.java    From Gather-Platform with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Delegates to the parent implementation. An {@code IllegalArgumentException}
 * is recorded (exception log, error callback, warning) and then rethrown.
 *
 * @throws IOException              propagated from the parent implementation
 * @throws IllegalArgumentException rethrown after being recorded
 */
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    try {
        return super.handleResponse(request, charset, httpResponse, task);
    } catch (IllegalArgumentException rejected) {
        writeExceptionLog(rejected, request);
        onError(request);
        LOG.warn("URL为:{} ,{}", request.getUrl(), rejected.getLocalizedMessage());
        throw rejected;
    }
}
 
Example #18
Source File: ESPipeline.java    From Gather-Platform with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Indexes all result entries as one JSON document.
 *
 * <p>When the results contain a non-blank "id" it is used as the document id;
 * otherwise the index call is made without an explicit id. Any result other
 * than {@code CREATED} is logged as an error, as are I/O failures while
 * building the document.
 *
 * @param resultItems entries to index
 * @param task        task the results belong to (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        XContentBuilder xContentBuilder = jsonBuilder().startObject();
        // Typed for-each replaces the raw iterator and its unchecked casts.
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            xContentBuilder.field(entry.getKey(), entry.getValue());
        }
        String json = xContentBuilder.endObject().string();
        IndexResponse response;
        if (StringUtils.isNotBlank(resultItems.get("id"))) {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME, resultItems.get("id"))
                    .setSource(json).get();
        } else {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setSource(json).get();
        }
        if (response.getResult() != IndexResponse.Result.CREATED) {
            LOG.error("索引失败,可能重复创建,resultItem:" + resultItems);
        }
    } catch (IOException e) {
        // Hand the exception to the logger instead of printing the stack trace to stderr.
        LOG.error("索引出错," + e.getLocalizedMessage(), e);
    }
}
 
Example #19
Source File: JsonFilePipeline.java    From Gather-Platform with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Appends the crawled page, serialized as one JSON line, to
 * {@code gather_platform_data/<spiderUUID>.json}.
 *
 * <p>I/O failures are logged and not propagated.
 *
 * @param resultItems results converted to a {@code Webpage}
 * @param task        task the results belong to (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    try {
        // Encode explicitly as UTF-8 so the JSON does not depend on the
        // platform default charset.
        FileUtils.writeStringToFile(
                new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json"),
                gson.toJson(webpage) + "\n",
                "UTF-8",
                true);
    } catch (IOException e) {
        // Pass the exception as the last argument so the stack trace is logged too.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
 
Example #20
Source File: TopicInfoPipeline.java    From feiqu-opensource with Apache License 2.0 5 votes vote down vote up
/**
 * Persists a crawled topic and its replies.
 *
 * <p>Topics without a title are skipped. When at least 50 topics were created
 * within the last five hours the spider is asked to stop (processing of the
 * current item still continues). A topic with the same title and author that
 * already exists is not inserted again. Replies longer than 500 characters are
 * cut to their first 480 characters before being stored.
 *
 * @param v2exDTO crawled topic
 * @param task    running task; cast to {@code OOSpider} when the crawl is stopped
 */
@Override
public void process(V2exDTO v2exDTO, Task task) {
    if(StringUtils.isEmpty(v2exDTO.getTitle())){
        return;
    }
    Date now = new Date();

    // Throttle: count the topics created during the last five hours.
    FqTopicExample topicExample = new FqTopicExample();
    topicExample.createCriteria().andGmtCreateGreaterThan(DateUtil.offsetHour(now,-5));
    long recentCount = fqTopicMapper.countByExample(topicExample);
    if(recentCount >= 50){
        ((OOSpider)task).stop();
    }

    // De-duplicate on title + author.
    topicExample.clear();
    topicExample.createCriteria().andTitleEqualTo(v2exDTO.getTitle()).andAuthorEqualTo(v2exDTO.getAuthor());
    long duplicateCount = fqTopicMapper.countByExample(topicExample);
    if(duplicateCount > 0){
        return;
    }

    FqTopic fqTopic = DTO2DO(v2exDTO);
    fqTopic.setContent(EmojiUtils.toAliases(fqTopic.getContent()));
    fqTopicMapper.insert(fqTopic);

    if(!CollectionUtil.isNotEmpty(v2exDTO.getReply())){
        return;
    }
    v2exDTO.getReply().forEach(reply->{
        if(StringUtils.isEmpty(reply)){
            return;
        }
        String text = reply.length() > 500 ? reply.substring(0,480) : reply;
        text = EmojiUtils.toAliases(text);
        FqTopicReply fqTopicReply = new FqTopicReply();
        fqTopicReply.setGmtCreate(now);
        fqTopicReply.setContent(text);
        fqTopicReply.setTopicId(fqTopic.getId());
        fqTopicReplyMapper.insert(fqTopicReply);
    });
}
 
Example #21
Source File: ContentLengthLimitHttpClientDownloader.java    From spider with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Delegates to the parent implementation. An {@code IllegalArgumentException}
 * is recorded (exception log, error callback, warning) and then rethrown.
 *
 * @throws IOException              propagated from the parent implementation
 * @throws IllegalArgumentException rethrown after being recorded
 */
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    try {
        return super.handleResponse(request, charset, httpResponse, task);
    } catch (IllegalArgumentException rejected) {
        writeExceptionLog(rejected, request);
        onError(request);
        LOG.warn("URL为:{} ,{}", request.getUrl(), rejected.getLocalizedMessage());
        throw rejected;
    }
}
 
Example #22
Source File: HttpClientDownloader.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Executes the request with the task's HTTP client and converts the response
 * into a {@code Page}.
 *
 * <p>An {@code IOException} does not propagate: it is logged, the error
 * callback runs, and the failed page created up front is returned. In all
 * cases the response entity is consumed and a proxy that was obtained is
 * handed back to the provider together with the resulting page.
 *
 * @param request request to execute
 * @param task    task supplying the site configuration
 * @return the downloaded page, or the failed page on I/O error
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }
    CloseableHttpClient httpClient = getHttpClient(task.getSite());
    Proxy proxy = null;
    if (proxyProvider != null) {
        proxy = proxyProvider.getProxy(task);
    }
    HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
    CloseableHttpResponse httpResponse = null;
    Page page = Page.fail();
    try {
        httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
        // A charset set on the request takes precedence over the site-wide one.
        String charset = request.getCharset();
        if (charset == null) {
            charset = task.getSite().getCharset();
        }
        page = handleResponse(request, charset, httpResponse, task);
        onSuccess(request);
        logger.info("downloading page success {}", request.getUrl());
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
    } finally {
        if (httpResponse != null) {
            //ensure the connection is released back to pool
            EntityUtils.consumeQuietly(httpResponse.getEntity());
        }
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, page, task);
        }
    }
    return page;
}
 
Example #23
Source File: OschinaBlog.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Crawls the oschina blog with 10 threads using a desktop Chrome user agent,
 * discarding every extracted model.
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
            .setSleepTime(0)
            .setRetryTimes(3);
    PageModelPipeline discardingPipeline = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
            // intentionally empty
        }
    };
    OOSpider.create(site, discardingPipeline, OschinaBlog.class)
            .thread(10)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .run();
}
 
Example #24
Source File: PriorityScheduler.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Routes a request to one of three queues by the sign of its priority:
 * positive, negative, or zero (no priority).
 *
 * @param request request to enqueue
 * @param task    task the request belongs to (not used here)
 */
@Override
public void pushWhenNoDuplicate(Request request, Task task) {
    long priority = request.getPriority();
    if (priority > 0) {
        priorityQueuePlus.put(request);
    } else if (priority < 0) {
        priorityQueueMinus.put(request);
    } else {
        noPriorityQueue.add(request);
    }
}
 
Example #25
Source File: DuplicateRemovedScheduler.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Enqueues a candidate request unless it is rejected as a duplicate.
 *
 * <p>The duplicate remover is consulted only when neither
 * {@code shouldReserved} nor {@code noNeedToRemoveDuplicate} already accepts
 * the request.
 *
 * @param request candidate request
 * @param task    task the request belongs to
 */
@Override
public void push(Request request, Task task) {
    logger.trace("get a candidate url {}", request.getUrl());
    // Same short-circuit order as the accept condition, expressed as a rejection guard.
    if (!shouldReserved(request) && !noNeedToRemoveDuplicate(request) && duplicatedRemover.isDuplicate(request, task)) {
        return;
    }
    logger.debug("push to queue {}", request.getUrl());
    pushWhenNoDuplicate(request, task);
}
 
Example #26
Source File: GithubRepoPageProcessorTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the GitHub repo processor against the mocked downloader and checks the
 * extracted repository name and author.
 */
@Test
public void test_github() throws Exception {
    Spider spider = Spider.create(new GithubRepoPageProcessor());
    Pipeline assertingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String name = resultItems.get("name");
            assertThat(name.trim()).isEqualTo("webmagic");
            String author = resultItems.get("author");
            assertThat(author.trim()).isEqualTo("code4craft");
        }
    };
    spider.addPipeline(assertingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
Example #27
Source File: MockGithubDownloader.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a page built from the bundled {@code /html/mock-github.html}
 * fixture, regardless of the request passed in. The page's request and URL
 * are fixed to the webmagic GitHub repository.
 *
 * @param request ignored
 * @param task    ignored
 * @return a page whose raw text is the fixture content; the raw text stays
 *         unset if reading the fixture fails with an {@code IOException}
 */
@Override
public Page download(Request request, Task task) {
    Page page = new Page();
    InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html");
    try {
        // Decode explicitly as UTF-8 rather than with the platform default charset.
        page.setRawText(IOUtils.toString(resourceAsStream, "UTF-8"));
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The classpath stream was previously never closed.
        IOUtils.closeQuietly(resourceAsStream);
    }
    page.setRequest(new Request("https://github.com/code4craft/webmagic"));
    page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
    return page;
}
 
Example #28
Source File: SSLCompatibilityTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads an HTTPS page with the default downloader and expects success.
 */
@Test
public void test_tls12() throws Exception {
    HttpClientDownloader downloader = new HttpClientDownloader();
    Task task = Site.me().setCycleRetryTimes(5).toTask();
    Page downloaded = downloader.download(new Request("https://juejin.im/"), task);
    assertThat(downloaded.isDownloadSuccess()).isTrue();
}
 
Example #29
Source File: HttpClientDownloaderTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads a URL that is expected never to exist and checks that the page is
 * reported as a failed download.
 */
@Test
public void test_download_fail() {
    HttpClientDownloader downloader = new HttpClientDownloader();
    Task task = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask();
    Page downloaded = downloader.download(new Request(PAGE_ALWAYS_NOT_EXISTS), task);
    assertThat(downloaded.isDownloadSuccess()).isFalse();
}
 
Example #30
Source File: RedisScheduler.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the cardinality of the task's Redis set, narrowed to an int.
 *
 * @param task task whose set is measured
 * @return number of members in the set
 */
@Override
public int getTotalRequestsCount(Task task) {
    Jedis jedis = pool.getResource();
    try {
        return jedis.scard(getSetKey(task)).intValue();
    } finally {
        // Always hand the connection back to the pool.
        pool.returnResource(jedis);
    }
}