us.codecraft.webmagic.pipeline.Pipeline Java Examples

The following examples show how to use us.codecraft.webmagic.pipeline.Pipeline. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SpiderTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Starts a single-threaded spider, stops it, and starts it again, pausing ten
 * seconds after each transition.
 *
 * @throws InterruptedException if the test thread is interrupted while sleeping
 */
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
    final long pauseMillis = 10000;

    SimplePageProcessor processor = new SimplePageProcessor( "http://www.oschina.net/*");

    // Pipeline that only prints a marker for each result it is handed.
    Pipeline printingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            System.out.println(1);
        }
    };

    Spider spider = Spider.create(processor)
            .addPipeline(printingPipeline)
            .thread(1)
            .addUrl("http://www.oschina.net/");

    spider.start();
    Thread.sleep(pauseMillis);

    spider.stop();
    Thread.sleep(pauseMillis);

    // Start a second time after the stop.
    spider.start();
    Thread.sleep(pauseMillis);
}
 
Example #2
Source File: ScriptConsole.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a script-driven spider from the given parameters and runs it.
 * <p>
 * The processor's site is configured with the requested sleep time, three
 * retries and the accepted status codes 200, 404, 403, 500 and 502. The
 * spider's pipelines are replaced by a single pipeline that does nothing.
 * When no URL was supplied, an error and the usage line are printed and the
 * JVM exits with status -1.
 *
 * @param params language, script file, thread count, sleep time and start URLs
 */
private static void startSpider(Params params) {
    ScriptProcessor scriptProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    scriptProcessor.getSite().setSleepTime(params.getSleepTime());
    scriptProcessor.getSite().setRetryTimes(3);
    scriptProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));

    Spider spider = Spider.create(scriptProcessor).thread(params.getThread());

    // Replace whatever pipelines the spider has with one that discards results.
    Pipeline discardingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {

        }
    };
    spider.clearPipeline().addPipeline(discardingPipeline);

    boolean noUrls = params.getUrls() == null || params.getUrls().size() == 0;
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    for (String startUrl : params.getUrls()) {
        spider.addUrl(startUrl);
    }
    spider.run();
}
 
Example #3
Source File: Spider.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Releases the resources held by this spider.
 * <p>
 * Hands the downloader, the page processor, the scheduler and every
 * registered pipeline to {@code destroyEach}, then shuts down the thread
 * pool. The shutdown is placed in a {@code finally} block so that it still
 * runs when one of the {@code destroyEach} calls throws; the exception is
 * propagated to the caller afterwards.
 */
public void close() {
    try {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
    } finally {
        // Always stop the worker threads, even if a component failed to close.
        threadPool.shutdown();
    }
}
 
Example #4
Source File: Spider.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Handles a page that was downloaded.
 * <p>
 * When the page's status code is among the site's accepted codes, the page is
 * passed to the page processor, follow-up requests are extracted, and, unless
 * the result items are marked as skipped, every pipeline receives them.
 * Otherwise the status code is logged. In both cases the method then sleeps
 * for the site's configured sleep time.
 *
 * @param request the request that produced the page
 * @param page    the downloaded page
 */
private void onDownloadSuccess(Request request, Page page) {
    boolean statusAccepted = site.getAcceptStatCode().contains(page.getStatusCode());
    if (!statusAccepted) {
        logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
    } else {
        pageProcessor.process(page);
        extractAndAddRequests(page, spawnUrl);
        boolean skipPipelines = page.getResultItems().isSkip();
        if (!skipPipelines) {
            for (Pipeline each : pipelines) {
                each.process(page.getResultItems(), this);
            }
        }
    }
    sleep(site.getSleepTime());
}
 
Example #5
Source File: GithubRepoPageProcessorTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code GithubRepoPageProcessor} with a {@code MockGithubDownloader}
 * against the webmagic repository URL and checks the extracted "name" and
 * "author" values.
 *
 * @throws Exception if the spider run fails
 */
@Test
public void test_github() throws Exception {
    // Pipeline that verifies the extracted fields instead of storing them.
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String name = ((String) resultItems.get("name")).trim();
            assertThat(name).isEqualTo("webmagic");

            String author = ((String) resultItems.get("author")).trim();
            assertThat(author).isEqualTo("code4craft");
        }
    };

    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
Example #6
Source File: GithubRepoProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code GithubRepoProcessor} through an {@code OOSpider} with a
 * {@code MockGithubDownloader} and checks the extracted "star" and "fork"
 * values.
 */
@Test
public void test() {
    // Pipeline that verifies the extracted fields instead of storing them.
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String star = ((String) resultItems.get("star")).trim();
            Assert.assertEquals("78", star);

            String fork = ((String) resultItems.get("fork")).trim();
            Assert.assertEquals("65", fork);
        }
    };

    OOSpider.create(new GithubRepoProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
Example #7
Source File: CommonSpider.java    From Gather-Platform with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Returns the pipeline list held by this spider.
 *
 * @return the stored list itself, not a copy
 */
public List<Pipeline> getPipelineList() {
    return this.pipelineList;
}
 
Example #8
Source File: CommonSpider.java    From Gather-Platform with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Replaces the pipeline list held by this spider.
 * <p>
 * The given list is stored as-is; no copy is made.
 *
 * @param pipelineList the list to store
 * @return this spider, so calls can be chained
 */
public CommonSpider setPipelineList(List<Pipeline> pipelineList) {
    this.pipelineList = pipelineList;
    return this;
}
 
Example #9
Source File: CommonSpider.java    From spider with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Returns the pipeline list held by this spider.
 *
 * @return the stored list itself, not a copy
 */
public List<Pipeline> getPipelineList() {
    return this.pipelineList;
}
 
Example #10
Source File: CommonSpider.java    From spider with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Replaces the pipeline list held by this spider.
 * <p>
 * The given list is stored as-is; no copy is made.
 *
 * @param pipelineList the list to store
 * @return this spider, so calls can be chained
 */
public CommonSpider setPipelineList(List<Pipeline> pipelineList) {
    this.pipelineList = pipelineList;
    return this;
}
 
Example #11
Source File: Spider.java    From webmagic with Apache License 2.0 2 votes vote down vote up
/**
 * Appends a pipeline to this spider's pipeline list.
 * <p>
 * {@code checkIfRunning()} is invoked before the list is modified.
 *
 * @param pipeline the pipeline to append
 * @return this spider, so calls can be chained
 * @see Pipeline
 * @since 0.2.1
 */
public Spider addPipeline(Pipeline pipeline) {
    checkIfRunning();
    pipelines.add(pipeline);
    return this;
}
 
Example #12
Source File: Spider.java    From webmagic with Apache License 2.0 2 votes vote down vote up
/**
 * Replaces this spider's pipeline list with the given one.
 * <p>
 * {@code checkIfRunning()} is invoked before the assignment. The given list
 * is stored as-is; no copy is made.
 *
 * @param pipelines the list of pipelines to use from now on
 * @return this spider, so calls can be chained
 * @see Pipeline
 * @since 0.4.1
 */
public Spider setPipelines(List<Pipeline> pipelines) {
    checkIfRunning();
    this.pipelines = pipelines;
    return this;
}
 
Example #13
Source File: Spider.java    From webmagic with Apache License 2.0 2 votes vote down vote up
/**
 * Discards all registered pipelines by replacing the pipeline list with a
 * new, empty {@link ArrayList}.
 *
 * @return this spider, so calls can be chained
 */
public Spider clearPipeline() {
    ArrayList<Pipeline> emptyPipelines = new ArrayList<Pipeline>();
    this.pipelines = emptyPipelines;
    return this;
}