us.codecraft.webmagic.Spider Java Examples

The following examples show how to use us.codecraft.webmagic.Spider. You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: ScriptConsole.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Builds and runs a script-driven spider from the given command-line params.
 * Exits the JVM with status -1 when no start URLs were supplied.
 */
private static void startSpider(Params params) {
    // Fix: fail fast on missing URLs BEFORE building the processor/spider
    // (the original validated only after all the setup work was done).
    if (params.getUrls() == null || params.getUrls().isEmpty()) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }
    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage()).scriptFromFile(params.getScriptFileName()).thread(params.getThread()).build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    // Accept common error status codes so the user script still sees those pages.
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));
    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    // Replace the default console pipeline with a no-op: output is the script's job.
    spider.clearPipeline().addPipeline(new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    });
    for (String url : params.getUrls()) {
        spider.addUrl(url);
    }
    spider.run();
}
 
Example #2
Source File: ConfigurablePageProcessorTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
@Test
public void test() throws Exception {
    // Two XPath extraction rules: the page title and the GitHub star count.
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> extractRules = new ArrayList<ExtractRule>();
    extractRules.add(titleRule);
    extractRules.add(starRule);

    // Run against a canned GitHub page so the test needs no network access.
    ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules))
            .setDownloader(new MockGithubDownloader()).get("https://github.com/code4craft/webmagic");
    assertThat(resultItems.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(resultItems.getAll()).containsEntry("star", " 86 ");
}
 
Example #3
Source File: SpiderMonitorTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/** Verifies that a SpiderMonitor subclass can plug in a custom SpiderStatusMXBean. */
@Test
public void testInherit() throws Exception {
    SpiderMonitor customMonitor = new SpiderMonitor() {
        @Override
        protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, MonitorSpiderListener monitorSpiderListener) {
            return new CustomSpiderStatus(spider, monitorSpiderListener);
        }
    };

    Spider firstSpider = Spider.create(new ZhihuPageProcessor())
            .addUrl("http://my.oschina.net/flashsword/blog")
            .thread(2);
    Spider secondSpider = Spider.create(new GithubRepoPageProcessor())
            .addUrl("https://github.com/code4craft");

    customMonitor.register(firstSpider, secondSpider);
}
 
Example #4
Source File: SpiderMonitor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Register spider for monitor.
 *
 * @param spiders spiders
 * @return this
 * @throws JMException JMException
 */
public synchronized SpiderMonitor register(Spider... spiders) throws JMException {
    for (Spider spider : spiders) {
        MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener();
        // Attach the monitoring listener, creating the listener list on first use.
        List<SpiderListener> existing = spider.getSpiderListeners();
        if (existing != null) {
            existing.add(monitorSpiderListener);
        } else {
            List<SpiderListener> fresh = new ArrayList<SpiderListener>();
            fresh.add(monitorSpiderListener);
            spider.setSpiderListeners(fresh);
        }
        // Expose the spider's status over JMX and keep a local handle to it.
        SpiderStatusMXBean spiderStatusMBean = getSpiderStatusMBean(spider, monitorSpiderListener);
        registerMBean(spiderStatusMBean);
        spiderStatuses.add(spiderStatusMBean);
    }
    return this;
}
 
Example #5
Source File: BaiduBaikePageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Demonstrates synchronous Spider usage: one blocking get(), then a batch getAll().
 */
public static void main(String[] args) {
    // Single synchronous download: get() blocks until the page is processed.
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
    System.out.println(resultItems);

    // Batch download: getAll() blocks until every URL has been processed.
    List<String> list = new ArrayList<String>();
    list.add(String.format(urlTemplate,"风力发电"));
    list.add(String.format(urlTemplate,"太阳能"));
    list.add(String.format(urlTemplate,"地热发电"));
    // NOTE(review): this URL is added twice — presumably intentional (duplicate
    // handling demo?); confirm against the original example.
    list.add(String.format(urlTemplate,"地热发电"));
    List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
    for (ResultItems resultItemse : resultItemses) {
        System.out.println(resultItemse.getAll());
    }
    spider.close();
}
 
Example #6
Source File: ChapterSpiderHhit.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
/** Crawls one chaoxing course page using ChapterSpiderHhit as the page processor. */
@Test
public void crawer() {

	Spider chapterSpider = Spider.create(new ChapterSpiderHhit());
	chapterSpider.addUrl("http://mooc1.chaoxing.com/course/86454598.html");
	chapterSpider.thread(10).run();
}
 
Example #7
Source File: BlogSpiderRestApi.java    From mogu_blog_v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Starts (or restarts) the CSDN blog crawler.
 *
 * @return a status message ("启动爬取" on restart, "开始爬取" on first start)
 */
@ApiOperation(value = "startSpiderCsdn", notes = "startSpiderCsdn")
@RequestMapping(value = "/startSpiderCsdn", method = RequestMethod.GET)
public String startSpiderCsdn() {

    // NOTE(review): check-then-act on the shared `spider` field is not safe under
    // concurrent requests — consider synchronizing this method; confirm usage.
    if (spider != null) {
        // Fix: restart asynchronously. The previous run() call blocked the HTTP
        // request thread until the whole crawl finished, yet the reply claimed
        // the crawl had merely been "started". start() matches the first-start path.
        spider.start();
        return "启动爬取";
    }
    // Build the spider that crawls the content.
    spider = Spider.create(blogProcesser)
            .addUrl("https://www.csdn.net/")
            .addPipeline(blogPipeline)
            .setScheduler(new QueueScheduler())
            .thread(10);

    // start() is asynchronous, so this request returns immediately.
    spider.start();

    return "开始爬取";
}
 
Example #8
Source File: ChaoXingTest.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
/** Crawls the full chaoxing subject index in one request. */
public void crawer(){
       Spider chaoXingSpider = Spider.create(new ChaoXingTest());
       // A single URL covers all entries (size=434 bypasses pagination), though a
       // cookie-based login may be needed to actually see the complete listing.
       chaoXingSpider.addUrl("http://nation.chaoxing.com/index?xuekeid=0&start=0&size=434");
       // Crawl with 5 worker threads; run() blocks until finished.
       chaoXingSpider.thread(5);
       chaoXingSpider.run();
}
 
Example #9
Source File: NeteaseNewsPageProcesser.java    From elasticsearch-jest-example with MIT License 5 votes vote down vote up
/**
 * Crawls Netease domestic/society news pages and stores results via JdbcPipeline.
 */
public static void main(String[] args) {
    // Keep the concrete type so the context can be closed when the crawl ends.
    ClassPathXmlApplicationContext applicationContext = new ClassPathXmlApplicationContext("applicationContext.xml");
    try {
        JdbcPipeline jdbcPipeline = (JdbcPipeline) applicationContext.getBean("jdbcPipeline");
        Spider.create(new NeteaseNewsPageProcesser())
                .addUrl("http://news.163.com/domestic")
                .addUrl("http://news.163.com/shehui")
                .addPipeline(jdbcPipeline)
                .thread(5)
                .run(); // run() blocks until the crawl finishes
    } finally {
        // Fix: the Spring context was never closed (resource leak).
        applicationContext.close();
    }
}
 
Example #10
Source File: GovNewsPageProcesser.java    From elasticsearch-jest-example with MIT License 5 votes vote down vote up
/**
 * Crawls the gov.cn news columns listed below and stores results via JdbcPipeline.
 */
public static void main(String[] args) {

    // Keep the concrete type so the context can be closed when the crawl ends.
    ClassPathXmlApplicationContext applicationContext = new ClassPathXmlApplicationContext("applicationContext.xml");
    try {
        JdbcPipeline jdbcPipeline = (JdbcPipeline) applicationContext.getBean("jdbcPipeline");
        Spider.create(new GovNewsPageProcesser())
                .addUrl("http://new.sousuo.gov.cn/column/19769/0.htm") // top news
                .addUrl("http://new.sousuo.gov.cn/column/16704/0.htm") // hot topics
                .addUrl("http://new.sousuo.gov.cn/column/16700/0.htm") // department news
                .addUrl("http://new.sousuo.gov.cn/column/16699/0.htm") // local reports
                .addUrl("http://new.sousuo.gov.cn/column/16697/0.htm") // law enforcement & regulation
                .addUrl("http://new.sousuo.gov.cn/column/19423/0.htm") // State Council info
                .addUrl("http://new.sousuo.gov.cn/column/16622/0.htm") // speeches
                .addUrl("http://new.sousuo.gov.cn/column/16623/0.htm") // meetings
                .addUrl("http://new.sousuo.gov.cn/column/16621/0.htm") // activities
                .addUrl("http://new.sousuo.gov.cn/column/16620/0.htm") // overseas visits
                .addUrl("http://new.sousuo.gov.cn/column/16740/0.htm") // special topics - latest
                .addUrl("http://new.sousuo.gov.cn/column/16739/0.htm") // special topics - focus
                .addUrl("http://new.sousuo.gov.cn/column/16743/0.htm") // events
                .addUrl("http://new.sousuo.gov.cn/column/16744/0.htm") // emergency plans
                .addUrl("http://new.sousuo.gov.cn/column/16742/0.htm") // work
                .addUrl("http://new.sousuo.gov.cn/column/16765/0.htm") // policy interpretation - experts
                .addUrl("http://new.sousuo.gov.cn/column/16764/0.htm") // policy interpretation - media
                .addUrl("http://new.sousuo.gov.cn/column/17999/0.htm") // commentary - key
                .addUrl("http://new.sousuo.gov.cn/column/18000/0.htm") // commentary - current affairs
                .addUrl("http://new.sousuo.gov.cn/column/18001/0.htm") // commentary - online
                .addUrl("http://new.sousuo.gov.cn/column/16852/0.htm") // data news
                .addPipeline(jdbcPipeline) // persist crawled results to the database
                .thread(5)
                .run(); // run() blocks until the crawl finishes
    } finally {
        // Fix: the Spring context was never closed (resource leak).
        applicationContext.close();
    }
}
 
Example #11
Source File: HuabanProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/** Crawls huaban.com via a Selenium downloader; results are written by FilePipeline. */
public static void main(String[] args) {
    Spider huabanSpider = Spider.create(new HuabanProcessor());
    huabanSpider.thread(5);
    huabanSpider.addPipeline(new FilePipeline("/data/webmagic/test/"));
    huabanSpider.setDownloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"));
    huabanSpider.addUrl("http://huaban.com/");
    huabanSpider.runAsync();
}
 
Example #12
Source File: GooglePlayProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/** Crawls a Google Play app detail page via Selenium; output goes to a FilePipeline. */
public static void main(String[] args) {
	Spider playSpider = Spider.create(new GooglePlayProcessor());
	playSpider.thread(5);
	playSpider.addPipeline(new FilePipeline(
			"/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/"));
	playSpider.setDownloader(new SeleniumDownloader());
	playSpider.addUrl("https://play.google.com/store/apps/details?id=com.tencent.mm");
	playSpider.runAsync();
}
 
Example #13
Source File: GithubRepoPageProcessorTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/** Runs the GitHub processor against a canned page and asserts the extracted fields. */
@Test
public void test_github() throws Exception {
    // Pipeline that verifies the extracted fields instead of persisting them.
    Pipeline assertingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            assertThat(((String) resultItems.get("name")).trim()).isEqualTo("webmagic");
            assertThat(((String) resultItems.get("author")).trim()).isEqualTo("code4craft");
        }
    };
    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(assertingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
Example #14
Source File: Kr36NewsModel.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/** Benchmark driver: crawls 36kr with a discard-all pipeline and registers JMX monitoring. */
public static void main(String[] args) throws IOException, JMException {
    //Just for benchmark
    // The pipeline deliberately discards every extracted item.
    PageModelPipeline discardAll = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
        }
    };
    Spider thread = OOSpider.create(Site.me().setSleepTime(0), discardAll, Kr36NewsModel.class)
            .thread(20)
            .addUrl("http://www.36kr.com/");
    thread.start();
    SpiderMonitor.instance().register(thread);
}
 
Example #15
Source File: MamacnPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/** Crawls mama.cn photo listings with a resumable file-backed URL queue. */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    Spider mamaSpider = Spider.create(new MamacnPageProcessor());
    mamaSpider.setScheduler(new FileCacheQueueScheduler("/data/webmagic/mamacn"));
    mamaSpider.addUrl("http://www.mama.cn/photo/t1-p1.html");
    mamaSpider.addPipeline(new OneFilePipeline("/data/webmagic/mamacn/data"));
    mamaSpider.thread(5);
    mamaSpider.run();
}
 
Example #16
Source File: ZhihuPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/** Crawls Zhihu search results for "java" questions; output files go to D:\webmagic\. */
public static void main(String[] args) {
    Spider.create(new ZhihuPageProcessor())
            .addUrl("http://www.zhihu.com/search?type=question&q=java")
            .addPipeline(new FilePipeline("D:\\webmagic\\"))
            .thread(5)
            .run();
}
 
Example #17
Source File: PhantomJSPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Crawls a Taobao search page through PhantomJS and prints the first page's HTML.
 */
public static void main(String[] args) throws Exception {
    PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);

    // Collects every ResultItems produced during the crawl for inspection afterwards.
    CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();

    Spider.create(new PhantomJSPageProcessor())
            .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc") // %B6%AC%D7%B0 is the GBK encoding of "冬装" (winter clothing)
            .setDownloader(phantomDownloader)
            .addPipeline(collectorPipeline)
            .thread((Runtime.getRuntime().availableProcessors() - 1) << 1)
            .run();

    List<ResultItems> resultItemsList = collectorPipeline.getCollected();
    // Fix: guard against an empty result (e.g. download failure) before get(0).
    if (resultItemsList.isEmpty()) {
        System.err.println("No pages were collected.");
        return;
    }
    System.out.println(resultItemsList.get(0).get("html").toString());
}
 
Example #18
Source File: SinablogProcessorTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
@Ignore
@Test
public void test() throws IOException {
    SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor();
    // A pipeline post-processes results after a page is crawled;
    // JsonFilePipeline writes JSON under /data/webmagic/[domain] by default.
    JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
    // pipeline(...) attaches a pipeline and supports chaining
    // (a ConsolePipeline would print results to the console instead).
    // FileCacheQueueScheduler persists URLs under /data/temp/webmagic/cache,
    // which makes interrupted crawls resumable.
    // run() starts the crawl and blocks until it finishes.
    Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
            run();
}
 
Example #19
Source File: MonitorExample.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/** Registers two example spiders with the JMX monitor, then starts both asynchronously. */
public static void main(String[] args) throws Exception {

    Spider zhihuSpider = Spider.create(new ZhihuPageProcessor());
    zhihuSpider.addUrl("http://my.oschina.net/flashsword/blog");

    Spider githubSpider = Spider.create(new GithubRepoPageProcessor());
    githubSpider.addUrl("https://github.com/code4craft");

    SpiderMonitor.instance().register(zhihuSpider);
    SpiderMonitor.instance().register(githubSpider);
    zhihuSpider.start();
    githubSpider.start();
}
 
Example #20
Source File: CourseSpiderHhit.java    From SmartEducation with Apache License 2.0 5 votes vote down vote up
/** Crawls one chaoxing course page using CourseSpiderHhit as the page processor. */
@Test
public void crawer() {

	Spider courseSpider = Spider.create(new CourseSpiderHhit());
	courseSpider.addUrl("http://mooc1.chaoxing.com/course/86454598.html");
	courseSpider.thread(10).run();
}
 
Example #21
Source File: CommonSpider.java    From Gather-Platform with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Deletes every spider whose crawl has already stopped.
 * Stopped entries are collected first, then each task and its map entry are
 * removed; per-task failures are logged so one bad entry does not abort the sweep.
 */
public void deleteAll() {
    // Snapshot the stopped spiders' UUIDs before mutating the map.
    List<String> spiderUUID2BeRemoved = spiderMap.entrySet().stream().filter(
            spiderEntry -> spiderEntry.getValue().getStatus() == Spider.Status.Stopped
    ).map(Map.Entry::getKey).collect(Collectors.toList());
    for (String uuid : spiderUUID2BeRemoved) {
        try {
            deleteTaskById(uuid);
            spiderMap.remove(uuid);
        } catch (Exception e) {
            // Log message reads: "error deleting task ID {uuid}: {message}".
            LOG.error("删除任务ID:{}出错,{}", uuid, e.getLocalizedMessage());
        }
    }
    // Also purge stopped tasks from the persistent task manager.
    taskManager.deleteTasksByState(State.STOP);
}
 
Example #22
Source File: DownloadPicture.java    From Gather-Platform with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Downloads two pages (30 images each) of Baidu image-search results for a
 * hard-coded keyword and saves them via downloadPicture().
 */
public static void main(String[] args) {
    // Baidu image-search query parameters:
    // tn:resultjsonavatarnew — JSON result format
    // ie:utf-8 — character encoding (ie = input, oe = output)
    // word — search keyword
    // pn — start offset
    // rn:30 — results per page
    // z — image size (0 all, 9 huge, 3 large, 2 medium, 1 small)
    // width/height — custom dimensions
    // ic — color (0 all, 1 red, 2 yellow, 4 green, 8 cyan, 16 blue, 32 purple,
    //      64 pink, 128 brown, 256 orange, 512 black, 1024 white, 2048 b&w)
    // s:0 — 3 selects avatar images
    // face:0 — 1 selects face close-ups
    // st:-1 — type (-1 all, 1 cartoon, 2 sketch)
    // lm:-1 — (6 animated, 7 static)
    // gsm — pn value in hexadecimal

    String key = "海贼王";    // Baidu image-search keyword ("One Piece")
    DownloadPicture downloadPicture = new DownloadPicture();
    ArrayList<String> nameList = new ArrayList<>();
    ArrayList<String> urlList = new ArrayList<>();
    for(int i=0;i<2;i++){   // page count: one page = 30 images
        // "pn="+i*3+"0" concatenates a trailing zero, producing offsets 00, 30, ...
        String url = "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word="+key+"&pn="+i*3+"0&rn=30&z=3&ic=0&s=0&face=0&st=-1&lm=-1";
        Spider.create(new DownloadPicture())
            .addUrl(url)
            .run();
        // NOTE(review): `urls` and `names` are presumably static fields filled by
        // the page processor during run() — confirm they are reset between pages.
        urlList.addAll(urls);
        nameList.addAll(names);
    }
    downloadPicture.downloadPicture(urlList,nameList,key);
}
 
Example #23
Source File: PostsServiceImpl.java    From plumemo with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches a post from its source platform and fills the title and content of
 * the given VO. The fetched HTML fragment is converted to Markdown via Remark.
 *
 * @param postsVO carries the source URI and platform type in; title/content out
 */
private void crawler(PostsVO postsVO) {
    // Fix: use Class<?> instead of the raw Class type.
    Class<?> platformClass = PlatformEnum.getEnumTypeMap().get(postsVO.getPlatformType()).getPlatformClass();
    Spider spider = OOSpider.create(Site.me(), platformClass).setDownloader(new HttpClientDownloader());
    Object object = spider.get(postsVO.getSourceUri());

    // Dispatch on platform type; each branch casts to its platform VO and
    // joins the content fragments into one string.
    String join = "";
    if (postsVO.getPlatformType().equals(PlatformEnum.JIAN_SHU.getType())) {
        JianShuVO jianShuVO = (JianShuVO) object;
        postsVO.setTitle(jianShuVO.getTitle());
        join = String.join("", jianShuVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.JUE_JIN.getType())) {
        JueJinVO jueJinVO = (JueJinVO) object;
        postsVO.setTitle(jueJinVO.getTitle());
        join = String.join("", jueJinVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.SEGMENT_FAULT.getType())) {
        SegmentFaultVO segmentFaultVO = (SegmentFaultVO) object;
        postsVO.setTitle(segmentFaultVO.getTitle());
        join = String.join("", segmentFaultVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.CSDN.getType())) {
        CSDNVO csdnVO = (CSDNVO) object;
        postsVO.setTitle(csdnVO.getTitle());
        join = String.join("", csdnVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.CN_BLOGS.getType())) {
        CNBlogsVO cnBlogsVO = (CNBlogsVO) object;
        postsVO.setTitle(cnBlogsVO.getTitle());
        join = String.join("", cnBlogsVO.getContent());
    } else {
        // NOTE(review): presumably rollback() throws; if it can return normally,
        // execution falls through and converts an empty string — confirm.
        ExceptionUtil.rollback(ErrorEnum.PARAM_ERROR);
    }
    String converted = new Remark().convertFragment(join);
    postsVO.setContent(converted);
}
 
Example #24
Source File: CrawlerConfig.java    From tom-crawler with Apache License 2.0 5 votes vote down vote up
/** Wires the crawler scheduler with a Redis-backed bloom filter for URL dedup. */
@Bean
public CrawlerScheduler scheduler(RedisService redisService, RedisTemplate<String, String> redisTemplate,
                                  Spider spider) {
    BloomFilter<String> dedupFilter = new BloomFilter<>(FALSE_POSITIVE_PROBABILITY, EXPECTED_NUMBER_OF_ELEMENTS);
    // Back the filter with a Redis bitset so dedup state is shared/persistent.
    BaseBitSet redisBitSet = new RedisBitSet(redisTemplate, RedisConstant.getBitKey(spider));
    dedupFilter.bind(redisBitSet);
    logger.info("bloomFilter bind complete!");
    return new CrawlerScheduler(redisService, dedupFilter);
}
 
Example #25
Source File: Hunter.java    From blog-hunter with MIT License 5 votes vote down vote up
/** Stops a running hunter and evicts it from the bucket; rejects other states. */
@Override
public void stop() {
    Spider.Status current = this.getStatus();
    switch (current) {
        case Running:
            super.stop();
            SPIDER_BUCKET.remove(this.hunterId);
            break;
        case Init:
            throw new HunterException("爬虫正在初始化!HunterId:[" + this.hunterId + "]");
        default:
            throw new HunterException("当前没有正在运行的爬虫!HunterId:[" + this.hunterId + "]");
    }
}
 
Example #26
Source File: BlogTask.java    From mogu_blog_v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Scheduled job: crawls CSDN articles through the blog processor/pipeline.
 */
//@Scheduled(cron = "0/20 * * * * ?")
//initialDelay: delay before the first execution after startup
//fixedDelay: interval between the end of one run and the start of the next
@Scheduled(initialDelay = 1000, fixedDelay = 100 * 1000)
public void webArticleTask() {
    // Launch the spider; run() executes synchronously on the scheduler thread.
    Spider.create(blogProcesser)
            .addUrl("https://www.csdn.net/")
            .addPipeline(blogPipeline)
            .setScheduler(new QueueScheduler())
            .thread(10)
            .run();
}
 
Example #27
Source File: Main.java    From spring-boot-demo with MIT License 5 votes vote down vote up
public static void main(String[] args) {
    //获取影片标题和页面链接
    Spider.create(new ListPageProcesser()).addUrl("https://www.dytt8.net/html/gndy/dyzz/list_23_1.html")
            .addPipeline(new MyPipeline()).thread(1).run();

    //获取指定详情页面的影片下载地址
    Spider.create(new DetailPageProcesser()).addUrl("https://www.dytt8.net/html/gndy/dyzz/20191204/59453.html")
            .addPipeline(new MyPipeline()).thread(1).run();
}
 
Example #28
Source File: CsdnBlogPageProcessor.java    From webmagic-csdnblog with MIT License 5 votes vote down vote up
/** Crawls every blog post of {@code username} from CSDN and reports elapsed time. */
public static void main(String[] args) {
	System.out.println("【爬虫开始】请耐心等待一大波数据到你碗里来...");
	long startTime = System.currentTimeMillis();
	// Start from the user's blog home page with 5 worker threads; run() blocks.
	Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
	long endTime = System.currentTimeMillis();
	System.out.println("【爬虫结束】共抓取" + size + "篇文章,耗时约" + ((endTime - startTime) / 1000) + "秒,已保存到数据库,请查收!");
}
 
Example #29
Source File: CommonSpider.java    From spider with GNU General Public License v3.0 5 votes vote down vote up
/**
 * 删除全部爬虫
 */
public void deleteAll() {
    List<String> spiderUUID2BeRemoved = spiderMap.entrySet().stream().filter(
            spiderEntry -> spiderEntry.getValue().getStatus() == Spider.Status.Stopped
    ).map(Map.Entry::getKey).collect(Collectors.toList());
    for (String uuid : spiderUUID2BeRemoved) {
        try {
            deleteTaskById(uuid);
            spiderMap.remove(uuid);
        } catch (Exception e) {
            LOG.error("删除任务ID:{}出错,{}", uuid, e.getLocalizedMessage());
        }
    }
    taskManager.deleteTasksByState(State.STOP);
}
 
Example #30
Source File: ContentImageProcessor.java    From javabase with Apache License 2.0 5 votes vote down vote up
/**
 * Crawls Tieba content pages for the given page numbers and returns the map
 * of collected image bytes.
 *
 * @param pageNumberList page numbers to crawl
 * @param tiebaName      name of the target Tieba forum
 * @return the shared result map filled during the crawl
 */
public ConcurrentHashMap<byte[], byte[]> start(List<String> pageNumberList, String tiebaName) {
    // Reset shared crawl state before each run.
    // NOTE(review): this mutates instance fields, so concurrent calls on the same
    // processor would interfere — confirm callers serialize access.
    isAddTarget=false;
    map.clear();
    this.tiebaName=tiebaName;
    this.pageNumberList = pageNumberList;
    this.url = tieBaConfiguration.getTiebaContentPageUrl();
    Spider.create(this).addUrl(url).addPipeline(new ConsolePipeline())
            // crawl with 30 worker threads (original comment claimed 5 — code says 30)
            .thread(30)
            // start the crawler; run() blocks until the crawl finishes
            .run();
    return  map;
}