us.codecraft.webmagic.ResultItems Java Examples

The following examples show how to use us.codecraft.webmagic.ResultItems. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JdbcPipeline.java    From elasticsearch-jest-example with MIT License 6 votes vote down vote up
/**
 * Maps the extracted fields of one crawled page onto an {@code Article} and saves it
 * through {@code articleDao}.
 *
 * <p>Reads the keys {@code title}, {@code content}, {@code source}, {@code author},
 * {@code url} and {@code create}. The publication date is parsed from the first
 * {@code yyyy-MM-dd HH:mm} occurrence inside the {@code create} value; when that value is
 * missing or cannot be parsed the article is still saved, just without a publication date.
 *
 * @param resultItems extracted fields; {@code null} or empty input is ignored
 * @param task        the crawl task (not used here)
 */
public void process(ResultItems resultItems, Task task) {
    // Null check must come before the first dereference.
    if (resultItems == null) {
        return;
    }
    Map<String,Object> items = resultItems.getAll();
    if (items == null || items.isEmpty()) {
        return;
    }

    Article article = new Article();
    article.setTitle((String) items.get("title"));
    article.setContent((String) items.get("content"));
    article.setSource((String) items.get("source"));
    article.setAuthor((String) items.get("author"));
    article.setUrl((String) items.get("url"));

    String dataStr = (String) items.get("create");
    // A page without a "create" field would otherwise fail in Pattern.matcher(null).
    if (dataStr != null) {
        Pattern pattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}");
        Matcher matcher = pattern.matcher(dataStr);
        if (matcher.find()) {
            dataStr = matcher.group(0);
        }
        try {
            article.setPubdate(new SimpleDateFormat("yyyy-MM-dd HH:mm").parse(dataStr));
        } catch (ParseException e) {
            // Best effort: an unparseable date must not prevent the article from being saved.
            e.printStackTrace();
        }
    }
    articleDao.save(article);
}
 
Example #2
Source File: ScriptConsole.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a script-driven spider from the parsed command-line parameters and runs it.
 *
 * <p>The spider's pipelines are replaced by a single no-op pipeline. When no URL was
 * supplied, a usage message is printed and the JVM exits with status -1.
 *
 * @param params parsed command-line parameters
 */
private static void startSpider(Params params) {
    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));

    // Pipeline that deliberately discards every result.
    Pipeline discardingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            // intentionally empty
        }
    };

    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    spider.clearPipeline().addPipeline(discardingPipeline);

    boolean noUrls = params.getUrls() == null || params.getUrls().isEmpty();
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    for (String startUrl : params.getUrls()) {
        spider.addUrl(startUrl);
    }
    spider.run();
}
 
Example #3
Source File: ConfigurablePageProcessorTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the page title and the star count from a mocked GitHub page through two
 * XPath rules and checks both extracted values.
 */
@Test
public void test() throws Exception {
    // Rule 1: the raw <title> element.
    ExtractRule titleRule = new ExtractRule();
    titleRule.setExpressionType(ExpressionType.XPath);
    titleRule.setExpressionValue("//title");
    titleRule.setFieldName("title");

    // Rule 2: the text of the first social-count link.
    ExtractRule starRule = new ExtractRule();
    starRule.setExpressionType(ExpressionType.XPath);
    starRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    starRule.setFieldName("star");

    List<ExtractRule> rules = new ArrayList<ExtractRule>();
    rules.add(titleRule);
    rules.add(starRule);

    ConfigurablePageProcessor processor = new ConfigurablePageProcessor(Site.me(), rules);
    ResultItems extracted = Spider.create(processor)
            .setDownloader(new MockGithubDownloader())
            .get("https://github.com/code4craft/webmagic");

    assertThat(extracted.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(extracted.getAll()).containsEntry("star", " 86 ");
}
 
Example #4
Source File: ModelPipeline.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Hands the extracted model object(s) to each registered page-model pipeline.
 *
 * <p>For every registered model class, the result stored under the class's canonical
 * name is looked up. If the class carries {@code ExtractBy} with {@code multi()} set,
 * the result is treated as a list and each element is processed separately; otherwise
 * the result is processed as a single object.
 *
 * @param resultItems extracted results keyed by canonical class name
 * @param task        the crawl task, passed through to the model pipelines
 */
@Override
public void process(ResultItems resultItems, Task task) {
    for (Map.Entry<Class, PageModelPipeline> registration : pageModelPipelines.entrySet()) {
        Class modelClass = registration.getKey();
        Object extracted = resultItems.get(modelClass.getCanonicalName());
        if (extracted == null) {
            continue;
        }
        Annotation extractBy = modelClass.getAnnotation(ExtractBy.class);
        boolean multi = extractBy != null && ((ExtractBy) extractBy).multi();
        if (multi) {
            List<Object> models = (List<Object>) extracted;
            for (Object model : models) {
                registration.getValue().process(model, task);
            }
        } else {
            registration.getValue().process(extracted, task);
        }
    }
}
 
Example #5
Source File: OneFilePipeline.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Appends one page's results to the shared {@code printWriter} and flushes it.
 *
 * <p>The URL is written first, followed by one line per field. Fields whose value is an
 * {@code Iterable} are written as a header line and then one line per element.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String key = field.getKey();
        Object value = field.getValue();
        if (!(value instanceof Iterable)) {
            printWriter.println(key + ":\t" + value);
            continue;
        }
        Iterable<?> elements = (Iterable<?>) value;
        printWriter.println(key + ":");
        for (Object element : elements) {
            printWriter.println(element);
        }
    }
    printWriter.flush();
}
 
Example #6
Source File: FilePipelineTest.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares the shared fixtures: a task stub with a random UUID and no site, and a
 * {@code ResultItems} holding one "content" field plus its originating request.
 */
@BeforeClass
public static void before() {
    // Minimal Task stub: fresh random UUID on every call, no site configuration.
    task = new Task() {
        @Override
        public Site getSite() {
            return null;
        }

        @Override
        public String getUUID() {
            return UUID.randomUUID().toString();
        }
    };

    resultItems = new ResultItems();
    resultItems.setRequest(new Request("http://www.baidu.com"));
    resultItems.put("content", "webmagic 爬虫工具");
}
 
Example #7
Source File: BaiduBaikePageProcessor.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Demo entry point: fetches one Baidu Baike search page synchronously, then a batch of
 * pages, printing the extracted results of each.
 *
 * <p>The spider is closed in a {@code finally} block so that it is always closed, even
 * when one of the downloads throws.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    try {
        //single download
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
        ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
        System.out.println(resultItems);

        //multidownload
        List<String> list = new ArrayList<String>();
        list.add(String.format(urlTemplate,"风力发电"));
        list.add(String.format(urlTemplate,"太阳能"));
        list.add(String.format(urlTemplate,"地热发电"));
        // NOTE(review): the same keyword is listed twice in the original; kept as-is —
        // confirm whether the duplicate is intentional.
        list.add(String.format(urlTemplate,"地热发电"));
        List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
        for (ResultItems resultItemse : resultItemses) {
            System.out.println(resultItemse.getAll());
        }
    } finally {
        spider.close();
    }
}
 
Example #8
Source File: FilePipeline.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one page's results to a UTF-8 text file named after the MD5 of the page URL,
 * inside a directory named after the task UUID.
 *
 * <p>The URL is written first, followed by one line per field; {@code Iterable} values
 * are written as a header line followed by one line per element. The writer is closed
 * in a {@code finally} block so the file handle is released even when writing a value
 * throws a runtime exception.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task; its UUID selects the output directory
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    PrintWriter printWriter = null;
    try {
        String url = resultItems.getRequest().getUrl();
        printWriter = new PrintWriter(new OutputStreamWriter(new FileOutputStream(getFile(path + DigestUtils.md5Hex(url) + ".html")),"UTF-8"));
        printWriter.println("url:\t" + url);
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            if (entry.getValue() instanceof Iterable) {
                Iterable value = (Iterable) entry.getValue();
                printWriter.println(entry.getKey() + ":");
                for (Object o : value) {
                    printWriter.println(o);
                }
            } else {
                printWriter.println(entry.getKey() + ":\t" + entry.getValue());
            }
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    } finally {
        // Release the file handle on every path, including runtime exceptions.
        if (printWriter != null) {
            printWriter.close();
        }
    }
}
 
Example #9
Source File: CommonWebpagePipeline.java    From spider with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Converts webmagic {@code ResultItems} into a {@code Webpage} object.
 *
 * <p>Every property is copied from the result item stored under the corresponding key.
 * The id is the string form of the MD5 hash of the page URL.
 *
 * @param resultItems extracted fields of one crawled page
 * @return the populated webpage
 */
public static Webpage convertResultItems2Webpage(ResultItems resultItems) {
    Webpage page = new Webpage();

    // Identity: the id is derived from the URL, so the URL has to be set first.
    page.setUrl(resultItems.get("url"));
    page.setId(Hashing.md5().hashString(page.getUrl(), Charset.forName("utf-8")).toString());
    page.setDomain(resultItems.get("domain"));

    // Crawl bookkeeping.
    page.setSpiderInfoId(resultItems.get("spiderInfoId"));
    page.setSpiderUUID(resultItems.get("spiderUUID"));
    page.setGathertime(resultItems.get("gatherTime"));
    page.setProcessTime(resultItems.get("processTime"));

    // Main content.
    page.setTitle(resultItems.get("title"));
    page.setContent(resultItems.get("content"));
    page.setRawHTML(resultItems.get("rawHTML"));
    page.setPublishTime(resultItems.get("publishTime"));
    page.setCategory(resultItems.get("category"));

    // Derived text features.
    page.setKeywords(resultItems.get("keywords"));
    page.setSummary(resultItems.get("summary"));
    page.setNamedEntity(resultItems.get("namedEntity"));

    // Extra fields and linked resources.
    page.setDynamicFields(resultItems.get(DYNAMIC_FIELD));
    page.setStaticFields(resultItems.get("staticField"));
    page.setAttachmentList(resultItems.get("attachmentList"));
    page.setImageList(resultItems.get("imageList"));

    return page;
}
 
Example #10
Source File: HunterProcessor.java    From blog-hunter with MIT License 6 votes vote down vote up
/**
 * Processing method of the custom pipeline: turns the extracted fields into a
 * {@code VirtualArticle}, collects it and prints a summary line.
 *
 * @param resultItems     all fields produced by the custom processor
 * @param virtualArticles collection the converted article is appended to
 * @param spider          the running spider; nothing is done when it is {@code null}
 */
final void process(ResultItems resultItems, List<VirtualArticle> virtualArticles, Hunter spider) {
    if (spider == null) {
        return;
    }
    Map<String, Object> fields = resultItems.getAll();
    if (CollectionUtil.isEmpty(fields)) {
        return;
    }
    String title = String.valueOf(fields.get("title"));

    // Round-trip through JSON to bind the field map onto the article bean,
    // using the custom deserializer for Date values.
    ParserConfig parserConfig = new ParserConfig();
    parserConfig.putDeserializer(Date.class, HunterDateDeserializer.instance);
    String json = JSON.toJSONString(fields);
    VirtualArticle article = JSON.parseObject(json, VirtualArticle.class, parserConfig, JSON.DEFAULT_PARSER_FEATURE);

    article.setDescription(CommonUtil.getRealDescription(article.getDescription(), article.getContent()));
    article.setKeywords(CommonUtil.getRealKeywords(article.getKeywords()));

    if (this.config.isConvertImg()) {
        article.setContent(CommonUtil.formatHtml(article.getContent()));
        article.setImageLinks(CommonUtil.getAllImageLink(article.getContent()));
    }
    if (CollectionUtils.isEmpty(article.getTags())) {
        // Fall back to a single default tag when none was extracted.
        article.setTags(Collections.singletonList("其他"));
    }

    virtualArticles.add(article);
    String summaryLine = String.format("<a href=\"%s\" target=\"_blank\">%s</a> -- %s -- %s",
            article.getSource(), title, article.getAuthor(), article.getReleaseDate());
    writer.print(summaryLine);
}
 
Example #11
Source File: WebMagicProcessorDelegator.java    From vscrawler with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the wrapped webmagic page processor on a downloaded body and transfers its
 * output into the crawl result.
 *
 * <p>A {@code null} body makes the seed retry. Otherwise the body is wrapped in a page
 * with status code 200 and processed. Every target request the processor produced
 * becomes a new seed, and unless the page is marked as skipped its fields are added
 * as one JSON result.
 *
 * @param seed        the seed that was fetched
 * @param result      the downloaded body, or {@code null} if nothing was fetched
 * @param crawlResult receives new seeds and the extracted result
 */
@Override
protected void parse(Seed seed, String result, GrabResult crawlResult) {
    if (result == null) {
        seed.retry();
        return;
    }

    SipSoupPage page = new SipSoupPage();
    page.setRawText(result);
    page.setUrl(new PlainText(seed.getData()));
    page.setRequest(CovertUtil.convertSeed(seed));
    page.setStatusCode(200);
    pageProcessor.process(page);

    // newly discovered urls become seeds
    for (Request discovered : page.getTargetRequests()) {
        crawlResult.addSeed(CovertUtil.covertRequest(discovered));
    }

    ResultItems extracted = page.getResultItems();
    if (extracted.isSkip()) {
        return;
    }
    crawlResult.addResult(JSONObject.toJSONString(extracted.getAll()));
}
 
Example #12
Source File: CommonWebpagePipeline.java    From Gather-Platform with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Converts webmagic {@code ResultItems} into a {@code Webpage} object.
 *
 * <p>Every property is copied from the result item stored under the corresponding key.
 * The id is the string form of the MD5 hash of the page URL.
 *
 * @param resultItems extracted fields of one crawled page
 * @return the populated webpage
 */
public static Webpage convertResultItems2Webpage(ResultItems resultItems) {
    Webpage page = new Webpage();

    // Identity: the id is derived from the URL, so the URL has to be set first.
    page.setUrl(resultItems.get("url"));
    page.setId(Hashing.md5().hashString(page.getUrl(), Charset.forName("utf-8")).toString());
    page.setDomain(resultItems.get("domain"));

    // Crawl bookkeeping.
    page.setSpiderInfoId(resultItems.get("spiderInfoId"));
    page.setSpiderUUID(resultItems.get("spiderUUID"));
    page.setGathertime(resultItems.get("gatherTime"));
    page.setProcessTime(resultItems.get("processTime"));

    // Main content.
    page.setTitle(resultItems.get("title"));
    page.setContent(resultItems.get("content"));
    page.setRawHTML(resultItems.get("rawHTML"));
    page.setPublishTime(resultItems.get("publishTime"));
    page.setCategory(resultItems.get("category"));

    // Derived text features.
    page.setKeywords(resultItems.get("keywords"));
    page.setSummary(resultItems.get("summary"));
    page.setNamedEntity(resultItems.get("namedEntity"));

    // Extra fields and linked resources.
    page.setDynamicFields(resultItems.get(DYNAMIC_FIELD));
    page.setStaticFields(resultItems.get("staticField"));
    page.setAttachmentList(resultItems.get("attachmentList"));
    page.setImageList(resultItems.get("imageList"));

    return page;
}
 
Example #13
Source File: CommonWebpagePipeline.java    From Gather-Platform with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Indexes the crawled page unless a document with the same URL is already indexed.
 *
 * <p>The result items are converted to a {@code Webpage}, the index is queried for a
 * matching "url", and only when there is no hit is the page indexed under the string
 * form of the MD5 hash of its URL. Indexing failures are logged together with the
 * causing exception and do not propagate.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = convertResultItems2Webpage(resultItems);
    SearchRequestBuilder searchRequestBuilder = client.prepareSearch(INDEX_NAME)
            .setTypes(TYPE_NAME)
            .setQuery(QueryBuilders.matchQuery("url", webpage.getUrl()));
    SearchResponse response = searchRequestBuilder.execute().actionGet();
    if (response.getHits().totalHits() != 0) {
        // Already indexed: nothing to do.
        return;
    }
    try {
        client.prepareIndex(INDEX_NAME, TYPE_NAME)
                .setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString())
                .setSource(gson.toJson(webpage))
                .get();
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is not lost.
        LOG.error("索引 Webpage 出错," + e.getLocalizedMessage(), e);
    }
}
 
Example #14
Source File: WebMagicPipelineDelegator.java    From vscrawler with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds every entity of a grab result to the wrapped webmagic pipeline.
 *
 * <p>Each entity gets its own {@code ResultItems} bound to the request converted from
 * the seed. Character sequences go through {@code handleJson}; any other object goes
 * through {@code handleJsonObject}. The pipeline is invoked with a {@code null} task,
 * and an exception it throws is logged without stopping the remaining entities.
 *
 * @param grabResult the result whose entities are forwarded
 * @param seed       the seed the result was produced from
 */
@Override
public void saveItem(GrabResult grabResult, Seed seed) {
    for (Object entity : grabResult.allEntityResult()) {
        ResultItems items = new ResultItems();
        items.setRequest(CovertUtil.convertSeed(seed));

        if (!(entity instanceof CharSequence)) {
            handleJsonObject(items, entity);
        } else {
            handleJson(items, entity.toString());
        }

        try {
            webMagicPipeline.process(items, null);
        } catch (Exception e) {
            log.error("error when process result", e);
        }
    }
}
 
Example #15
Source File: ConsolePipeline.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Prints the page URL and every extracted field to standard output, one per line.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String header = "get page: " + resultItems.getRequest().getUrl();
    System.out.println(header);
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String line = field.getKey() + ":\t" + field.getValue();
        System.out.println(line);
    }
}
 
Example #16
Source File: BlogPipeline.java    From mogu_blog_v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Stores a crawled blog post as a {@code BlogSpider} record.
 *
 * <p>The "title" and "content" fields are printed; when both are non-empty a record
 * with fixed tag, sort, admin and author values is inserted. Any exception raised while
 * building or inserting the record is printed and otherwise ignored.
 *
 * @param res  extracted fields of the page
 * @param task the crawl task (not used here)
 */
@Override
public void process(ResultItems res, Task task) {
    // fetch title and content
    String title = res.get("title");
    String content = res.get("content");
    System.out.println("title: " + title);
    System.out.println("content: " + content);

    // Nothing to store unless both fields are present.
    if (StringUtils.isEmpty(title) || StringUtils.isEmpty(content)) {
        return;
    }

    try {
        BlogSpider post = new BlogSpider();
        post.setUid(idWorker.nextId() + "");
        post.setTitle(title);
        post.setSummary(title);
        post.setContent(content);
        post.setTagUid("5c4c541e600ff422ccb371ee788f59d6");
        post.setClickCount(0);
        post.setCollectCount(0);
        post.setStatus(EStatus.ENABLE);
        post.setAdminUid("1f01cd1d2f474743b241d74008b12333");
        post.setAuthor("陌溪");
        post.setArticlesPart("蘑菇博客");
        post.setBlogSortUid("6a1c7a50c0e7b8e8657949bf02d5d0ca");
        post.setLevel(0);
        post.setIsPublish(EPublish.PUBLISH);
        post.setSort(0);
        post.insert();

        // download to local disk (disabled)
        //DownloadUtil.download("http://pic.netbian.com"+fileUrl,fileName,SAVE_PATH);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example #17
Source File: CompositePipeline.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Dispatches the results to the sub-pipelines whose matcher accepts the request.
 *
 * <p>Sub-pipelines are tried in order. After a matching sub-pipeline has processed the
 * results, iteration continues only if it answered {@code MatchOther.YES}; any other
 * answer, including {@code null}, ends the dispatch.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task, passed through to the sub-pipelines
 */
@Override
public void process(ResultItems resultItems, Task task) {
    for (SubPipeline candidate : subPipelines) {
        if (!candidate.match(resultItems.getRequest())) {
            continue;
        }
        RequestMatcher.MatchOther outcome = candidate.processResult(resultItems, task);
        // A null outcome is also "not YES", so a single comparison covers both cases.
        if (outcome != RequestMatcher.MatchOther.YES) {
            return;
        }
    }
}
 
Example #18
Source File: PageModelCollectorPipeline.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Forwards the extracted model object(s) of {@code clazz} to {@code classPipeline}.
 *
 * <p>The result stored under the class's canonical name is looked up. If the class
 * carries {@code ExtractBy} with {@code multi()} set, the result is treated as a list
 * and each element is forwarded separately; otherwise it is forwarded as one object.
 *
 * @param resultItems extracted results keyed by canonical class name
 * @param task        the crawl task, passed through to the class pipeline
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    Object extracted = resultItems.get(clazz.getCanonicalName());
    if (extracted == null) {
        return;
    }
    Annotation extractBy = clazz.getAnnotation(ExtractBy.class);
    boolean multi = extractBy != null && ((ExtractBy) extractBy).multi();
    if (multi) {
        List<Object> models = (List<Object>) extracted;
        for (Object model : models) {
            classPipeline.process((T) model, task);
        }
    } else {
        classPipeline.process((T) extracted, task);
    }
}
 
Example #19
Source File: JsonFilePipeline.java    From Gather-Platform with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Appends the page, serialized as one JSON line, to
 * {@code gather_platform_data/<spiderUUID>.json}.
 *
 * <p>The file is written as UTF-8 explicitly instead of relying on the platform
 * default charset, so non-ASCII page content is stored consistently across machines.
 * Write failures are logged together with the causing exception and do not propagate.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    File target = new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json");
    try {
        // Last argument: append to the file instead of overwriting it.
        FileUtils.writeStringToFile(target, gson.toJson(webpage) + "\n", "UTF-8", true);
    } catch (IOException e) {
        // The trailing throwable keeps the stack trace in the log.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
 
Example #20
Source File: MultiPagePipeline.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Passes the entry iterator of the result map to {@code handleObject} repeatedly until
 * the iterator reports no further entries.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    // handleObject receives the iterator itself; presumably it advances it — verify there.
    for (Iterator<Map.Entry<String, Object>> entries = resultItems.getAll().entrySet().iterator(); entries.hasNext(); ) {
        handleObject(entries);
    }
}
 
Example #21
Source File: JsonFilePipeline.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Writes one page's results as JSON to a file named after the MD5 of the page URL,
 * inside a directory named after the task UUID.
 *
 * <p>The file is encoded as UTF-8 explicitly rather than with the platform default
 * charset. The writer is closed in a {@code finally} block so the file handle is
 * released even when serialization throws a runtime exception.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task; its UUID selects the output directory
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    PrintWriter printWriter = null;
    try {
        // Fully qualified names: this block cannot rely on additional file-level imports.
        printWriter = new PrintWriter(new java.io.OutputStreamWriter(
                new java.io.FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")),
                "UTF-8"));
        printWriter.write(JSON.toJSONString(resultItems.getAll()));
    } catch (IOException e) {
        logger.warn("write file error", e);
    } finally {
        if (printWriter != null) {
            printWriter.close();
        }
    }
}
 
Example #22
Source File: PhantomJSPageProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Demo entry point: crawls one Taobao search page through a PhantomJS downloader,
 * collects the results and prints the "html" field of the first one.
 *
 * @param args ignored
 * @throws Exception if the crawl fails
 */
public static void main(String[] args) throws Exception {
    CollectorPipeline<ResultItems> collector = new ResultItemsCollectorPipeline();
    PhantomJSDownloader downloader = new PhantomJSDownloader().setRetryNum(3);

    // Twice the number of available processors minus one.
    int threadCount = (Runtime.getRuntime().availableProcessors() - 1) << 1;

    Spider.create(new PhantomJSPageProcessor())
            // %B6%AC%D7%B0 is the GBK encoding of 冬装 ("winter clothing")
            .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc")
            .setDownloader(downloader)
            .addPipeline(collector)
            .thread(threadCount)
            .run();

    List<ResultItems> collected = collector.getCollected();
    ResultItems first = collected.get(0);
    System.out.println(first.get("html").toString());
}
 
Example #23
Source File: GithubRepoPageProcessorTest.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the GitHub repo processor against a mocked download and checks the trimmed
 * "name" and "author" fields inside a verifying pipeline.
 */
@Test
public void test_github() throws Exception {
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String name = resultItems.get("name");
            String author = resultItems.get("author");
            assertThat(name.trim()).isEqualTo("webmagic");
            assertThat(author.trim()).isEqualTo("code4craft");
        }
    };

    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
Example #24
Source File: ESPipeline.java    From Gather-Platform with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Indexes all extracted fields of a page as one JSON document.
 *
 * <p>Every result item becomes a field of the document. When the "id" item is a
 * non-blank string it is used as the document id; otherwise no id is supplied. A
 * response whose result is not {@code CREATED} is logged as a failed (possibly
 * duplicate) index operation. I/O errors while building the document are logged
 * together with the causing exception and do not propagate.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        XContentBuilder xContentBuilder = jsonBuilder().startObject();
        // Typed for-each instead of a raw Iterator with unchecked casts.
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            xContentBuilder.field(entry.getKey(), entry.getValue());
        }
        String json = xContentBuilder.endObject().string();

        String id = resultItems.get("id");
        IndexResponse response;
        if (StringUtils.isNotBlank(id)) {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME, id)
                    .setSource(json).get();
        } else {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setSource(json).get();
        }
        if (response.getResult() != IndexResponse.Result.CREATED) {
            LOG.error("索引失败,可能重复创建,resultItem:" + resultItems);
        }
    } catch (IOException e) {
        // Log through the logger with the cause instead of printStackTrace().
        LOG.error("索引出错," + e.getLocalizedMessage(), e);
    }
}
 
Example #25
Source File: MyPipeline.java    From spring-boot-demo with MIT License 5 votes vote down vote up
/**
 * Logs the page URL and every extracted field at info level, one entry per log line.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    String header = "get page: " + resultItems.getRequest().getUrl();
    log.info(header);
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String line = field.getKey() + ":\t" + field.getValue();
        log.info(line);
    }
}
 
Example #26
Source File: ESPipeline.java    From spider with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Indexes all extracted fields of a page as one JSON document.
 *
 * <p>Every result item becomes a field of the document. When the "id" item is a
 * non-blank string it is used as the document id; otherwise no id is supplied. A
 * response whose result is not {@code CREATED} is logged as a failed (possibly
 * duplicate) index operation. I/O errors while building the document are logged
 * together with the causing exception and do not propagate.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        XContentBuilder xContentBuilder = jsonBuilder().startObject();
        // Typed for-each instead of a raw Iterator with unchecked casts.
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            xContentBuilder.field(entry.getKey(), entry.getValue());
        }
        String json = xContentBuilder.endObject().string();

        String id = resultItems.get("id");
        IndexResponse response;
        if (StringUtils.isNotBlank(id)) {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME, id)
                    .setSource(json).get();
        } else {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setSource(json).get();
        }
        if (response.getResult() != IndexResponse.Result.CREATED) {
            LOG.error("索引失败,可能重复创建,resultItem:" + resultItems);
        }
    } catch (IOException e) {
        // Log through the logger with the cause instead of printStackTrace().
        LOG.error("索引出错," + e.getLocalizedMessage(), e);
    }
}
 
Example #27
Source File: CommonWebpagePipeline.java    From spider with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Indexes the crawled page under the string form of the MD5 hash of its URL.
 *
 * <p>The result items are converted to a {@code Webpage} and sent to the index as
 * JSON. Indexing failures are logged together with the causing exception and do not
 * propagate.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = convertResultItems2Webpage(resultItems);
    try {
        client.prepareIndex(INDEX_NAME, TYPE_NAME)
                .setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString())
                .setSource(gson.toJson(webpage))
                .get();
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is not lost.
        LOG.error("索引 Webpage 出错," + e.getLocalizedMessage(), e);
    }
}
 
Example #28
Source File: JsonFilePipeline.java    From spider with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Appends the page, serialized as one JSON line, to
 * {@code gather_platform_data/<spiderUUID>.json}.
 *
 * <p>The file is written as UTF-8 explicitly instead of relying on the platform
 * default charset, so non-ASCII page content is stored consistently across machines.
 * Write failures are logged together with the causing exception and do not propagate.
 *
 * @param resultItems extracted fields of the page
 * @param task        the crawl task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    File target = new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json");
    try {
        // Last argument: append to the file instead of overwriting it.
        FileUtils.writeStringToFile(target, gson.toJson(webpage) + "\n", "UTF-8", true);
    } catch (IOException e) {
        // The trailing throwable keeps the stack trace in the log.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
 
Example #29
Source File: WebMagicPipelineDelegator.java    From vscrawler with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the top-level entries of a JSON string into the result items.
 *
 * <p>If anything goes wrong while parsing or copying, a warning is logged and the raw
 * string is stored under the key "data" instead.
 *
 * @param resultItems target the entries are put into
 * @param str         the text expected to contain a JSON object
 */
private void handleJson(ResultItems resultItems, String str) {
    try {
        JSONObject parsed = JSON.parseObject(str);
        for (Map.Entry<String, Object> property : parsed.entrySet()) {
            String key = property.getKey();
            Object value = property.getValue();
            resultItems.put(key, value);
        }
    } catch (Exception e) {
        log.warn("craw result is not a json format:{}", str);
        resultItems.put("data", str);
    }
}
 
Example #30
Source File: WebMagicPipelineDelegator.java    From vscrawler with Apache License 2.0 5 votes vote down vote up
/**
 * Copies every field declared by the object's class into the result items, keyed by
 * field name.
 *
 * <p>Field values are read through {@code ReflectUtil.getField}. A field that cannot be
 * read is skipped, and the failure is logged instead of being silently discarded.
 *
 * @param resultItems target the field values are put into
 * @param obj         the object whose declared fields are copied
 */
private void handleJsonObject(ResultItems resultItems, Object obj) {
    Field[] declaredFields = obj.getClass().getDeclaredFields();
    for (Field field : declaredFields) {
        try {
            resultItems.put(field.getName(), ReflectUtil.getField(obj, field.getName()));
        } catch (Exception e) {
            // Not expected to happen; keep going, but leave a trace if it does.
            log.warn("failed to read field {} via reflection", field.getName(), e);
        }
    }
}