// Java source code of DemoAnnotatedBingCrawler

/*
 * Copyright (C) 2015 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;
import cn.edu.hfut.dmic.webcollector.util.ExceptionUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;

/**
 * This tutorial demonstrates features introduced in WebCollector 2.20:
 *  1) MetaData:
 *    MetaData is extra information attached to each crawl task. Using it flexibly
 *    can greatly simplify the design of a crawler.
 *    For example, POST requests usually carry parameters, and the traditional approach
 *    of storing parameters only in the URL does not suit complex POST requests.
 *    Some crawl tasks need the depth of a page in the traversal tree; this is also easy
 *    to achieve with MetaData, see the tutorial DemoDepthCrawler.
 *
 *  2) RamCrawler:
 *    RamCrawler does not depend on the file system or a database and suits one-off
 *    crawl tasks. For long-running tasks, use BreadthCrawler.
 *
 * This tutorial implements a crawler that fetches the first n pages of Bing search
 * results and writes the results directly to standard output.
 * To collect the results in a data structure such as an ArrayList, declare an ArrayList
 * member in the class and insert into it where the results are printed. Note that the
 * crawler is multi-threaded while ArrayList is not thread-safe, so guard the insertion
 * with synchronized(this) { ... }.
 *
 * The parsing rules used here for Bing may stop working when Bing changes its page layout.
 *
 * @author hu
 */
public class DemoAnnotatedBingCrawler extends RamCrawler {

    /**
     * Adds one seed per Bing result page, for pages 1 to {@code pageNum} inclusive.
     * Each seed is typed "searchEngine" and carries the keyword, its page index
     * and an initial depth of 1 as metadata.
     *
     * @param keyword the search keyword
     * @param pageNum the number of result pages to crawl
     * @throws Exception if a search url cannot be constructed
     */
    public DemoAnnotatedBingCrawler(String keyword, int pageNum) throws Exception {
        for (int pageIndex = 1; pageIndex <= pageNum; pageIndex++) {
            String url = createBingUrl(keyword, pageIndex);
            addSeedAndReturn(url)
                    .type("searchEngine")
                    .meta("keyword", keyword)
                    .meta("pageIndex", pageIndex)
                    .meta("depth", 1);
        }
    }


    /**
     * Handles responses whose http status code is 301 or 302.
     *
     * The redirected url is taken from the "Location" header of the http response
     * and added to the subsequent tasks. Since the page may carry metadata, the
     * metadata is copied to the added task via {@code page.copyMeta()}.
     *
     * @param page the redirecting page
     * @param next the container for subsequent tasks
     */
    @MatchCode(codes = {301, 302})
    public void visitRedirect(Page page, CrawlDatums next){
        try {
            // page.location() may be a relative url path,
            // so resolve it against the url of the current page
            String redirectUrl = new URL(new URL(page.url()), page.location()).toExternalForm();
            next.addAndReturn(redirectUrl).meta(page.copyMeta());
        } catch (MalformedURLException e) {
            // the way to handle exceptions in WebCollector
            ExceptionUtils.fail(e);
        }
    }

    /**
     * Handles tasks of type "searchEngine": reports the fetched result page and
     * queues every search result link as an "outlink" task.
     *
     * @param page the fetched search result page
     * @param next the container for subsequent tasks
     */
    @MatchType(types = "searchEngine")
    public void visitSearchEngine(Page page, CrawlDatums next) {
        String keyword = page.meta("keyword");
        int pageIndex = page.metaAsInt("pageIndex");
        System.out.println("成功抓取关键词" + keyword + "的第" + pageIndex + "页搜索结果");
        Elements results = page.select("li.b_algo>h2>a");

        for (int rank = 0; rank < results.size(); rank++) {
            Element result = results.get(rank);
            /*
            We want to keep crawling the page each search result points to,
            called an outlink here. When visiting an outlink we still want to know
            on which result page and at which position it was listed, so the page
            index and the rank are stored in the follow-up CrawlDatum. To tell
            outlinks apart from search result pages, the type is set to "outlink";
            this value is entirely user-defined and may be any value.
            */
            String href = result.attr("abs:href");
            if (href.isEmpty()) {
                // jsoup yields an empty string when the link cannot be resolved to an
                // absolute url; queuing it would only create a task without a url.
                // rank stays the position in the result list, so the numbering of
                // the remaining results is unaffected.
                continue;
            }
            next.addAndReturn(href)
                    .type("outlink")
                    .meta("keyword", keyword)
                    .meta("pageIndex", pageIndex)
                    .meta("rank", rank);
        }
    }

    /**
     * Handles tasks of type "outlink": prints one line with the result page index,
     * the 1-based position of the result, the page title, the content length in
     * bytes, the depth and the referer.
     *
     * @param page the fetched outlink page
     * @param next the container for subsequent tasks (nothing is added here)
     */
    @MatchType(types = "outlink")
    public void visitOutlink(Page page, CrawlDatums next) {
        int depth = page.metaAsInt("depth");
        int pageIndex = page.metaAsInt("pageIndex");
        int rank = page.metaAsInt("rank");
        String referer=page.meta("referer");

        // rank is stored 0-based, so it is printed as rank + 1
        String line = String.format("第%s页第%s个结果:%s(%s字节)\tdepth=%s\treferer=%s",
                pageIndex, rank + 1, page.doc().title(),page.content().length, depth, referer);
        System.out.println(line);
    }

    /**
     * Intentionally empty: all work is done in the annotated handlers of this class.
     *
     * @param page the fetched page
     * @param next the container for subsequent tasks
     */
    @Override
    public void visit(Page page, CrawlDatums next) {

    }

    /*
    In a classic crawler every page has a referer, i.e. the page its link came from.
    For example, if we first visit the Sina home page and then parse new news links
    out of it, the referer of those pages is the Sina home page. WebCollector does
    not store the referer directly, but it can be kept in the metadata as shown
    below, which has the same effect. Anchor texts, as stored by classic crawlers,
    can be kept the same way.

    Some tasks need the depth of the current page in the traversal tree. This is
    easy with metadata: when a CrawlDatum is added to next, its depth is set to the
    depth of the currently visited page plus one.
     */
    @AfterParse
    public void afterParse(Page page, CrawlDatums next){
        int depth = page.metaAsInt("depth");
        next.meta("depth", depth + 1).meta("referer", page.url());
    }



    /**
     * Crawls the first 3 Bing result pages for a fixed keyword.
     *
     * @param args command line arguments (unused)
     * @throws Exception if the crawler cannot be created or started
     */
    public static void main(String[] args) throws Exception {
        DemoAnnotatedBingCrawler crawler = new DemoAnnotatedBingCrawler("网络爬虫", 3);
        crawler.start();
    }

    /**
     * Constructs the Bing Search url for the search keyword and the pageIndex.
     *
     * @param keyword the search keyword; it is url-encoded as utf-8
     * @param pageIndex the 1-based index of the result page
     * @return the constructed url
     * @throws Exception if the keyword cannot be encoded
     */
    public static String createBingUrl(String keyword, int pageIndex) throws Exception {
        // the "first" parameter is pageIndex * 10 - 9: 1 for page 1, 11 for page 2, ...
        int first = pageIndex * 10 - 9;
        keyword = URLEncoder.encode(keyword, "utf-8");
        return String.format("http://cn.bing.com/search?q=%s&first=%s", keyword, first);
    }

}