// Java source code of WxCrawler

package com.xuzp.crawler.weixin;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.xuzp.common.ResultBase;
import com.xuzp.common.WxCrawlerConstant;
import com.xuzp.common.utils.FileUtils;
import com.xuzp.config.ProxyConfig;
import com.xuzp.crawler.weixin.convert.CrawlDatumConvert;
import com.xuzp.crawler.weixin.convert.ResourceTransfer;
import com.xuzp.crawler.weixin.obj.ArticleSummaryObj;
import com.xuzp.crawler.weixin.vo.ArticleTransferVO;
import com.xuzp.service.IArticleService;
import com.xuzp.service.IRedisService;
import com.xuzp.service.IWxCrawlService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.Proxy;
import java.net.URLEncoder;
import java.util.List;

/**
 * Crawler for WeChat (Weixin) official-account articles, entered through the Sogou account search.
 *
 * <p>Every visited page is dispatched on its crawl-datum type:
 * <ol>
 *   <li>{@code ACCOUNT_SEARCH} - Sogou search result page; yields the account's article-list URL.</li>
 *   <li>{@code ARTICLE_LIST} - account home page; yields one crawl datum per article.</li>
 *   <li>{@code ARTICLE_DETAIL} - article page; optionally backed up as an HTML file and then
 *       handed to {@link IArticleService}.</li>
 * </ol>
 * Pages that look blocked are re-queued with an incremented tried-count; once the count exceeds
 * {@code WxCrawlerConstant.MAX_TRY_COUNT} the page is skipped.
 *
 * @author za-xuzhiping
 * @Date 2018/7/30
 * @Time 16:13
 */
@Slf4j
public class WxCrawler extends RamCrawler {

    /** Pause applied before each visit when no sleep time is configured, in milliseconds. */
    private static final long DEFAULT_SLEEP_MILLIS = 5000L;

    /** Separator between the parts of the "already crawled" marker key. */
    private static final String KEY_SEPARATOR = "###";

    private final ResourceTransfer resourceTransfer;

    /** Pause before each visit in milliseconds; {@code null} means {@link #DEFAULT_SLEEP_MILLIS}. */
    private final Long sleepTime;

    /** Directory for HTML backups; backups are skipped when this is empty or {@code null}. */
    private final String outputPath;

    private final Boolean isResumable;

    private final String proxyPolicy;

    private final Boolean updateArticle;

    private final ProxyConfig proxyConfig;

    private final IWxCrawlService wxCrawlService;

    private final IArticleService articleService;

    private final IRedisService redisService;

    /**
     * Creates a crawler; collaborators are stored as given and nothing is validated here.
     *
     * @param outputPath       directory for HTML backups; empty or {@code null} disables backups
     * @param sleepTime        pause before each visit in milliseconds; {@code null} uses the default
     * @param resumable        whether already-crawled articles are looked up in Redis and skipped
     * @param proxyPolicy      proxy policy name, compared with {@code ProxyPolicy.ABUYUN} in {@link #start(int)}
     * @param updateArticle    {@code true} to save with {@code Operation.SAVE}, otherwise {@code Operation.ADD}
     * @param resourceTransfer used to obtain the OSS value of an article cover
     * @param articleService   receives parsed articles
     * @param redisService     stores the "already crawled" markers
     * @param proxyConfig      supplies the Abuyun account and password
     * @param wxCrawlService   parses article lists and article detail documents
     * @throws Exception declared for backward compatibility with existing callers
     */
    public WxCrawler(String outputPath, Long sleepTime, Boolean resumable, String proxyPolicy, Boolean updateArticle,
                     ResourceTransfer resourceTransfer, IArticleService articleService, IRedisService redisService,
                     ProxyConfig proxyConfig, IWxCrawlService wxCrawlService) throws Exception {
        // NOTE(review): the flag is presumably RamCrawler's auto-parse switch - confirm against WebCollector.
        super(false);
        this.outputPath = outputPath;
        this.sleepTime = sleepTime;
        this.isResumable = resumable;
        this.proxyPolicy = proxyPolicy;
        this.articleService = articleService;
        this.updateArticle = updateArticle;
        this.resourceTransfer = resourceTransfer;
        this.proxyConfig = proxyConfig;
        this.redisService = redisService;
        this.wxCrawlService = wxCrawlService;
    }

    /** Logs that the crawl job finished, then delegates to the superclass. */
    @Override
    public void afterStop() {
        log.info("Finished Weixin Crawl job");
        super.afterStop();
    }

    /**
     * Pauses for the configured sleep time, then dispatches the page to the parser matching its type.
     * Pages of any other type are ignored.
     *
     * @param page the fetched page
     * @param next collector for follow-up crawl data
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        try {
            Thread.sleep(sleepTime != null ? sleepTime.longValue() : DEFAULT_SLEEP_MILLIS);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            log.info("Failed to sleep", e);
        }
        log.info("Visit {}", page.url());
        if (page.matchType(WxCrawlerConstant.CrawlDatumType.ACCOUNT_SEARCH)) {
            parseSogouSearchResult(page, next);
        } else if (page.matchType(WxCrawlerConstant.CrawlDatumType.ARTICLE_LIST)) {
            parseWxArticleList(page, next);
        } else if (page.matchType(WxCrawlerConstant.CrawlDatumType.ARTICLE_DETAIL)) {
            parseWxArticleDetail(page, next);
        }
    }

    /**
     * Parses Sogou's search result page for WeChat official accounts and, when the first result
     * matches the requested account name, queues that account's article-list page.
     *
     * @param page the search result page
     * @param next collector for follow-up crawl data
     */
    protected void parseSogouSearchResult(Page page, CrawlDatums next) {
        String accountName = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME);
        int triedCount = page.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);

        // Give up once retries with different proxies are exhausted.
        if (triedCount > WxCrawlerConstant.MAX_TRY_COUNT) {
            log.info("Tried so many times using different proxy but all failed" +
                    ", skip, accountName：{}", accountName);
            return;
        }
        log.info("Parsing sogou search result page，accountName: {}", accountName);
        Element accountLinkEle = page.select("p.tit>a").first();
        if (accountLinkEle == null) {
            // No result link at all is treated as the current proxy being blocked.
            processBlocked(page, next);
            return;
        }
        // Guard against the search returning a different account than the one requested.
        String detectedAccount = accountLinkEle.text().trim();
        if (!accountName.equals(detectedAccount)) {
            log.info("accountName \"{}\" not matched \"{}\"", accountName, detectedAccount);
            return;
        }
        // Extract the account URL from the search result page.
        String accountUrl = accountLinkEle.attr("abs:href");
        Element wxAccountEl = page.select("p.info>label[name='em_weixinhao']").first();
        if (wxAccountEl == null || StringUtils.isEmpty(wxAccountEl.text())) {
            log.info("accountId \"{}\" not exist", accountName);
            return;
        }
        if (accountUrl.startsWith(WxCrawlerConstant.HTTP_PROTOCOL)) {
            accountUrl = accountUrl.replaceFirst(WxCrawlerConstant.HTTP_PROTOCOL, WxCrawlerConstant.HTTPS_PROTOCOL);
        }
        next.add(new CrawlDatum(accountUrl, WxCrawlerConstant.CrawlDatumType.ARTICLE_LIST)
                .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME, accountName)
                .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_ID, wxAccountEl.text())
                .meta(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT, 0));
    }

    /**
     * Handles a page that appears to be blocked: logs the proxy in use and re-queues the page
     * with an incremented tried-count.
     */
    private void processBlocked(Page page, CrawlDatums next) {
        log.info("Current proxy IP \"{}\" is blocked, use other proxy IP and try again...", getCurrentProxyInfo());
        next.add(reNewCrawlDatum(page.crawlDatum()));
    }

    /**
     * Builds a retry datum from {@code old}: same type and meta data, tried-count incremented by
     * one, and the tried-count also appended to the URL as a query-style parameter (replacing a
     * previous one if present).
     *
     * @param old the datum whose fetch is to be retried
     * @return the new datum
     */
    private CrawlDatum reNewCrawlDatum(CrawlDatum old) {
        int triedCount = old.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT) + 1;
        String oldUrl = old.url();
        int index = oldUrl.indexOf(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);
        // When a previous retry parameter exists, cut it off together with the '&' in front of it.
        // NOTE(review): the URL is presumably varied so the retry is not treated as a duplicate - confirm.
        String baseUrl = index != -1 ? oldUrl.substring(0, index - 1) : oldUrl;
        String url = baseUrl + "&" + WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT + "=" + triedCount;
        CrawlDatum newObj = new CrawlDatum(url, old.type());
        newObj.meta(old.meta()).meta(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT, triedCount);
        return newObj;
    }

    /**
     * Builds the marker key under which a crawled article is recorded:
     * trimmed account id, title and publish date joined by {@link #KEY_SEPARATOR}.
     */
    private static String buildArticleKey(String accountId, String title, String publishDate) {
        return accountId.trim() + KEY_SEPARATOR + title.trim() + KEY_SEPARATOR + publishDate.trim();
    }

    /**
     * Tells whether an article was crawled before. The lookup is skipped (result {@code false})
     * only when the resumable flag is explicitly {@code false}; a {@code null} flag still
     * consults Redis.
     *
     * @param key marker key of the article
     * @return {@code true} if the key exists in Redis and the lookup was not skipped
     */
    private boolean hasCrawled(String key) {
        if (BooleanUtils.isFalse(isResumable)) {
            return false;
        }
        return redisService.exists(key);
    }

    /**
     * Extracts the article list embedded as JSON in the page HTML, between
     * {@code ARTICLE_LIST_KEY} and the following {@code ARTICLE_LIST_SUFFIX}.
     *
     * @param page the account home page
     * @return the parsed article summaries, as returned by fastjson for the {@code "list"} field
     * @throws Exception if the markers are missing or the embedded text cannot be parsed; the
     *                   caller treats any failure as a blocked page
     */
    private List<ArticleSummaryObj> parseArticleListByPage(Page page) throws Exception {
        String html = page.html();
        int keyIndex = html.indexOf(WxCrawlerConstant.ArticleList.ARTICLE_LIST_KEY);
        if (keyIndex < 0) {
            throw new IllegalStateException("Article list key not found in page");
        }
        int startIndex = keyIndex + WxCrawlerConstant.ArticleList.ARTICLE_LIST_KEY.length();
        // Search for the suffix only after the key, so an earlier occurrence cannot break the range.
        int endIndex = html.indexOf(WxCrawlerConstant.ArticleList.ARTICLE_LIST_SUFFIX, startIndex);
        if (endIndex < 0) {
            throw new IllegalStateException("Article list suffix not found in page");
        }
        String jsonStr = html.substring(startIndex, endIndex).trim();
        if (jsonStr.isEmpty()) {
            throw new IllegalStateException("Article list JSON is empty");
        }
        // Drop the last character - presumably the ';' terminating the script statement; confirm.
        jsonStr = jsonStr.substring(0, jsonStr.length() - 1);
        JSONObject json = JSONObject.parseObject(jsonStr);
        if (json == null) {
            throw new IllegalStateException("Article list JSON could not be parsed");
        }
        return JSONArray.parseArray(json.getString("list"), ArticleSummaryObj.class);
    }

    /**
     * Parses the article list on a WeChat official account's home page and queues one crawl
     * datum per article that has not been crawled yet.
     *
     * @param page the account home page
     * @param next collector for follow-up crawl data
     */
    protected void parseWxArticleList(Page page, CrawlDatums next) {
        String accountName = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME);
        log.info("Parsing weixin article list page，accountName:{}", accountName);
        String accountId = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_ID);
        int triedCount = page.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);

        // Step 1: give up once retries with different proxies are exhausted.
        if (triedCount > WxCrawlerConstant.MAX_TRY_COUNT) {
            log.info("Tried so many times using different proxy but all failed" +
                    ", skip, accountName：{}", accountName);
            return;
        }

        // Step 2: extract the article list; any failure is treated as a blocked page.
        List<ArticleSummaryObj> articles;
        try {
            articles = parseArticleListByPage(page);
        } catch (Exception e1) {
            log.info("Need to enter identifying code, {}", page.url());
            processBlocked(page, next);
            return;
        }

        // Step 3: convert the summaries and add the resulting article pages as crawl seeds.
        ResultBase<List<ArticleTransferVO>> articleTransferResult =
                wxCrawlService.parseArticleList(accountId, accountName, articles);
        if (articleTransferResult.isSuccess() && CollectionUtils.isNotEmpty(articleTransferResult.getValue())) {
            for (ArticleTransferVO article : articleTransferResult.getValue()) {
                CrawlDatum crawlDatum = parseArticleSummary(article);
                if (crawlDatum != null) {
                    next.add(crawlDatum);
                }
            }
        }
    }

    /**
     * Describes the proxy currently in use, for logging.
     *
     * @return the current proxy of a {@link WxProxyRequest}; the requester's class name for any
     *         other requester type; {@code "no proxy"} when there is no requester or no current proxy
     */
    private String getCurrentProxyInfo() {
        Object requester = this.getRequester();
        if (requester instanceof WxProxyRequest) {
            Proxy currentProxy = ((WxProxyRequest) requester).getCurrentProxy();
            if (currentProxy != null) {
                return currentProxy.toString();
            }
        } else if (requester != null) {
            // start() may install a requester that is not a WxProxyRequest; a blind cast would throw.
            return requester.getClass().getSimpleName();
        }
        return "no proxy";
    }

    /**
     * Turns one article of the article list into a crawl datum for its detail page.
     *
     * @param articleTransferVO the article summary
     * @return the crawl datum, or {@code null} when the article was already crawled
     */
    private CrawlDatum parseArticleSummary(ArticleTransferVO articleTransferVO) {
        String key = buildArticleKey(articleTransferVO.getAccountId(), articleTransferVO.getTitle(),
                articleTransferVO.getPublishDate());
        if (hasCrawled(key)) {
            log.info("Article has crawled, skip, accountName：{}，article：{}", articleTransferVO.getAccountName(),
                    articleTransferVO.getTitle());
            return null;
        }
        String cover = articleTransferVO.getCover();
        ResultBase<String> newURL = resourceTransfer.getOssValue(cover);
        if (newURL.isSuccess()) {
            articleTransferVO.setOssCover(newURL.getValue());
        } else {
            // A failed cover transfer is only logged; the article is still queued.
            log.info("Failed to CoverImage resourceTranslation, article: {}, cover: {}", articleTransferVO.getTitle(), cover);
        }
        return CrawlDatumConvert.convert2ArticleSummaryCrawlDatum(articleTransferVO);
    }

    /**
     * Parses a WeChat article detail page: optionally backs the converted HTML up to
     * {@code outputPath}, saves the article through {@link IArticleService} and, on success,
     * records the article as crawled. A result whose message contains {@code "blocked ip"}
     * re-queues the page; other failures are logged.
     *
     * @param page the article detail page
     * @param next collector for follow-up crawl data
     */
    protected void parseWxArticleDetail(Page page, CrawlDatums next) {
        String accountName = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME);
        String accountId = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_ID);
        String cover = page.meta(WxCrawlerConstant.CrawlMetaKey.ARTICLE_COVER);
        String title = page.meta(WxCrawlerConstant.CrawlMetaKey.ARTICLE_TITLE);
        String digest = page.meta(WxCrawlerConstant.CrawlMetaKey.ARTICLE_DIGEST);
        String publishDate = page.meta(WxCrawlerConstant.CrawlMetaKey.ARTICLE_PUBLISH_DATE);
        String author = page.meta(WxCrawlerConstant.CrawlMetaKey.ARTICLE_AUTHOR);
        int triedCount = page.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);

        // Give up once retries with different proxies are exhausted.
        if (triedCount > WxCrawlerConstant.MAX_TRY_COUNT) {
            log.info("Tried so many times using different proxy but all failed" +
                    ", skip, accountName：{}，article：{}", accountName, title);
            return;
        }

        String key = buildArticleKey(accountId, title, publishDate);
        if (hasCrawled(key)) {
            log.info("This article has crawled, skip, accountName：{}，article：{}", accountName, title);
            return;
        }

        try {
            Document sourceDoc = Jsoup.parse(page.html());
            ResultBase<ArticleTransferVO> articleTransferVOResultBase = wxCrawlService.parseArticleDetail(sourceDoc);
            if (articleTransferVOResultBase.isSuccess()) {
                ArticleTransferVO article = articleTransferVOResultBase.getValue();
                log.info("accountName: {}, accountId: {}, cover: {}, title: {}, author: {}, publishDate: {}, digest: {}",
                        accountName, accountId, cover, title, author, publishDate, digest);

                // Back up the converted HTML, with the article meta data stored as <head> attributes.
                if (StringUtils.isNotEmpty(outputPath)) {
                    Document targetDoc = article.getTargetDoc();
                    targetDoc.head().attr(WxCrawlerConstant.BackupArticle.AUTHOR, author)
                            .attr(WxCrawlerConstant.BackupArticle.COVER, cover)
                            .attr(WxCrawlerConstant.BackupArticle.DIGEST, digest)
                            .attr(WxCrawlerConstant.BackupArticle.ACCOUNT_ID, accountId)
                            .attr(WxCrawlerConstant.BackupArticle.ACCOUNT_NAME, accountName)
                            .attr(WxCrawlerConstant.BackupArticle.PUBLISH_DATE, publishDate)
                            .attr(WxCrawlerConstant.BackupArticle.ARTICLE_TITLE, title)
                            .attr(WxCrawlerConstant.BackupArticle.ARTICLE_TYPE, article.getArticleType());

                    org.apache.commons.io.FileUtils.writeStringToFile(new File(FileUtils.getOutputAccountPath(outputPath, accountName),
                                    FileUtils.normalize(accountName + "_" + title + ".html")),
                            FileUtils.replaceEmoji(targetDoc.outerHtml()), "UTF-8");
                }

                // Null-safe: an unset updateArticle flag falls back to ADD instead of throwing on unboxing.
                IArticleService.Operation operation = BooleanUtils.isTrue(updateArticle)
                        ? IArticleService.Operation.SAVE
                        : IArticleService.Operation.ADD;
                ResultBase<String> result = articleService.save(article, operation);
                log.info(result.getMessage());
                if (result.isSuccess()) {
                    setCrawlInfo(key, "");
                }
            } else if (StringUtils.isNotEmpty(articleTransferVOResultBase.getMessage())
                    && articleTransferVOResultBase.getMessage().contains("blocked ip")) {
                processBlocked(page, next);
            } else {
                log.info("Failed to parse detail html, accountName：{}，article：{}", accountName, title);
            }
        } catch (Exception ex) {
            // Pass the throwable as the last argument so the stack trace is always logged.
            log.error("Failed to parseWxArticleDetail", ex);
        }
    }

    /** Records {@code key} in Redis with {@code articleNo} as its value, marking the article as crawled. */
    private void setCrawlInfo(String key, String articleNo) {
        redisService.set(key, articleNo);
    }

    /**
     * Installs the requester matching the proxy policy - an {@code AbuyunProxyRequester} for the
     * Abuyun policy (case-insensitive), otherwise a {@link WxProxyRequest} - and starts the crawl.
     *
     * @param depth passed through to the superclass
     * @throws Exception if the superclass fails to start
     */
    @Override
    public void start(int depth) throws Exception {
        if (WxCrawlerConstant.ProxyPolicy.ABUYUN.equalsIgnoreCase(proxyPolicy)) {
            setRequester(new AbuyunProxyRequester(proxyConfig.getAbuyunAccount(), proxyConfig.getAbuyunPassword()));
        } else {
            setRequester(new WxProxyRequest(proxyPolicy));
        }
        super.start(depth);
    }

    /**
     * Adds a seed URL for the given official-account name: the search URL followed by the
     * URL-encoded name, with the tried-count initialised to 0.
     *
     * @param account the official-account name to search for
     * @throws UnsupportedEncodingException if the {@code utf-8} encoding is not supported
     */
    public void addAccount(String account) throws UnsupportedEncodingException {
        String seedUrl = WxCrawlerConstant.SEARCH_URL + URLEncoder.encode(account, "utf-8");
        CrawlDatum seed = new CrawlDatum(seedUrl, WxCrawlerConstant.CrawlDatumType.ACCOUNT_SEARCH)
                .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME, account)
                .meta(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT, 0);
        addSeed(seed);
    }
}