package com.jibug.cetty.sample.handler; import com.jibug.cetty.core.Page; import com.jibug.cetty.core.Seed; import com.jibug.cetty.core.handler.HandlerContext; import com.jibug.cetty.sample.entity.Article; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.parser.Tag; import org.jsoup.select.Elements; import org.springframework.stereotype.Component; import java.text.ParseException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 抓取顾小北博客 * http://www.guxiaobei.com/shopify * * @author heyingcai */ @Component public class GuxiaobeiPageHandler extends BasePageHandler { private static final Pattern PAGE_REGEX_PATTERN = Pattern.compile("/page/(\\d+)"); @Override public void process(HandlerContext ctx, Page page) { parseRoute(ctx, page); } @Override protected void parseRoute(HandlerContext ctx, Page page) { String pageUrl = page.getUrl(); if (pageUrl.contains("/page/")) { parseListing(ctx, page); } else { parseBody(ctx, page); } } /** * 解析文章列表 * * @param page */ @Override public void parseListing(HandlerContext ctx, Page page) { Document document = page.getDocument(); Elements articles = document.select("article.excerpt"); List<Seed> seeds = new ArrayList<>(); for (Element article : articles) { Elements header = article.select("header>h2>a"); String url = header.attr("href"); String title = header.attr("title"); Elements listPhoto = article.select("div.focus>a>img"); String listPhotoUrl = listPhoto.attr("src"); String summary = article.select("span.note").first().ownText(); Seed seed = new Seed(url); seed.putAttach("via", page.getSeed().getAttach("via")); seed.putAttach("listPhoto", listPhotoUrl); seed.putAttach("summary", summary); seed.putAttach("title", title); seeds.add(seed); } Matcher matcher = PAGE_REGEX_PATTERN.matcher(page.getUrl()); if (!matcher.find()) { return; } final String pageNumStr = matcher.group(0).replace("/page/", ""); int nextPageNum = Integer.parseInt(pageNumStr); int pageLimit = Integer.parseInt(page.getSeed().getAttach("pageLimit").toString()); if (++nextPageNum <= pageLimit) { String nextPageUrl = String.format("http://www.guxiaobei.com/search/shopify/page/%d", nextPageNum); Seed seed = new Seed(nextPageUrl); seed.putAttach("pageLimit", page.getSeed().getAttach("pageLimit").toString()); seed.putAttach("via", page.getSeed().getAttach("via").toString()); page.addNextSeed(seed); } page.addNextSeed(seeds); ctx.fireReduce(page); } /** * 解析文章内容 * * @param page */ @Override public void parseBody(HandlerContext ctx, Page page) { Document document = page.getDocument(); String publishTime = ""; try { publishTime = dealDateFormat(document.select("meta[property=article:published_time]").attr("content")); } catch (ParseException e) { e.printStackTrace(); } Article article = new Article(); article.setPublishTime(publishTime); Elements content = document.select("article.article-content>*"); buildArticle(article, page, appendBody(content)); page.getResult().putField("article", article); ctx.fireReduce(page); } @Override public Element appendBody(Elements tempBody) { final Element articleBody = new Element(Tag.valueOf("div"), ""); for (final Element pEl : tempBody) { if (pEl.select("div.open-message,div.jp-relatedposts,div.article-social").size() != 0) { continue; } if (pEl.tagName().equals("p")) { Element imgEl = pEl.select("img").first(); if (imgEl != null) { String src = imgEl.attr("src"); if (src.contains("data:image")) { src = imgEl.attr("data-src"); } else if (!src.contains("www.guxiaobei.com")) { src = "http://www.guxiaobei.com" + src; } imgEl.attr("src", src); articleBody.appendChild(buildFigure(imgEl)); continue; } } articleBody.appendChild(pEl); } return articleBody; } }