package com.geccocrawler.gecco.spider.render.html; import java.lang.reflect.Field; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.parser.Parser; import org.jsoup.select.Elements; import com.geccocrawler.gecco.annotation.Attr; import com.geccocrawler.gecco.annotation.Href; import com.geccocrawler.gecco.annotation.Html; import com.geccocrawler.gecco.annotation.Image; import com.geccocrawler.gecco.annotation.Text; import com.geccocrawler.gecco.request.HttpRequest; import com.geccocrawler.gecco.response.HttpResponse; import com.geccocrawler.gecco.spider.SpiderBean; import com.geccocrawler.gecco.spider.SpiderThreadLocal; import com.geccocrawler.gecco.spider.conversion.Conversion; import com.geccocrawler.gecco.spider.render.Render; import com.geccocrawler.gecco.spider.render.RenderContext; import com.geccocrawler.gecco.spider.render.RenderType; public class HtmlParser { private Log log; private Document document; private String baseUri; public HtmlParser(String baseUri, String content) { long beginTime = System.currentTimeMillis(); log = LogFactory.getLog(HtmlParser.class); this.baseUri = baseUri; if (isTable(content)) { this.document = Jsoup.parse(content, baseUri, Parser.xmlParser()); } else { this.document = Jsoup.parse(content, baseUri); } long endTime = System.currentTimeMillis(); if (log.isTraceEnabled()) { log.trace("init html parser : " + (endTime - beginTime) + "ms"); } } public String baseUri() { return baseUri; } public Object $basic(String selector, Field field) throws Exception { if (field.isAnnotationPresent(Text.class)) {// @Text Text text = field.getAnnotation(Text.class); String value = $text(selector, text.own()); return Conversion.getValue(field.getType(), value); } else if (field.isAnnotationPresent(Image.class)) {// @Image Image image = field.getAnnotation(Image.class); String imageSrc = $image(selector, image.value()); /*String localPath = DownloadImage.download(image.download(), imageSrc); if (StringUtils.isNotEmpty(localPath)) { return localPath; }*/ return imageSrc; } else if (field.isAnnotationPresent(Href.class)) {// @Href Href href = field.getAnnotation(Href.class); String url = $href(selector, href.value()); return url; } else if (field.isAnnotationPresent(Attr.class)) {// @Attr Attr attr = field.getAnnotation(Attr.class); String name = attr.value(); return Conversion.getValue(field.getType(), $attr(selector, name)); } else if (field.isAnnotationPresent(Html.class)) {// @Html Html html = field.getAnnotation(Html.class); return $html(selector, html.outer()); } else {// @Html return $html(selector); } } public List<Object> $basicList(String selector, Field field) throws Exception { List<Object> list = new ArrayList<Object>(); Elements els = $(selector); for (Element el : els) { if (field.isAnnotationPresent(Text.class)) {// @Text Text text = field.getAnnotation(Text.class); list.add(Conversion.getValue(field.getType(), $text(el, text.own()))); } else if (field.isAnnotationPresent(Image.class)) {// @Image Image image = field.getAnnotation(Image.class); String imageSrc = $image(el, image.value()); /*String localPath = DownloadImage.download(image.download(), imageSrc); if (StringUtils.isNotEmpty(localPath)) { list.add(localPath); }*/ list.add(imageSrc); } else if (field.isAnnotationPresent(Href.class)) {// @Href Href href = field.getAnnotation(Href.class); String url = $href(el, href.value()); list.add(url); } else if (field.isAnnotationPresent(Attr.class)) {// @Attr Attr attr = field.getAnnotation(Attr.class); String name = attr.value(); list.add(Conversion.getValue(field.getType(), $attr(el, name))); } else if (field.isAnnotationPresent(Html.class)) {// @Html Html html = field.getAnnotation(Html.class); list.add(html.outer() ? el.outerHtml() : el.html()); } else {// Other list.add(el.html()); } } return list; } public SpiderBean $bean(String selector, HttpRequest request, Class<? extends SpiderBean> clazz) { String subHtml = $html(selector); // table HttpResponse subResponse = HttpResponse.createSimple(subHtml); Render render = RenderContext.getRender(RenderType.HTML); return render.inject(clazz, request, subResponse); } public List<SpiderBean> $beanList(String selector, HttpRequest request, Class<? extends SpiderBean> clazz) { List<SpiderBean> list = new ArrayList<SpiderBean>(); List<String> els = $list(selector); for (String el : els) { // table HttpResponse subResponse = HttpResponse.createSimple(el); Render render = RenderContext.getRender(RenderType.HTML); SpiderBean subBean = render.inject(clazz, request, subResponse); list.add(subBean); } return list; } public Elements $(String selector) { Elements elements = document.select(selector); if (SpiderThreadLocal.get().getEngine().isDebug()) { if (!selector.equalsIgnoreCase("script")) { // log.debug("["+selector+"]--->["+elements+"]"); System.out.println("[" + selector + "]--->[" + elements + "]"); } } return elements; } public Element $element(String selector) { Elements elements = $(selector); if (elements != null && elements.size() > 0) { return elements.first(); } return null; } public List<String> $list(String selector) { List<String> list = new ArrayList<String>(); Elements elements = $(selector); if (elements != null) { for (Element ele : elements) { list.add(ele.outerHtml()); } } return list; } public String $html(String selector) { return $html(selector, false); } public String $html(String selector, boolean isOuter) { Elements elements = $(selector); if (elements != null) { if(isOuter) { return elements.outerHtml(); } return elements.html(); } return null; } public String $text(Element element, boolean own) { if (element == null) { return null; } String text = ""; if (own) { text = element.ownText(); } else { text = element.text(); } // 替换掉空格信息 return StringUtils.replace(text, "\u00A0", ""); } public String $text(String selector, boolean own) { Element element = $element(selector); if (element != null) { return $text(element, own); } return null; } public String $attr(Element element, String attr) { if (element == null) { return null; } return element.attr(attr); } public String $attr(String selector, String attr) { Element element = $element(selector); if (element == null) { return null; } return element.attr(attr); } public String $href(Element href, String attr) { if (href == null) { return null; } return href.absUrl(attr); } public String $href(Element href, String... attrs) { if (href == null) { return null; } for (String attr : attrs) { String value = $href(href, attr); if (StringUtils.isNotEmpty(value)) { return value; } } return $href(href, "href"); } public String $href(String selector, String attr) { return $href($element(selector), attr); } public String $href(String selector, String... attrs) { return $href($element(selector), attrs); } public String $image(Element img, String attr) { if (img == null) { return null; } return img.absUrl(attr); } public String $image(Element img, String... attrs) { if (img == null) { return null; } for (String attr : attrs) { String value = $image(img, attr); if (StringUtils.isNotEmpty(value)) { return value; } } return $image(img, "src"); } public String $image(String selector, String attr) { return $image($element(selector), attr); } public String $image(String selector, String... attrs) { return $image($element(selector), attrs); } public void setLogClass(Class<? extends SpiderBean> spiderBeanClass) { log = LogFactory.getLog(spiderBeanClass); } private boolean isTable(String content) { if (!StringUtils.contains(content, "</html>")) { String rege = "<\\s*(thead|tbody|tr|td|th)[\\s\\S]+"; Pattern pattern = Pattern.compile(rege); Matcher matcher = pattern.matcher(content); if (matcher.matches()) { return true; } } return false; } }