package com.xuxueli.crawler.util;

import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.model.PageRequest;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * jsoup tool
 *
 * @author xuxueli 2015-05-14 22:44:43
 */
public class JsoupUtil {
    private static Logger logger = LoggerFactory.getLogger(JsoupUtil.class);

    /**
     * 加载页面
     *
     * @param pageRequest
     *
     * @return Document
     */
    public static Document load(PageRequest pageRequest) {
        if (!UrlUtil.isUrl(pageRequest.getUrl())) {
            return null;
        }
        try {
            // 请求设置
            Connection conn = Jsoup.connect(pageRequest.getUrl());
            if (pageRequest.getParamMap() != null && !pageRequest.getParamMap().isEmpty()) {
                conn.data(pageRequest.getParamMap());
            }
            if (pageRequest.getCookieMap() != null && !pageRequest.getCookieMap().isEmpty()) {
                conn.cookies(pageRequest.getCookieMap());
            }
            if (pageRequest.getHeaderMap()!=null && !pageRequest.getHeaderMap().isEmpty()) {
                conn.headers(pageRequest.getHeaderMap());
            }
            if (pageRequest.getUserAgent()!=null) {
                conn.userAgent(pageRequest.getUserAgent());
            }
            if (pageRequest.getReferrer() != null) {
                conn.referrer(pageRequest.getReferrer());
            }
            conn.timeout(pageRequest.getTimeoutMillis());
            conn.validateTLSCertificates(pageRequest.isValidateTLSCertificates());
            conn.maxBodySize(0);    // 取消默认1M限制

            // 代理
            if (pageRequest.getProxy() != null) {
                conn.proxy(pageRequest.getProxy());
            }

            // 发出请求
            Document html = null;
            if (pageRequest.isIfPost()) {
                html = conn.post();
            } else {
                html = conn.get();
            }
            return html;
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }


    public static String loadPageSource(PageRequest pageRequest) {
        if (!UrlUtil.isUrl(pageRequest.getUrl())) {
            return null;
        }
        try {
            // 请求设置
            Connection conn = Jsoup.connect(pageRequest.getUrl());
            if (pageRequest.getParamMap() != null && !pageRequest.getParamMap().isEmpty()) {
                conn.data(pageRequest.getParamMap());
            }
            if (pageRequest.getCookieMap() != null && !pageRequest.getCookieMap().isEmpty()) {
                conn.cookies(pageRequest.getCookieMap());
            }
            if (pageRequest.getHeaderMap()!=null && !pageRequest.getHeaderMap().isEmpty()) {
                conn.headers(pageRequest.getHeaderMap());
            }
            if (pageRequest.getUserAgent()!=null) {
                conn.userAgent(pageRequest.getUserAgent());
            }
            if (pageRequest.getReferrer() != null) {
                conn.referrer(pageRequest.getReferrer());
            }
            conn.timeout(pageRequest.getTimeoutMillis());
            conn.validateTLSCertificates(pageRequest.isValidateTLSCertificates());
            conn.maxBodySize(0);    // 取消默认1M限制

            // 代理
            if (pageRequest.getProxy() != null) {
                conn.proxy(pageRequest.getProxy());
            }

            conn.ignoreContentType(true);
            conn.method(pageRequest.isIfPost()?Connection.Method.POST:Connection.Method.GET);

            // 发出请求
            Connection.Response resp = conn.execute();
            String pageSource = resp.body();
            return pageSource;
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * 抽取元素数据
     *
     * @param fieldElement
     * @param selectType
     * @param selectVal
     * @return String
     */
    public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectType selectType, String selectVal) {
        String fieldElementOrigin = null;
        if (XxlCrawlerConf.SelectType.HTML == selectType) {
            fieldElementOrigin = fieldElement.html();
        } else if (XxlCrawlerConf.SelectType.VAL == selectType) {
            fieldElementOrigin = fieldElement.val();
        } else if (XxlCrawlerConf.SelectType.TEXT == selectType) {
            fieldElementOrigin = fieldElement.text();
        } else if (XxlCrawlerConf.SelectType.ATTR == selectType) {
            fieldElementOrigin = fieldElement.attr(selectVal);
        }  else if (XxlCrawlerConf.SelectType.HAS_CLASS == selectType) {
            fieldElementOrigin = String.valueOf(fieldElement.hasClass(selectVal));
        }  else {
            fieldElementOrigin = fieldElement.toString();
        }
        return fieldElementOrigin;
    }

    /**
     * 获取页面上所有超链接地址 (<a>标签的href值)
     *
     * @param html  页面文档
     * @return Set<String>
     */
    public static Set<String> findLinks(Document html) {

        if (html == null) {
            return null;
        }

        // element
        /**
         *
         * Elements resultSelect = html.select(tagName);	// 选择器方式
         * Element resultId = html.getElementById(tagName);	// 元素ID方式
         * Elements resultClass = html.getElementsByClass(tagName);	// ClassName方式
         * Elements resultTag = html.getElementsByTag(tagName);	// html标签方式 "body"
         *
         */
        Elements hrefElements = html.select("a[href]");

        // 抽取数据
        Set<String> links = new HashSet<String>();
        if (hrefElements!=null && hrefElements.size() > 0) {
            for (Element item : hrefElements) {
                String href = item.attr("abs:href");    // href、abs:href
                if (UrlUtil.isUrl(href)) {
                    links.add(href);
                }
            }
        }
        return links;
    }

    /**
     * 获取页面上所有图片地址 (<a>标签的href值)
     *
     * @param html
     * @return Set<String>
     */
    public static Set<String> findImages(Document html) {

        Elements imgs = html.getElementsByTag("img");

        Set<String> images = new HashSet<String>();
        if (imgs!=null && imgs.size() > 0) {
            for (Element element: imgs) {
                String imgSrc = element.attr("abs:src");
                images.add(imgSrc);
            }
        }

        return images;
    }

}