/**
 *
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, [email protected]
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.seo.rank.impl;

import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seo.rank.SimilarChecker;
import org.seo.rank.tools.DynamicIp;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.math.BigDecimal;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Similarity checker for ITEYE blog posts.
 *
 * <p>Each post is downloaded, its title and body text are extracted with jsoup,
 * the text is segmented into words, and the two resulting word-frequency maps
 * are compared to produce a score.
 *
 * @author 杨尚川
 */
public class ITEYEBlogSimilarChecker implements SimilarChecker{
    private static final Logger LOGGER = LoggerFactory.getLogger(ITEYEBlogSimilarChecker.class);
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    // NOTE(review): Referer and Host are fixed to a single blog regardless of the
    // URL actually requested - confirm this is intended.
    private static final String REFERER = "http://yangshangchuan.iteye.com";
    private static final String HOST = "yangshangchuan.iteye.com";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
    private static final String BLOG_CSS_PATH = "html body div#page div#content.clearfix div#main div.blog_main";
    private static final String BLOG_TITLE_CSS_PATH = "div.blog_title";
    private static final String BLOG_CONTENT_CSS_PATH = "div#blog_content.blog_content";
    // Declared as double: a float 0.8F widens to 0.800000011920929 when compared
    // with a double score, so a score of exactly 0.80 would never reach the threshold.
    private static final double THRESHOLD_RATE = 0.8;
    // Total number of fetch attempts allowed per retry phase in getHtml.
    private static final int MAX_ATTEMPTS = 4;
    // One shared generator instead of a new Random for every octet.
    private static final Random RANDOM = new Random();

    /**
     * Tells whether two blog posts are similar.
     *
     * @param url1 URL of the first post
     * @param url2 URL of the second post
     * @return {@code true} when {@link #similarScore(String, String)} is at least 0.8
     */
    @Override
    public boolean isSimilar(String url1, String url2) {
        return similarScore(url1, url2) >= THRESHOLD_RATE;
    }

    /**
     * Computes the similarity score of two blog posts.
     *
     * @param url1 URL of the first post
     * @param url2 URL of the second post
     * @return the score truncated to two decimal places, or 0 when either post
     *         could not be fetched or parsed
     */
    @Override
    public double similarScore(String url1, String url2) {
        Blog blog1 = getBlog(url1);
        if (blog1 == null) {
            return 0;
        }
        Blog blog2 = getBlog(url2);
        if (blog2 == null) {
            return 0;
        }
        return truncateToTwoDecimals(score(blog1, blog2));
    }

    /**
     * Truncates a value to two decimal places.
     *
     * <p>Goes through the decimal representation of the value so that, for
     * example, 0.29 stays 0.29 instead of becoming 0.28 as it would with
     * {@code (int)(value*100)/100.0}.
     *
     * @param value the value to truncate
     * @return the truncated value, or 0 for NaN and infinite input
     */
    private static double truncateToTwoDecimals(double value) {
        if (Double.isNaN(value) || Double.isInfinite(value)) {
            return 0;
        }
        return BigDecimal.valueOf(value)
                .setScale(2, java.math.RoundingMode.DOWN)
                .doubleValue();
    }

    /**
     * Segments both posts, counts word frequencies and scores them.
     *
     * @param blog1 first post
     * @param blog2 second post
     * @return the raw (not truncated) similarity score
     */
    private double score(Blog blog1, Blog blog2){
        // Word segmentation of title plus content.
        List<Word> blog1Words = WordSegmenter.seg(blog1.getTitle()+"\n"+blog1.getContent());
        List<Word> blog2Words = WordSegmenter.seg(blog2.getTitle()+"\n"+blog2.getContent());
        // Word-frequency statistics.
        Map<Word, AtomicInteger> blog1WordsFre = frequence(blog1Words);
        Map<Word, AtomicInteger> blog2WordsFre = frequence(blog2Words);
        // Dump the details only when debug logging is on.
        if (LOGGER.isDebugEnabled()) {
            showDetail(blog1, blog1Words, blog1WordsFre);
            showDetail(blog2, blog2Words, blog2WordsFre);
        }
        // Score by simple shared-word ratio.
        return simpleScore(blog1WordsFre, blog2WordsFre);
        // Alternative: cosine similarity.
        //return cosScore(blog1WordsFre, blog2WordsFre);
    }

    /**
     * Similarity method one: simple shared words.
     *
     * <p>The score is the number of distinct words present in both posts divided
     * by the distinct-word count of the post that has fewer distinct words.
     *
     * @param blog1WordsFre word frequencies of the first post
     * @param blog2WordsFre word frequencies of the second post
     * @return the ratio, or 0 when either post has no words
     */
    private double simpleScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){
        // Count the words that occur in both posts.
        int common = 0;
        for (Word word : blog1WordsFre.keySet()) {
            if (blog2WordsFre.containsKey(word)) {
                common++;
            }
        }
        int size1 = blog1WordsFre.size();
        int size2 = blog2WordsFre.size();
        LOGGER.info("网页1有的词数:{}", size1);
        LOGGER.info("网页2有的词数:{}", size2);
        LOGGER.info("网页1和2共有的词数:{}", common);
        int smaller = Math.min(size1, size2);
        // Guard the division: an empty word set would otherwise yield NaN.
        double score = smaller == 0 ? 0 : common / (double) smaller;
        LOGGER.info("相似度分值={}/(double)Math.min({}, {})={}", common, size1, size2, score);
        return score;
    }

    /**
     * Similarity method two: cosine similarity.
     *
     * <p>Principle: for vectors a=(x1,y1) and b=(x2,y2),
     * a.b = x1x2+y1y2, |a| = sqrt(x1^2+y1^2), |b| = sqrt(x2^2+y2^2), and
     * cos = a.b / (|a|*|b|).
     *
     * <p>The vector dimensions are the union of the words of both posts and the
     * weight of each dimension is the word frequency. (Original author's note:
     * stop words have already been removed during segmentation.)
     *
     * @param blog1WordsFre word frequencies of the first post
     * @param blog2WordsFre word frequencies of the second post
     * @return the cosine of the angle between the two frequency vectors, or 0
     *         when either vector is empty
     */
    private double cosScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){
        // a.b - only words present in both posts contribute.
        // Accumulated in long so that large frequencies cannot overflow int.
        long dot = 0;
        for (Map.Entry<Word, AtomicInteger> entry : blog1WordsFre.entrySet()) {
            AtomicInteger other = blog2WordsFre.get(entry.getKey());
            if (other != null) {
                dot += (long) entry.getValue().get() * other.get();
            }
        }
        // |a|^2 and |b|^2
        long squared1 = squaredNorm(blog1WordsFre.values());
        long squared2 = squaredNorm(blog2WordsFre.values());
        if (squared1 == 0 || squared2 == 0) {
            // A zero-length vector has no direction; avoid dividing by zero.
            return 0;
        }
        return dot / (Math.sqrt(squared1) * Math.sqrt(squared2));
    }

    /**
     * Sums the squares of the given frequencies.
     *
     * @param frequencies the frequency counters
     * @return the sum of squares
     */
    private static long squaredNorm(Collection<AtomicInteger> frequencies) {
        long sum = 0;
        for (AtomicInteger frequency : frequencies) {
            long value = frequency.get();
            sum += value * value;
        }
        return sum;
    }

    /**
     * Logs, at debug level, the post's URL, title, content, content length,
     * segmentation result and word frequencies in descending frequency order.
     *
     * @param blog the post
     * @param blogWords segmentation result of the post
     * @param blogWordsFre word frequencies of the post
     */
    private void showDetail(Blog blog, List<Word> blogWords, Map<Word, AtomicInteger> blogWordsFre){
        LOGGER.debug("博文URL:");
        LOGGER.debug("\t{}", blog.getUrl());
        LOGGER.debug("博文标题:");
        LOGGER.debug("\t{}", blog.getTitle());
        LOGGER.debug("博文内容:");
        LOGGER.debug("\t{}", blog.getContent());
        LOGGER.debug("博文长度:{}", blog.getContent().length());
        LOGGER.debug("博文分词结果:");
        LOGGER.debug("\t{}", blogWords);
        LOGGER.debug("博文词频统计:");
        List<Map.Entry<Word, AtomicInteger>> entries = new ArrayList<>(blogWordsFre.entrySet());
        // Integer.compare instead of subtraction, which can overflow.
        entries.sort((a, b) -> Integer.compare(b.getValue().get(), a.getValue().get()));
        int rank = 0;
        for (Map.Entry<Word, AtomicInteger> entry : entries) {
            rank++;
            LOGGER.debug("\t{}、{}={}", rank, entry.getKey(), entry.getValue());
        }
    }

    /**
     * Counts how often each word occurs.
     *
     * @param words the segmented words
     * @return a map from word to its number of occurrences
     */
    private Map<Word, AtomicInteger> frequence(List<Word> words){
        Map<Word, AtomicInteger> fre = new HashMap<>();
        for (Word word : words) {
            // Creates the counter only the first time a word is seen.
            fre.computeIfAbsent(word, key -> new AtomicInteger()).incrementAndGet();
        }
        return fre;
    }

    /**
     * Downloads a post and extracts its title and content text.
     *
     * @param url URL of the post
     * @return the post, or {@code null} when the page could not be fetched or
     *         title/content could not both be located
     */
    private Blog getBlog(String url) {
        try {
            String html = getHtml(url);
            if (html == null) {
                // Every fetch attempt failed; the failures were already logged.
                return null;
            }
            Document doc = Jsoup.parse(html);
            Elements elements = doc.select(BLOG_CSS_PATH);
            String title = null;
            String content = null;
            for (Element element : elements) {
                // A value is only accepted when its selector matches exactly one element.
                Elements ts = element.select(BLOG_TITLE_CSS_PATH);
                if (ts.size() == 1) {
                    title = ts.get(0).text();
                }
                ts = element.select(BLOG_CONTENT_CSS_PATH);
                if (ts.size() == 1) {
                    content = ts.get(0).text();
                }
            }
            if (title != null && content != null) {
                return new Blog(url, title, content);
            }
        } catch (Exception e) {
            LOGGER.error("获取博文失败", e);
        }
        return null;
    }

    /**
     * Fetches the page HTML, switching to a new IP and retrying when the fetch
     * fails or when the returned page is the site's rate-limit / captcha page.
     *
     * @param url URL to fetch
     * @return the HTML, or {@code null} when no attempt succeeded; the returned
     *         HTML may still be the block page if the retries were exhausted
     */
    private String getHtml(String url){
        String html = getHtmlInternal(url);
        int times = 1;
        while (html == null && times < MAX_ATTEMPTS) {
            times++;
            // Use a new IP address.
            DynamicIp.toNewIp();
            html = getHtmlInternal(url);
        }
        times = 1;
        // html may be null here (all attempts failed) or become null inside the
        // loop (a retry failed), so it is checked before every contains() call.
        while (html != null && isBlockedPage(html) && times < MAX_ATTEMPTS) {
            times++;
            // Use a new IP address.
            DynamicIp.toNewIp();
            html = getHtmlInternal(url);
        }
        return html;
    }

    /**
     * Tells whether the HTML contains one of the messages the site shows when it
     * throttles or blocks the client.
     *
     * @param html page HTML, not null
     * @return {@code true} when one of the known block messages is present
     */
    private static boolean isBlockedPage(String html) {
        return html.contains("非常抱歉,来自您ip的请求异常频繁")
                || html.contains("请您点击按钮解除封锁")
                || html.contains("请输入以下验证码");
    }

    /**
     * Performs one HTTP GET with browser-like headers and randomized
     * forwarded-for style headers.
     *
     * @param url URL to fetch
     * @return the HTML of the parsed response, or {@code null} on any failure
     */
    private String getHtmlInternal(String url) {
        try {
            return Jsoup.connect(url)
                    .header("Accept", ACCEPT)
                    .header("Accept-Encoding", ENCODING)
                    .header("Accept-Language", LANGUAGE)
                    .header("Connection", CONNECTION)
                    .header("Referer", REFERER)
                    .header("Host", HOST)
                    .header("User-Agent", USER_AGENT)
                    .header("X-Forwarded-For", getRandomIp())
                    .header("Proxy-Client-IP", getRandomIp())
                    .header("WL-Proxy-Client-IP", getRandomIp())
                    .ignoreContentType(true)
                    .timeout(30000)
                    .get().html();
        } catch (Exception e) {
            LOGGER.error("获取博文失败", e);
        }
        return null;
    }

    /**
     * Generates a random dotted-quad IPv4 address whose octets are each in
     * 1..254 and which lies outside the private ranges 10.0.0.0/8,
     * 172.16.0.0/12 and 192.168.0.0/16.
     *
     * @return the address as text
     */
    private String getRandomIp(){
        int first;
        int second;
        // Re-draw both octets together until the pair is outside every private
        // range, so a re-roll can never land back in a range already excluded.
        do {
            first = randomOctet();
            second = randomOctet();
        } while (isPrivateRange(first, second));
        int third = randomOctet();
        int forth = randomOctet();
        return first+"."+second+"."+third+"."+forth;
    }

    /**
     * @return a random value in 1..254
     */
    private static int randomOctet() {
        return RANDOM.nextInt(254) + 1;
    }

    /**
     * Tells whether the leading two octets fall in a private IPv4 range.
     *
     * @param first first octet
     * @param second second octet
     * @return {@code true} for 10.x, 172.16-31.x and 192.168.x
     */
    private static boolean isPrivateRange(int first, int second) {
        return first == 10
                || (first == 172 && second >= 16 && second <= 31)
                || (first == 192 && second == 168);
    }

    /**
     * Immutable holder for a fetched post: its URL, title text and content text.
     */
    private static class Blog{
        private final String url;
        private final String title;
        private final String content;

        Blog(String url, String title, String content) {
            this.url = url;
            this.title = title;
            this.content = content;
        }

        public String getUrl() {
            return url;
        }

        public String getTitle() {
            return title;
        }

        public String getContent() {
            return content;
        }
    }

    /**
     * Demo entry point: scores two fixed ITEYE posts and logs the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SimilarChecker similarChecker = new ITEYEBlogSimilarChecker();
        double score = similarChecker.similarScore("http://baidu-27233181.iteye.com/blog/2200707",
                "http://baidu-27233181.iteye.com/blog/2200706");
        LOGGER.info("相似度分值:{}", score);
    }
}