/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * [email protected]
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.superword.extract;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.SynonymAntonym;
import org.apdplat.superword.model.Word;
import org.apdplat.superword.tools.ProxyIp;
import org.eclipse.jetty.util.ConcurrentHashSet;
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Extraction tool for Chinese synonyms and antonyms.
 *
 * <p>Word pages are downloaded from www.iciba.com, the synonym/antonym section of each
 * page is parsed, and the results are merged into the text files
 * {@code chinese_synonym.txt}, {@code chinese_antonym.txt} and {@code checked_words.txt}
 * under {@code src/main/resources}.
 *
 * @author 杨尚川
 */
public class ChineseSynonymAntonymExtractor {

    private ChineseSynonymAntonymExtractor(){}

    private static final Logger LOGGER = LoggerFactory.getLogger(ChineseSynonymAntonymExtractor.class);
    private static final String SYNONYM_ANTONYM_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.simple div#dict_content_3.dict_content div.industry_box div.industry.cn_synon_box";
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String HOST = "www.iciba.com";
    private static final String REFERER = "http://www.iciba.com/";
    /** Prefix to which the looked-up word is appended to build the page URL. */
    private static final String BASE_URL = "http://www.iciba.com/";
    /** Text found in the page the site serves when it rejects a client for requesting too often. */
    private static final String THROTTLED_MESSAGE = "非常抱歉,来自您ip的请求异常频繁";
    private static final List<String> USER_AGENTS = Arrays.asList("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36 OPR"
            );
    /** Counter used to rotate through {@link #USER_AGENTS}. */
    private static final AtomicInteger uac = new AtomicInteger();
    /** Looked-up word mapped to its antonyms joined by single spaces. */
    private static final Map<String, String> ANTONYM = new ConcurrentHashMap<>();
    private static final ExecutorService EXECUTOR_SERVICE = Executors.newCachedThreadPool();
    /** Words that were fetched and parsed during this run. */
    private static final Set<String> CHECKED_WORDS = new ConcurrentHashSet<>();
    // Used to merge the different entries: looked-up word mapped to its synonyms.
    private static final Map<Word, Set<Word>> SYNONYM_MAP = new ConcurrentHashMap<>();
    private static final Path CHECKED_WORDS_PATH = Paths.get("src/main/resources/checked_words.txt");
    private static final Path CHINESE_SYNONYM = Paths.get("src/main/resources/chinese_synonym.txt");
    private static final Path CHINESE_ANTONYM = Paths.get("src/main/resources/chinese_antonym.txt");
    /** Matches any single character outside the range U+4E00..U+9FA5; compiled once and reused. */
    private static final Pattern NON_CHINESE_CHAR = Pattern.compile("[^\u4e00-\u9fa5]");
    /** Shared generator for the fake client addresses ({@link Random} is thread-safe). */
    private static final Random RANDOM = new Random();

    /**
     * Parses the synonym/antonym section of a word page.
     *
     * <p>The children of every element matched by the CSS path are read as
     * (label, content) pairs; a label of "同义词" marks synonyms and "反义词" marks antonyms.
     * Any exception raised while parsing is logged and whatever was collected so far is returned.
     *
     * @param html page markup
     * @param word the word the page belongs to; it is never reported as its own synonym
     * @return a result holding {@code word} plus the synonyms and antonyms found (possibly none)
     */
    public static SynonymAntonym parseSynonymAntonym(String html, String word){
        SynonymAntonym synonymAntonym = new SynonymAntonym();
        synonymAntonym.setWord(new Word(word, ""));
        try {
            for(Element element : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)){
                int size = element.children().size();
                LOGGER.debug("element size:{}", size);
                // Walk the children two at a time: label first, then its content.
                for(int i=0; i+1<size; i+=2) {
                    String type = element.child(i).text();
                    LOGGER.debug("type:{}", type);
                    if ("同义词".equals(type)) {
                        String synonym = element.child(i+1).text();
                        LOGGER.debug("synonym:{}", synonym);
                        for(String w : extractChineseWords(synonym, word)){
                            synonymAntonym.addSynonym(new Word(w, ""));
                        }
                    } else if ("反义词".equals(type)) {
                        String antonym = element.child(i+1).text();
                        LOGGER.debug("antonym:{}", antonym);
                        for(String w : extractChineseWords(antonym, null)){
                            synonymAntonym.addAntonym(new Word(w, ""));
                        }
                    }
                }
            }
            if(!synonymAntonym.getAntonym().isEmpty() || !synonymAntonym.getSynonym().isEmpty()) {
                LOGGER.info("解析出同义词反义词:" + synonymAntonym);
            }
        }catch (Exception e){
            LOGGER.error("解析同义词反义词出错", e);
        }
        return synonymAntonym;
    }

    /**
     * Splits {@code text} on whitespace and keeps the tokens that are at least two
     * characters long and consist only of characters in U+4E00..U+9FA5.
     *
     * @param text whitespace separated candidates
     * @param excluded a token to drop when encountered, or {@code null} to drop nothing
     * @return accepted tokens in their original order
     */
    private static List<String> extractChineseWords(String text, String excluded){
        List<String> accepted = new ArrayList<>();
        for(String w : text.split("\\s+")){
            if(w.length()<2){
                continue;
            }
            if(isNotChineseChar(w)){
                LOGGER.debug("非中文字符:{}", w);
                continue;
            }
            if(w.equals(excluded)){
                continue;
            }
            LOGGER.debug("word:{}", w);
            accepted.add(w);
        }
        return accepted;
    }

    /**
     * Looks up every word of {@code words} that is not yet listed in the checked-words file,
     * collecting synonyms and antonyms, and persists the results.
     *
     * <p>Words are processed with a parallel stream. While that runs, a background task
     * saves the intermediate results once a minute; it stops as soon as processing ends.
     * Afterwards a final save is done and duplicate records are removed from both result files.
     *
     * @param words candidate words
     */
    public static void parseSynonymAntonym(List<String> words){
        LOGGER.info("开始解析,词数:" + words.size());
        Set<String> skipWords = new ConcurrentSkipListSet<>();
        try{
            ensureFileExists(CHECKED_WORDS_PATH);
            skipWords.addAll(Files.readAllLines(CHECKED_WORDS_PATH));
        }catch (Exception e){
            LOGGER.error("读取文件失败", e);
        }
        int total = words.size()-skipWords.size();
        LOGGER.info("之前已经解析的词数:" + skipWords.size());
        LOGGER.info("现在还需解析的词数:" + total);
        AtomicInteger progress = new AtomicInteger();
        // Released once all words are processed so that the periodic saver terminates
        // instead of keeping a pool thread (and therefore the JVM) alive forever.
        CountDownLatch finished = new CountDownLatch(1);
        EXECUTOR_SERVICE.submit(()->{
            try {
                // await() returns false on timeout, i.e. once a minute while work is ongoing.
                while(!finished.await(60, TimeUnit.SECONDS)){
                    save();
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        try {
            words.parallelStream().forEach(word -> {
                if (skipWords.contains(word)) {
                    return;
                }
                LOGGER.info("进度:" + total + "/" + progress.incrementAndGet() + " 来自线程:" + Thread.currentThread());
                try {
                    processWord(word);
                } catch (Exception e) {
                    LOGGER.error("错误:", e);
                }
            });
        } finally {
            finished.countDown();
        }
        save();
        filterSameRecord(CHINESE_SYNONYM);
        filterSameRecord(CHINESE_ANTONYM);
    }

    /**
     * Fetches and parses the page of one word and records the outcome in the shared maps.
     * A word is only added to the checked set when a usable page was obtained, so that
     * failed downloads are attempted again on the next run.
     */
    private static void processWord(String rawWord){
        String word = rawWord.trim();
        if (word.length() < 2 || isNotChineseChar(word)) {
            return;
        }
        String html = fetchPage(BASE_URL + word);
        if (html == null) {
            return;
        }
        SynonymAntonym synonymAntonym = parseSynonymAntonym(html, word);
        if (!synonymAntonym.getSynonym().isEmpty()) {
            SYNONYM_MAP.put(synonymAntonym.getWord(), synonymAntonym.getSynonym());
        }
        if (!synonymAntonym.getAntonym().isEmpty()) {
            StringBuilder str = new StringBuilder();
            synonymAntonym.getAntonym().forEach(w -> str.append(w.getWord()).append(" "));
            ANTONYM.put(word, str.toString().trim());
        }
        // Registered last: the results above are already in the maps when a save sees this word.
        CHECKED_WORDS.add(word);
    }

    /**
     * Downloads a page, switching to a new proxy IP and retrying (at most twice per phase)
     * first while the response is blank and then while the site answers with its
     * "too many requests" page.
     *
     * @return the page content, or {@code null} when no usable page could be obtained
     */
    private static String fetchPage(String pageUrl){
        String html = getContent(pageUrl);
        int times = 1;
        while (StringUtils.isBlank(html) && times < 3) {
            times++;
            // use a new IP address
            ProxyIp.toNewIp();
            html = getContent(pageUrl);
        }
        if (StringUtils.isBlank(html)) {
            LOGGER.error("获取页面失败:" + pageUrl);
            return null;
        }
        times = 1;
        while (html.contains(THROTTLED_MESSAGE) && times < 3) {
            times++;
            // use a new IP address
            ProxyIp.toNewIp();
            html = getContent(pageUrl);
        }
        // The retries above may end with an empty or still throttled response.
        if (StringUtils.isBlank(html) || html.contains(THROTTLED_MESSAGE)) {
            LOGGER.error("获取页面失败:" + pageUrl);
            return null;
        }
        return html;
    }

    /** Creates {@code path} as an empty file when it does not exist yet. */
    private static void ensureFileExists(Path path) throws IOException {
        if(Files.notExists(path)){
            path.toFile().createNewFile();
        }
    }

    /**
     * Merges the in-memory results with the lines already present in the three result
     * files and rewrites each file sorted. Failures are logged, not propagated.
     */
    private static synchronized void save(){
        System.out.println("开始保存文件");
        List<String> synonymList = null;
        List<String> antonymList = null;
        try {
            ensureFileExists(CHINESE_SYNONYM);
            ensureFileExists(CHINESE_ANTONYM);
            ensureFileExists(CHECKED_WORDS_PATH);

            System.out.println("同义词数:"+SYNONYM_MAP.size());
            // One line per entry: the word followed by its sorted synonyms.
            Set<String> synonymLines = new HashSet<>();
            SYNONYM_MAP.forEach((key, synonyms) -> {
                StringBuilder str = new StringBuilder();
                str.append(key.getWord()).append(" ");
                synonyms.stream().sorted().forEach(w -> str.append(w.getWord()).append(" "));
                synonymLines.add(str.toString().trim());
            });
            synonymLines.addAll(Files.readAllLines(CHINESE_SYNONYM));
            synonymList = synonymLines.stream().sorted().collect(Collectors.toList());
            System.out.println("总的同义词数:"+synonymList.size());
            Files.write(CHINESE_SYNONYM, synonymList);

            Set<String> antonymLines = ANTONYM.entrySet().stream()
                    .map(entry -> entry.getKey() + " " + entry.getValue())
                    .collect(Collectors.toCollection(HashSet::new));
            antonymLines.addAll(Files.readAllLines(CHINESE_ANTONYM));
            antonymList = antonymLines.stream().sorted().collect(Collectors.toList());
            System.out.println("总的反义词数:"+antonymList.size());
            Files.write(CHINESE_ANTONYM, antonymList);

            CHECKED_WORDS.addAll(Files.readAllLines(CHECKED_WORDS_PATH));
            System.out.println("总的已检查词数:" + CHECKED_WORDS.size());
            Files.write(CHECKED_WORDS_PATH, CHECKED_WORDS);
        } catch (Exception e) {
            // Either list may still be null here; passing it as a log argument is null-safe.
            LOGGER.error("同义词:{}", synonymList);
            LOGGER.error("反义词:{}", antonymList);
            LOGGER.error("保存文件失败", e);
        }
    }

    /**
     * Requests {@code url} with browser-like headers, a rotating User-Agent and random
     * forwarded-for addresses.
     *
     * @param url address to request
     * @return the response markup, the literal {@code "404"} when the server answers with
     *         HTTP status 404, or an empty string on any other failure
     */
    public static String getContent(String url) {
        LOGGER.debug("url:{}", url);
        // floorMod keeps the index valid even after the counter overflows to negative values.
        String userAgent = USER_AGENTS.get(Math.floorMod(uac.incrementAndGet(), USER_AGENTS.size()));
        Connection conn = Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", REFERER)
                .header("Host", HOST)
                .header("User-Agent", userAgent)
                .header("X-Forwarded-For", getRandomIp())
                .header("Proxy-Client-IP", getRandomIp())
                .header("WL-Proxy-Client-IP", getRandomIp())
                .ignoreContentType(true);
        try {
            return conn.post().html();
        }catch (HttpStatusException e){
            LOGGER.error("error code:"+e.getStatusCode());
            if(e.getStatusCode()==404){
                return "404";
            }
            LOGGER.error("获取URL:"+url+" 页面出错", e);
        }catch (Exception e){
            LOGGER.error("获取URL:"+url+" 页面出错", e);
        }
        return "";
    }

    /**
     * Tells whether {@code str} contains at least one character outside U+4E00..U+9FA5.
     *
     * @param str text to inspect
     * @return {@code true} if such a character is present; {@code false} otherwise,
     *         including for the empty string
     */
    public static boolean isNotChineseChar(String str){
        return NON_CHINESE_CHAR.matcher(str).find();
    }

    /**
     * Downloads the page of {@code word} (15 second timeout) and parses it.
     *
     * @param word word to look up
     * @return the parse result, or {@code null} when the page could not be retrieved
     */
    public static SynonymAntonym parseSynonymAntonym(String word){
        try {
            return parseSynonymAntonym(Jsoup.parse(new URL(BASE_URL + word), 15000).html(), word);
        }catch (Exception e){
            LOGGER.error("解析同义词反义词出错", e);
        }
        return null;
    }

    /**
     * Builds a random dotted IPv4 address whose four parts each lie in 1..254 and which
     * does not fall into 10.x.x.x, 172.16-31.x.x or 192.168.x.x.
     *
     * @return the address as text
     */
    public static String getRandomIp(){
        int first;
        int second;
        // Draw the first two parts together so that every exclusion is re-checked after a re-roll.
        do {
            first = randomOctet();
            second = randomOctet();
        } while (isPrivatePrefix(first, second));
        int third = randomOctet();
        int fourth = randomOctet();
        return first+"."+second+"."+third+"."+fourth;
    }

    /** Returns a random value in 1..254. */
    private static int randomOctet(){
        return RANDOM.nextInt(254)+1;
    }

    /** Tells whether the leading two address parts denote one of the excluded private ranges. */
    private static boolean isPrivatePrefix(int first, int second){
        return first==10
                || (first==172 && second>=16 && second<=31)
                || (first==192 && second==168);
    }

    /**
     * Removes duplicate records, e.g.:
     * 一丘之貉 比众不同
     * 比众不同 一丘之貉
     * Only the first record of such a group is kept; two lines count as duplicates when
     * they contain the same whitespace separated words regardless of order.
     * The file is rewritten with the remaining lines sorted.
     * @param path file to clean up
     */
    private static void filterSameRecord(Path path){
        try {
            int removed = 0;
            Set<String> seen = new HashSet<>();
            List<String> kept = new ArrayList<>();
            for(String line : Files.readAllLines(path)){
                // Order-independent key: the sorted word list of the line.
                String key = Arrays.stream(line.split("\\s+")).sorted().collect(Collectors.toList()).toString();
                if (seen.add(key)) {
                    kept.add(line);
                } else {
                    removed++;
                    LOGGER.info("去掉重复的记录:" + line);
                }
            }
            Collections.sort(kept);
            Files.write(path, kept);
            LOGGER.info("去掉重复的记录数:" + removed);
        }catch (Exception e){
            LOGGER.error("去掉重复的记录出错", e);
        }
    }

    /**
     * Processes every line of {@code src/main/resources/dic.txt}, shortest words first.
     * Single words can be tried with {@code parseSynonymAntonym("热爱")}.
     */
    public static void main(String[] args) throws Exception{
        List<String> words = Files.readAllLines(Paths.get("src/main/resources/dic.txt")).stream()
                .sorted(Comparator.comparingInt(String::length))
                .collect(Collectors.toList());
        parseSynonymAntonym(words);
    }
}