/**
 * Copyright 2015-2016 Abola Lee
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.abola.crawler;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.impl.SimpleLog;
import org.apache.commons.vfs2.CacheStrategy;
import org.apache.commons.vfs2.FileContent;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileSystemOptions;
import org.apache.commons.vfs2.impl.StandardFileSystemManager;
import org.apache.commons.vfs2.provider.http.HttpFileSystemConfigBuilder;
import org.json.JSONArray;
import org.json.JSONObject;
import org.json.XML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.mozilla.universalchardet.UniversalDetector;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Data crawler pack.
 *
 * <p>Fetches a remote resource through Apache Commons VFS and converts JSON,
 * XML or HTML content into a jsoup {@link Document}. Configuration is done
 * through fluent setters, for example:
 *
 * <pre>
 *   Document doc = CrawlerPack.start()
 *                             .addCookie("name", "value")
 *                             .getFromHtml("http://example.com/");
 * </pre>
 *
 * @author Abola Lee <[email protected]>
 */
public class CrawlerPack {

    static SimpleLog log = new SimpleLog("simple.logger.com.github.abola.crawler.CrawlerPack");

    static StandardFileSystemManager fileSystem ;

    static{
        // Per the original author's note: installs a trust provider so that
        // SSL connections to servers with self-signed certificates pass.
        XTrustProvider.install();

        // Set default logging level "WARN"
        log.setLevel(SimpleLog.LOG_LEVEL_WARN);

        try {
            fileSystem = new StandardFileSystemManager();
            fileSystem.setCacheStrategy(CacheStrategy.ON_CALL);

            // change default logger to SimpleLog
            fileSystem.setLogger(log);
            fileSystem.init();
        } catch (FileSystemException fse) {
            // Do not fail class loading, but make the failure visible instead
            // of silently leaving an uninitialised file system manager behind.
            log.error("CrawlerPack: unable to initialise the VFS file system manager", fse);
        }
    }

    static CrawlerPack defaultCrawler ;

    // Replacement prefix: must be a combination starting with a-zA-Z.
    // No longer used by xmlToJsoupDoc; kept because it is visible to the package.
    final static String prefix = "all-lower-case-prefix";

    /** Size in bytes of each chunk handed to the charset detector. */
    final Integer detectBuffer = 1000;

    private String userAgent = "Mozilla/5.0 (CrawlerPack; )";

    private String encoding = "utf-8";

    private List<Cookie> cookies = new ArrayList<>();

    /**
     * Returns the shared CrawlerPack instance, creating it on first use.
     *
     * <p>The same instance is handed to every caller, so a user agent or
     * remote encoding set on it stays in effect for later calls; cookies are
     * cleared at the end of every {@link #getFromRemote(String)} call. The
     * lazy creation is not synchronized.
     *
     * @return CrawlerPack
     */
    public static CrawlerPack start(){
        if (null == defaultCrawler) {
            defaultCrawler = new CrawlerPack();
        }
        return defaultCrawler;
    }

    /**
     * Setting global level logging
     *
     * example:
     *   CrawlerPack.setLoggerLevel( SimpleLog.LOG_LEVEL_INFO );
     *
     * @param level one of the {@code SimpleLog.LOG_LEVEL_*} constants
     */
    public static void setLoggerLevel(int level){
        log.setLevel(level);
        fileSystem.setLogger(log);
    }

    /**
     * Creates a cookie with the given name and value.
     *
     * <p>The cookie is stored with an empty domain; the domain and path are
     * filled in from the requested URI when the request is made.
     *
     * @param name    the cookie name; a {@code null} name is ignored with a warning
     * @param value   the cookie value
     * @return CrawlerPack
     */
    public CrawlerPack addCookie(String name, String value){
        if (null == name) {
            log.warn("addCookie: Cookie name null.");
            return this;
        }

        cookies.add(new Cookie("", name, value));
        return this;
    }

    /**
     * Creates a cookie with the given name, value, domain attribute,
     * path attribute, expiration attribute, and secure attribute
     *
     * @param domain  the domain this cookie can be sent to
     * @param name    the cookie name; a {@code null} name is ignored with a warning
     * @param value   the cookie value
     * @param path    the path prefix for which this cookie can be sent
     * @param expires the {@link Date} at which this cookie expires,
     *                or {@code null} if the cookie expires at the end
     *                of the session
     * @param secure  if true this cookie can only be sent over secure
     *                connections
     * @return CrawlerPack
     */
    public CrawlerPack addCookie(String domain, String name, String value,
                                 String path, Date expires, boolean secure) {
        if (null == name) {
            log.warn("addCookie: Cookie name null.");
            return this;
        }

        cookies.add(new Cookie(domain, name, value, path, expires, secure));
        return this;
    }

    /**
     * Return a Cookie array
     * and auto importing domain and path when domain was empty.
     *
     * @param uri the URI about to be requested; its host becomes the domain
     *            of every cookie that was added without one
     * @return Cookie[], or {@code null} when no cookie has been added
     */
    Cookie[] getCookies(String uri){
        if (null == cookies || cookies.isEmpty()) {
            return null;
        }

        String host = null; // derived from the uri only if some cookie needs it
        for (Cookie cookie : cookies) {
            if ("".equals(cookie.getDomain())) {
                if (null == host) {
                    host = extractHost(uri);
                }
                cookie.setDomain(host);
                cookie.setPath("/");
                cookie.setExpiryDate(null);
                cookie.setSecure(false);
            }
        }

        return cookies.toArray(new Cookie[cookies.size()]);
    }

    /**
     * Extracts the host part that follows the first "://" of a URI.
     *
     * <p>The first "://" is used so that a URL embedded later in the URI
     * (for example in a query string) is not mistaken for the host. User
     * info ("user:password@") and a numeric port are removed. When the URI
     * contains no "://" followed by a host, it is returned unchanged.
     */
    private static String extractHost(String uri){
        String authority = uri.replaceAll("^.*?:\\/\\/([^\\/?#]+).*$", "$1");

        // replaceAll leaves the input untouched when the pattern does not match
        if (authority.length() == uri.length()) {
            return authority;
        }

        int at = authority.lastIndexOf('@');
        if (at >= 0) {
            authority = authority.substring(at + 1);
        }

        // only drop a trailing ":digits" so bracketed IPv6 literals are not cut
        int colon = authority.lastIndexOf(':');
        if (colon >= 0 && authority.substring(colon + 1).matches("\\d*")) {
            authority = authority.substring(0, colon);
        }

        return authority;
    }

    /**
     * Clear all cookies
     */
    void clearCookies(){
        log.trace("clearCookies: clear all cookies.");
        cookies = new ArrayList<>();
    }

    /**
     * Fetches remote content that is in JSON format.
     *
     * @param uri required Apache Common VFS supported file systems and response JSON format content.
     * @return org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document getFromJson(String uri){
        // fetch the raw JSON text
        String json = getFromRemote(uri);

        // convert the JSON to XML
        String xml  = jsonToXml(json);

        // convert to a jsoup document
        return xmlToJsoupDoc(xml);
    }

    /**
     * Fetches remote content that is in HTML/HTML5 format.
     *
     * @param uri required Apache Common VFS supported file systems and response HTML format content.
     * @return org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document getFromHtml(String uri){
        // fetch the raw HTML text
        String html = getFromRemote(uri);

        // convert to a jsoup document
        return htmlToJsoupDoc(html);
    }

    /**
     * Fetches remote content that is in XML format.
     *
     * @param uri required Apache Common VFS supported file systems and response XML format content.
     * @return org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document getFromXml(String uri){
        // fetch the raw XML text
        String xml = getFromRemote(uri);

        // convert to a jsoup document
        return xmlToJsoupDoc(xml);
    }

    /**
     * Converts JSON to XML.
     *
     * <p>A JSON text whose top-level value is an array is converted with each
     * element wrapped in a {@code row} tag. A leading byte order mark and
     * surrounding whitespace are ignored.
     *
     * @param json a json format string.
     * @return XML format string; an empty string when {@code json} is
     *         {@code null} or blank (e.g. after a failed fetch)
     */
    public String jsonToXml(String json){
        if (null == json) {
            log.warn("jsonToXml: json content is null.");
            return "";
        }

        String text = json;
        // a byte order mark is not whitespace for trim() nor for the JSON parser
        if (!text.isEmpty() && '\uFEFF' == text.charAt(0)) {
            text = text.substring(1);
        }
        text = text.trim();

        if (text.isEmpty()) {
            log.warn("jsonToXml: json content is empty.");
            return "";
        }

        // JSON that starts directly with an array gets "row" as the element tag
        if (text.startsWith("[")) {
            return XML.toString(new JSONArray(text), "row");
        }
        return XML.toString(new JSONObject(text));
    }

    /**
     * Fetches a remote resource through Apache Commons VFS.
     *
     * <p>The user agent and any cookies added so far are applied to the
     * request. For URIs starting with "http" the encoding reported by VFS for
     * the response is used; when that encoding is not a UTF variant, the
     * encoding given to {@link #setRemoteEncoding(String)} is used if it is
     * itself not a UTF variant, otherwise the encoding is detected from the
     * content. All cookies are cleared when the call ends.
     *
     * Supported protocols:
     * @see <a href="https://commons.apache.org/proper/commons-vfs/filesystems.html">commons-vfs filesystems</a>
     *
     * @param uri the URI of the resource to load
     * @return the decoded content, or {@code null} when the resource could
     *         not be loaded or decoded
     */
    public String getFromRemote(String uri){

        // clear cache
        fileSystem.getFilesCache().close();

        String remoteContent = null;
        String remoteEncoding = "utf-8";

        log.debug("getFromRemote: Loading remote URI=" + uri);

        try {

            FileSystemOptions fsOptions = new FileSystemOptions();
            // set userAgent
            HttpFileSystemConfigBuilder.getInstance().setUserAgent(fsOptions, userAgent);

            // set cookie if cookies set
            if (0 < this.cookies.size()) {
                HttpFileSystemConfigBuilder.getInstance().setCookies(fsOptions, getCookies(uri));
            }

            log.debug("getFromRemote: userAgent=" + userAgent);
            log.debug("getFromRemote: cookieSize=" + cookies.size());
            log.debug("getFromRemote: cookies=" + cookies.toString());

            FileContent fileContent = fileSystem.resolveFile(uri, fsOptions).getContent();

            // 2016-03-22 only pure http/https auto detect encoding
            if (startsWithIgnoreCase(uri, "http")) {
                fileContent.getSize();  // pass a bug {@link https://issues.apache.org/jira/browse/VFS-427}
                remoteEncoding = fileContent.getContentInfo().getContentEncoding();
            }

            log.debug("getFromRemote: remoteEncoding=" + remoteEncoding + "(auto detect) ");

            // 2016-03-21 getContentEncoding is null for zip files
            if (null == remoteEncoding) {
                remoteEncoding = "utf-8";
            }

            // Read the body exactly once and always release the stream; the
            // same bytes serve both charset detection and decoding.
            byte[] rawContent;
            try (InputStream in = fileContent.getInputStream()) {
                rawContent = IOUtils.toByteArray(in);
            }

            if (!startsWithIgnoreCase(remoteEncoding, "utf")) {
                log.debug("getFromRemote: remote content encoding=" + remoteEncoding);

                // force charset encoding if setRemoteEncoding set
                if (null != encoding && !startsWithIgnoreCase(encoding, "utf")) {
                    remoteEncoding = encoding;
                } else {
                    // auto detecting encoding
                    remoteEncoding = detectCharset(rawContent);
                    log.debug("getFromRemote: real encoding=" + remoteEncoding);

                    // detection failed: use the same default as a missing header
                    if (null == remoteEncoding) {
                        remoteEncoding = "utf-8";
                    }
                }
            }

            // An unknown charset name raises UnsupportedEncodingException,
            // which is handled below like any other IOException.
            remoteContent = new String(rawContent, remoteEncoding);

        } catch (FileSystemException fse) {
            log.warn("getFromRemote: FileSystemException=" + fse.getMessage());
            remoteContent = null;
        } catch (IOException ioe) {
            log.warn("getFromRemote: IOException=" + ioe.getMessage());
            remoteContent = null;
        } catch (StringIndexOutOfBoundsException stre) {
            log.warn("getFromRemote: StringIndexOutOfBoundsException=" + stre.getMessage());
            log.warn("getFromRemote: uri=" + uri );
            remoteContent = null;
        } finally {
            // cookies are per-request, even when the request blows up
            clearCookies();
        }

        if (log.isDebugEnabled()) {
            log.debug("getFromRemote: remoteContent=\n" + remoteContent);
        }
        // any handled exception will return "null"
        return remoteContent;
    }

    /**
     * Null-safe, case-insensitive test whether {@code value} starts with {@code prefix}.
     */
    private static boolean startsWithIgnoreCase(String value, String prefix){
        return null != value && value.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Converts HTML into a jsoup Document.
     *
     * HTML content is parsed with jsoup's native HTML parser.
     *
     * @param html Html document; {@code null} is treated as an empty document
     * @return org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document htmlToJsoupDoc(String html){
        String content = html;
        if (null == content) {
            log.warn("htmlToJsoupDoc: html content is null.");
            content = "";
        }

        // The second argument of Jsoup.parse is the base URI, not a charset.
        Document jsoupDoc = Jsoup.parse(content, "", Parser.htmlParser());
        jsoupDoc.charset(StandardCharsets.UTF_8);

        return jsoupDoc;
    }

    /**
     * Converts XML into a jsoup Document.
     *
     * Jsoup 1.9.1+ supported non-ascii tag
     * -----
     * Earlier, a tag whose name did not start with a-zA-Z was parsed by jsoup
     * as a comment, which was worked around by inserting a prefix before
     * parsing and removing it afterwards. That workaround is no longer
     * applied; the XML is passed to jsoup's XML parser as is.
     *
     * @param xml XML format string; {@code null} is treated as an empty document
     * @return org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document xmlToJsoupDoc(String xml){
        String content = xml;
        if (null == content) {
            log.warn("xmlToJsoupDoc: xml content is null.");
            content = "";
        }

        Document jsoupDoc = Jsoup.parse(content, "", Parser.xmlParser());
        jsoupDoc.charset(StandardCharsets.UTF_8);

        return jsoupDoc;
    }

    /**
     * Sets the encoding of the source data.
     * Must be set before calling any get method.
     *
     * @param encoding charset name of the remote content
     * @return CrawlerPack
     */
    public CrawlerPack setRemoteEncoding(String encoding){
        log.debug("setRemoteEncoding: encoding=" + encoding);
        this.encoding = encoding;
        return this;
    }

    private String detectCharset(byte[] content){
        log.debug("detectCharset: ");
        return detectCharset(content, 0);
    }

    /**
     * Detecting real content encoding
     *
     * <p>The content is examined in chunks of {@link #detectBuffer} bytes,
     * each with a fresh detector, starting at {@code offset}; the first
     * charset reported for a chunk wins. Implemented as a loop so that large
     * content without a detectable charset cannot exhaust the call stack.
     *
     * @param content raw bytes to examine
     * @param offset  index of the first byte to examine
     * @return real charset encoding, or {@code null} when no chunk yields one
     */
    private String detectCharset(byte[] content, Integer offset){
        final int chunkSize = detectBuffer;

        for (int position = offset; position < content.length; position += chunkSize) {
            log.debug("detectCharset: offset=" + position);

            int length = Math.min(chunkSize, content.length - position);

            UniversalDetector detector = new UniversalDetector(null);
            detector.handleData(content, position, length);
            detector.dataEnd();

            String detected = detector.getDetectedCharset();
            if (null != detected) {
                return detected;
            }
        }

        // detect failed
        return null;
    }

    /**
     * set header userAgent
     *
     * @param userAgent value sent as the User-Agent of later requests
     * @return CrawlerPack
     */
    public CrawlerPack setUserAgent(String userAgent){
        log.debug("setUserAgent: userAgent=\"" + userAgent + "\"");
        this.userAgent = userAgent;
        return this;
    }
}