package edu.ualberta.storyteller.core.parameter;

import edu.ualberta.storyteller.core.util.*;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * Holder for the tunable parameters of the pipeline.
 *
 * <p>All values are read from a plain-text parameter file by {@link #load(DataInputStream)}.
 * The initializers on the fields below record the intended default values, but note that
 * {@code load} assigns every field from the file, so an instance that was constructed
 * successfully always reflects the file content rather than these initializers.
 */
public class Parameters implements Serializable {

    //! Language.
    public String language;

    //! Data type.
    public String dataType;

    //! TF boost rate for normal words.
    //! NOTICE: currently, if it is not zero, the program will be out of memory!!!
    public double boostRateNormalWord = 0;

    //! TF boost rate for document's main keywords.
    public double boostRateMainKeyword = 3;

    //! TF boost rate for document's normal keywords.
    public double boostRateNormalKeyword = 1;

    //! Minimum cosine similarity between a keyword graph and its related document.
    //! If the similarity is bigger than this threshold, the document will be included
    //! in the topic.
    public double minSimDoc2KeyGraph = .25;

    //! Algorithm for keyword graph community detection.
    public String communityDetectAlg = "Betweenness";

    //! Algorithm for event splitting.
    //! Currently: "DocRelation", "DocGraph", "Rule"
    public String eventSplitAlg = "DocRelation";

    //! Minimum number of related documents for a topic.
    //! Detected topics with a smaller size will be filtered.
    public int minTopicSize = 5;

    //! Minimum document frequency of keyword graph nodes.
    //! Nodes in a keyword graph with a smaller DF will be filtered (too rare).
    public int minNodeDF = 4;

    //! Minimum document frequency of keyword graph edges.
    //! Edges in a keyword graph with a smaller DF will be filtered (too rare).
    public int minEdgeDF = 3;

    //! Minimum edge correlation of keyword graph edges.
    //! A KeywordEdge (n1, n2) with a smaller correlation will be filtered
    //! (which means nodes n1 and n2 may also be connected with a lot of other nodes).
    //! NOTICE: this parameter influences the final clusters a lot.
    public double minEdgeCorrelation = .15;

    //! Maximum document frequency percentage of keyword graph nodes.
    //! Nodes that appear in more than this percentage of documents will be filtered (too common).
    public double maxNodeDFPercent = .3;

    //! Minimum number of nodes for a cluster (a keyword graph that represents a topic).
    //! Clusters with a smaller size will be filtered.
    public int minClusterNodeSize = 3;

    //! Maximum number of nodes for a cluster (a keyword graph that represents a topic).
    //! Clusters with a bigger size will continue to be split by the community detection algorithm.
    public int maxClusterNodeSize = 800;

    //! Minimum number of keywords for a document.
    //! Documents that contain fewer keywords will be filtered.
    public int minDocKeywordSize = 5;

    //! Merge two document clusters if their intersect proportion is bigger than this threshold.
    public double minIntersectPercentToMergeCluster = .3;

    //! Minimum conditional probability to duplicate an edge rather than delete it from keyword graph.
    //! Duplicate the edge if the conditional probability is higher than this threshold.
    //! NOTE(review): the initializer 1.7 is above 1; if the compared value is a true
    //! probability the threshold can never be exceeded -- confirm against the caller.
    public double minCpToDuplicateEdge = 1.7;

    //! Whether to use the document's topic to split a document cluster into sub clusters.
    public boolean useDocumentTopic = false;

    //! Whether to use the count of common words in document titles to split a document
    //! cluster into sub clusters.
    public boolean useDocumentTitleCommonWords = false;

    //! Minimum number of common words in titles to put two documents into one group.
    public int minTitleCommonWordsSize = 2;

    //! Minimum percentage of common words in titles to put two documents into one group.
    public double minTitleCommonWordsPercent = .4;

    //! The file that contains stop words. Each line is a word.
    public String fStopwords;
    //! Stop words imported from fStopwords by load().
    public HashSet<String> stopwords;

    //! The file that contains SVM model to determine whether two documents are of same event.
    public String fModel;
    //! SVM model loaded from fModel by load().
    public libsvm.svm_model model;

    //! The file that contains the SVM model loaded into sameStoryModel.
    //! Presumably decides whether two documents belong to the same story -- confirm with the caller.
    public String fSameStoryModel;
    //! SVM model loaded from fSameStoryModel by load().
    public libsvm.svm_model sameStoryModel;

    //! Minimum key graph compatibility for matching a document cluster to an existing story tree.
    public double minKeygraphCompatibilityDc2St = .6;

    //! Minimum compatibility for matching a document cluster to an existing story tree's node.
    public double minCompatibilityDc2Sn = .3;

    //! Minimum tf cosine similarity for matching a document cluster to an existing story tree's node.
    public double minTFCosineSimilarityDc2Sn = .02;

    //! Used to control the influence of timestamp difference between new document and old document.
    //! Assume that if the time gap is bigger, the possibility that two documents are connected
    //! decays exponentially.
    //! If this parameter is bigger, the decay will be faster; otherwise, the decay is smoother.
    //! Used to calculate time proximity.
    public double deltaTimeGap = .5;

    //! Used to control the influence of document distribution.
    //! Used to calculate document distributional proximity.
    public double deltaDocDistribution = .5;

    //! How many days of data to keep.
    public int historyLength = 3;

    //! Whether to use extra titles for topic match.
    public boolean useRelatedNewsTitlesForMatch = false;

    //! The file that contains SVM model to determine whether a document matches a query.
    public String fQueryDocMatchModel;
    //! SVM model loaded from fQueryDocMatchModel by load().
    public libsvm.svm_model queryDocMatchModel;

    //! Maximum number of docs to match for a topic.
    public int maxMatchedDocsSize = 20;

    /**
     * Parametric constructor.
     * Creates a Parameters instance from a parameter file. The file is opened, passed to
     * {@link #load(DataInputStream)} and closed again before the constructor returns,
     * whether loading succeeded or not.
     *
     * @param paramsFile File name of the parameters' file.
     * @throws Exception If the file cannot be opened or read, or if {@code load} fails.
     */
    public Parameters(String paramsFile) throws Exception {
        // try-with-resources: the original never closed the stream and leaked the file handle.
        try (DataInputStream in = new DataInputStream(new FileInputStream(paramsFile))) {
            load(in);
        }
    }

    /**
     * Load parameters from a parameter file.
     *
     * <p>File format: one parameter per line. Blank lines and lines starting with
     * {@code //} are skipped. Every other line is split on the characters {@code '='},
     * {@code ' '} and {@code ';'}; the first token is the key and the second the value,
     * any further tokens on the line are ignored. If a key occurs more than once the
     * last occurrence wins. The content is decoded as UTF-8.
     *
     * <p>All numeric parameters must be present. Boolean parameters that are absent
     * are read as {@code false}; string parameters that are absent are set to
     * {@code null}.
     *
     * <p>Besides assigning the fields, this method imports the stop word list through
     * {@code NlpUtils.importStopwords} and loads three SVM models through
     * {@code libsvm.svm.svm_load_model}. The stream is read to its end but is NOT
     * closed here; closing it remains the caller's responsibility.
     *
     * @param in Parameter file input stream.
     * @throws IllegalArgumentException If a line has no value, or a numeric parameter is missing.
     * @throws NumberFormatException If a numeric parameter cannot be parsed.
     * @throws Exception If reading the stream, importing stop words or loading a model fails.
     */
    public void load(DataInputStream in) throws Exception {
        Map<String, String> conf = readConf(in);

        // parameters for experimental setup
        language = conf.get("language");
        dataType = conf.get("dataType");

        // parameters to boost word tf
        boostRateNormalWord = requireDouble(conf, "boostRateNormalWord");
        boostRateMainKeyword = requireDouble(conf, "boostRateMainKeyword");
        boostRateNormalKeyword = requireDouble(conf, "boostRateNormalKeyword");

        // parameters to filter documents
        minDocKeywordSize = requireInt(conf, "minDocKeywordSize");

        // parameters to filter keyword graph nodes
        minNodeDF = requireInt(conf, "minNodeDF");
        maxNodeDFPercent = requireDouble(conf, "maxNodeDFPercent");

        // parameters to filter keyword graph edges
        minEdgeDF = requireInt(conf, "minEdgeDF");
        minEdgeCorrelation = requireDouble(conf, "minEdgeCorrelation");

        // parameters to detect keyword graph communities
        communityDetectAlg = conf.get("communityDetectAlg");

        // parameters to split or filter keyword graphs
        maxClusterNodeSize = requireInt(conf, "maxClusterNodeSize");
        minClusterNodeSize = requireInt(conf, "minClusterNodeSize");
        minIntersectPercentToMergeCluster = requireDouble(conf, "minIntersectPercentToMergeCluster");
        minCpToDuplicateEdge = requireDouble(conf, "minCpToDuplicateEdge");

        // parameters to assign document to keyword graphs
        minSimDoc2KeyGraph = requireDouble(conf, "minSimDoc2KeyGraph");

        // parameters to filter document clusters
        minTopicSize = requireInt(conf, "minTopicSize");

        // parameters to process document clusters
        // (Boolean.parseBoolean(null) is false, so absent flags stay disabled)
        useDocumentTopic = Boolean.parseBoolean(conf.get("useDocumentTopic"));
        useDocumentTitleCommonWords = Boolean.parseBoolean(conf.get("useDocumentTitleCommonWords"));
        minTitleCommonWordsSize = requireInt(conf, "minTitleCommonWordsSize");
        minTitleCommonWordsPercent = requireDouble(conf, "minTitleCommonWordsPercent");
        fStopwords = conf.get("fStopwords");
        // language must already be assigned at this point: it is passed to the importer.
        stopwords = NlpUtils.importStopwords(fStopwords, language);
        eventSplitAlg = conf.get("eventSplitAlg");

        // parameters to merge new documents with stories
        minKeygraphCompatibilityDc2St = requireDouble(conf, "minKeygraphCompatibilityDc2St");
        minCompatibilityDc2Sn = requireDouble(conf, "minCompatibilityDc2Sn");
        minTFCosineSimilarityDc2Sn = requireDouble(conf, "minTFCosineSimilarityDc2Sn");
        deltaTimeGap = requireDouble(conf, "deltaTimeGap");
        deltaDocDistribution = requireDouble(conf, "deltaDocDistribution");

        // parameters to filter corpora
        historyLength = requireInt(conf, "historyLength");

        // parameters related to event classification supervised learning
        fModel = conf.get("fModel");
        model = libsvm.svm.svm_load_model(fModel);
        fSameStoryModel = conf.get("fSameStoryModel");
        sameStoryModel = libsvm.svm.svm_load_model(fSameStoryModel);

        // parameters for query doc matching
        useRelatedNewsTitlesForMatch = Boolean.parseBoolean(conf.get("useRelatedNewsTitlesForMatch"));
        fQueryDocMatchModel = conf.get("fQueryDocMatchModel");
        queryDocMatchModel = libsvm.svm.svm_load_model(fQueryDocMatchModel);
        maxMatchedDocsSize = requireInt(conf, "maxMatchedDocsSize");
    }

    /**
     * Reads all key/value pairs of a parameter file into a map.
     *
     * @param in Parameter file input stream; read to its end, not closed.
     * @return Map from parameter name to its raw (unparsed) value.
     * @throws IOException If reading the stream fails.
     * @throws IllegalArgumentException If a non-comment line has fewer than two tokens.
     */
    private static Map<String, String> readConf(DataInputStream in) throws IOException {
        Map<String, String> conf = new HashMap<String, String>();

        // DataInputStream.readLine() is deprecated because it maps each byte to a char;
        // decode through a Reader instead so that non-ASCII values survive.
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));

        int lineNumber = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            lineNumber++;
            line = line.trim();

            // lines started with "//" are considered to be comments
            if (line.isEmpty() || line.startsWith("//")) {
                continue;
            }

            StringTokenizer st = new StringTokenizer(line, "= ;");
            if (st.countTokens() < 2) {
                throw new IllegalArgumentException(
                        "Malformed parameter line " + lineNumber + ": '" + line
                                + "' (expected 'key = value')");
            }
            String key = st.nextToken();
            String value = st.nextToken();
            conf.put(key, value);
        }
        return conf;
    }

    /**
     * Returns the raw value of a mandatory parameter.
     *
     * @throws IllegalArgumentException If the parameter is not present in {@code conf}.
     */
    private static String requireValue(Map<String, String> conf, String key) {
        String value = conf.get(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing required parameter '" + key + "'");
        }
        return value;
    }

    /**
     * Parses a mandatory parameter as {@code double}, naming the parameter on failure.
     */
    private static double requireDouble(Map<String, String> conf, String key) {
        String value = requireValue(conf, key);
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException e) {
            throw describeBadNumber(key, value, e);
        }
    }

    /**
     * Parses a mandatory parameter as {@code int}, naming the parameter on failure.
     */
    private static int requireInt(Map<String, String> conf, String key) {
        String value = requireValue(conf, key);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw describeBadNumber(key, value, e);
        }
    }

    /**
     * Builds a NumberFormatException that names the offending parameter and keeps the cause.
     */
    private static NumberFormatException describeBadNumber(String key, String value,
                                                           NumberFormatException cause) {
        NumberFormatException detailed = new NumberFormatException(
                "Parameter '" + key + "' has an invalid numeric value: '" + value + "'");
        // NumberFormatException has no (message, cause) constructor.
        detailed.initCause(cause);
        return detailed;
    }

}