java source code of SentimentAnalyser

package gr.iti.mklab.sentiment;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;

import org.apache.commons.collections4.BidiMap;
import org.apache.commons.collections4.bidimap.DualHashBidiMap;

import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayesMultinomial;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.SparseInstance;
import weka.core.tokenizers.NGramTokenizer;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;
import weka.filters.unsupervised.attribute.StringToWordVector;

public class SentimentAnalyser {
	
	Trainer tr;
	PolarityClassifier pc;
	TweetPreprocessor tp;
	
	boolean useSlidingWindow;
	StringToWordVector stwv;

	Instances train;
	Instances test;
	
	BidiMap<String, Integer> train_attributes;	// bigram-position
	Classifier multiNB;
	Instances training_text;
	
	/**Constructor. 
	 * "main_folder" is provided in order to define the initial directory to work on; 
	 * "useSW" refers to whether the training should be made on the most recent 1,000 tweets or no.
	 * @throws Exception */
	public SentimentAnalyser(String main_folder, boolean useSW, String test_dataset) throws Exception{
		tr = new Trainer(main_folder);			//tr.train();
		pc = new PolarityClassifier(main_folder, tr.getTextAttributes(), tr.getFeatureAttributes(), tr.getComplexAttributes());
		tp = new TweetPreprocessor(main_folder);
		initializeFilter();
		
		useSlidingWindow = useSW;
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        ArrayList<String> classVal = new ArrayList<String>();
        classVal.add("positive");
        classVal.add("negative");
        attributes.add(new Attribute("text",(ArrayList<String>)null));
        attributes.add(new Attribute("sentimentClassValue",classVal));
		train = new Instances("somerel", attributes, 0);
		train.setClassIndex(1);
		test = new Instances("somerel", attributes, 0);
		test.setClassIndex(1);
		
		if (useSlidingWindow == false){
			multiNB = (Classifier) weka.core.SerializationHelper.read(main_folder+"/test_models/"+test_dataset+".model");
			BufferedReader rd = new BufferedReader(new FileReader(new File(main_folder+"test_models/"+test_dataset+"-attributes.tsv")));
			train_attributes = new DualHashBidiMap<String, Integer>();
			String inline;
			int cnt = 0;
			while ((inline=rd.readLine())!=null){
				train_attributes.put(inline, cnt);
				cnt++;
			}
			rd.close();
			BufferedReader frd = new BufferedReader(new FileReader(new File(main_folder+"test_models/"+test_dataset+"-attributes.arff")));
			training_text = new Instances(frd);
			frd.close();
		}
	}

	

	/**Starts the whole process: preprocesses the given tweet, creates different representations
	 * of it (stored in "all[]" Instances) and tests it in the PolarityClassifier class.*/
	public String getPolarity(String tweet){
		tp.setTweet(tweet);
		String dataset = tp.startProc();
		Instances[] all = tp.getAllInstances();
		String out = pc.test(dataset, all);
		if (useSlidingWindow == true){
			if (out.contains("pos") || out.contains("neg")){	//if HC and LC agree ("positive"/"negative"), then put this document in the training set			
				double[] instanceValues = new double[train.numAttributes()];
		        instanceValues[0] = train.attribute(0).addStringValue(tweet);
		        if (out.contains("pos"))
		        	instanceValues[1] = 0;
		        else
		        	instanceValues[1] = 1;
		        if (train.numInstances()>1000){
		        	train.remove(0);
		        }
		        train.add(new DenseInstance(1.0, instanceValues));
			} else{		// in case of HC & LC disagreement, add the document in the test set; it will be classified in the end of the process.
				if (train.numInstances()>0)
					out = clarifyOnSlidingWindow(tweet);
				else
					out = "positive (random)";
			}
		} else{		// if useSlidingWindow is set to "true", then use the model
			if (out.contains("pos") || out.contains("neg")){
				return out;
			} else{
				out = clarifyOnModel(tweet);
			}
		}
		return out;
	}
	
	/**Decides upon a "disagreed" document by applying the learned model based on the last 1,000 "agreed" documents.*/
	private String clarifyOnSlidingWindow(String tweet){
		String out = "";
        double[] instanceValues = new double[train.numAttributes()];
        instanceValues[0] = train.attribute(0).addStringValue(tweet);
		train.add(new SparseInstance(1.0, instanceValues));
		try {
			stwv.setInputFormat(train);
			Instances newData = Filter.useFilter(train, stwv);
			Instances train_ins = new Instances(newData, 0, train.size()-1);
			Instances test_ins = new Instances(newData, train.size()-1, 1);
			Classifier mnb = (Classifier)new NaiveBayesMultinomial();
			mnb.buildClassifier(train_ins);
			double[] preds = mnb.distributionForInstance(test_ins.get(0));
			if (preds[0]>0.5)
				out = "positive";
			else
				out = "negative";
		} catch (Exception e) {
			e.printStackTrace();
		}
		train.remove(train.numInstances()-1);
		return out;
	}
	
	/**Decides upon a "disagreed" document by applying the learned model based on the previously build model.*/
	private String clarifyOnModel(String tweet){
		String out = "";
		
		// get the text-based representation of the document
        double[] instanceValues = new double[2];
        instanceValues[0] = test.attribute(0).addStringValue(tweet);
        test.add(new SparseInstance(1.0, instanceValues));
        try{
        	stwv.setInputFormat(test);
        	Instances newData = Filter.useFilter(test, stwv);
    		
        	// re-order attributes so that they are compatible with the training set's ones
        	Instances test_instance = reformatText(newData);
        	
        	// find the polarity of the document based on the previously built model
        	test_instance.setClassIndex(0);
        	double[] preds = multiNB.distributionForInstance(test_instance.get(0));
        	if (preds[0]>0.5)
        		out = "light positive";
        	else
        		out = "light negative";
        } catch (Exception e){
        	e.printStackTrace();
        }
        test.remove(0);
		return out;
	}
	
	/**Re-order the attributes of the given Instances according to the training file; keeps Weka happy.*/
	private Instances reformatText(Instances text_test){	
		// remove the attributes from the test set that are not used in the train set
		String[] options = new String[2];
		options[0] = "-R";
		String opt = "";
		boolean found = false;
		for (int j=0; j<text_test.numAttributes(); j++){
			if (train_attributes.get(text_test.attribute(j).name())==null){
				int pos = j+1;
				found = true;
				opt = opt+pos+",";
			} 
		}
		if (found==true)
			options[1] = opt.substring(0,opt.length()-1);
		else
			options[1] = "";
		Remove remove = new Remove();
		try {
			remove.setOptions(options);
			remove.setInputFormat(text_test);
			Instances newData = Filter.useFilter(text_test, remove);
			
			double[] values = new double[train_attributes.size()];			
			for (int at=0; at<newData.numAttributes(); at++){			
				int pos  = train_attributes.get(newData.attribute(at).name());		// get the index of this attribute in the train set
				values[pos] = newData.get(0).value(at);					// ...and its value
			}
			training_text.add(0, new SparseInstance(1.0, values));
			Instances tw = new Instances(training_text,0,1);
			training_text.remove(0);
			return tw;
		} catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
	/**StringToWordVector filter initialization.*/
	private void initializeFilter(){
		stwv = new StringToWordVector();
		stwv.setLowerCaseTokens(true);
		stwv.setMinTermFreq(1);
		stwv.setUseStoplist(false);
		stwv.setTFTransform(false);
		stwv.setIDFTransform(false);		
		stwv.setWordsToKeep(1000000000);
		NGramTokenizer tokenizer = new NGramTokenizer();
		tokenizer.setNGramMinSize(2);
		tokenizer.setNGramMaxSize(2);
		stwv.setTokenizer(tokenizer);
		stwv.setAttributeIndices("first");
	}
}