Java Code Examples for weka.filters.unsupervised.attribute.StringToWordVector

The following examples show how to use weka.filters.unsupervised.attribute.StringToWordVector. They are extracted from open-source projects. You can vote up the examples you find useful or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.
Example 1
Source Project: sentiment-analysis   Source File: Trainer.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns the text-based representation of a dataset.
 *
 * <p>The dataset is loaded from {@code fileText}, the attribute at index 1 is
 * marked as the class, and the data is passed through a
 * {@link StringToWordVector} filter that lower-cases tokens, uses no stoplist,
 * applies neither TF nor IDF weighting, and tokenizes into 2-grams only.
 *
 * @param fileText location of the dataset, handed to {@code DataSource}
 * @return the filtered dataset
 * @throws Exception if the dataset cannot be loaded or the filter fails
 */
private Instances getText(String fileText) throws Exception{
	DataSource ds = new DataSource(fileText);
	Instances data = ds.getDataSet();
	data.setClassIndex(1);

	// Only 2-grams are produced (min size == max size == 2).
	NGramTokenizer tokenizer = new NGramTokenizer();
	tokenizer.setNGramMinSize(2);
	tokenizer.setNGramMaxSize(2);

	StringToWordVector filter = new StringToWordVector();
	filter.setLowerCaseTokens(true);
	filter.setMinTermFreq(1);
	filter.setUseStoplist(false);
	filter.setTFTransform(false);
	filter.setIDFTransform(false);
	// Very large limit so that the words-to-keep cap does not prune the dictionary in practice.
	filter.setWordsToKeep(1000000000);
	filter.setTokenizer(tokenizer);

	// Weka requires setInputFormat to be the last call before the filter is
	// applied, i.e. after every option has been configured.
	filter.setInputFormat(data);

	return weka.filters.Filter.useFilter(data, filter);
}
 
Example 2
Source Project: sentiment-analysis   Source File: Trainer.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns the feature-based representation of a dataset.
 *
 * <p>The dataset is loaded from {@code fileFeature}, the attribute at index 1
 * is marked as the class, and the data is passed through a
 * {@link StringToWordVector} filter that lower-cases tokens, uses no stoplist,
 * applies neither TF nor IDF weighting, and tokenizes into 1-grams only.
 *
 * @param fileFeature location of the dataset, handed to {@code DataSource}
 * @return the filtered dataset
 * @throws Exception if the dataset cannot be loaded or the filter fails
 */
private Instances getFeature(String fileFeature) throws Exception{
	DataSource ds = new DataSource(fileFeature);
	Instances data = ds.getDataSet();
	data.setClassIndex(1);

	// Only 1-grams are produced (min size == max size == 1).
	NGramTokenizer tokenizer = new NGramTokenizer();
	tokenizer.setNGramMinSize(1);
	tokenizer.setNGramMaxSize(1);

	StringToWordVector filter = new StringToWordVector();
	filter.setLowerCaseTokens(true);
	filter.setMinTermFreq(1);
	filter.setUseStoplist(false);
	filter.setTFTransform(false);
	filter.setIDFTransform(false);
	// Very large limit so that the words-to-keep cap does not prune the dictionary in practice.
	filter.setWordsToKeep(1000000000);
	filter.setTokenizer(tokenizer);

	// Weka requires setInputFormat to be the last call before the filter is
	// applied, i.e. after every option has been configured.
	filter.setInputFormat(data);

	return weka.filters.Filter.useFilter(data, filter);
}
 
Example 3
Source Project: sentiment-analysis   Source File: Trainer.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns the combined (text+POS) representation of a dataset.
 *
 * <p>The dataset is loaded from {@code fileComplex}, the attribute at index 1
 * is marked as the class, and the data is passed through a
 * {@link StringToWordVector} filter that lower-cases tokens, uses no stoplist,
 * applies neither TF nor IDF weighting, and tokenizes into 2-grams only.
 *
 * @param fileComplex location of the dataset, handed to {@code DataSource}
 * @return the filtered dataset
 * @throws Exception if the dataset cannot be loaded or the filter fails
 */
private Instances getComplex(String fileComplex) throws Exception{
	DataSource ds = new DataSource(fileComplex);
	Instances data = ds.getDataSet();
	data.setClassIndex(1);

	// Only 2-grams are produced (min size == max size == 2).
	NGramTokenizer tokenizer = new NGramTokenizer();
	tokenizer.setNGramMinSize(2);
	tokenizer.setNGramMaxSize(2);

	StringToWordVector filter = new StringToWordVector();
	filter.setLowerCaseTokens(true);
	filter.setMinTermFreq(1);
	filter.setUseStoplist(false);
	filter.setTFTransform(false);
	filter.setIDFTransform(false);
	// Very large limit so that the words-to-keep cap does not prune the dictionary in practice.
	filter.setWordsToKeep(1000000000);
	filter.setTokenizer(tokenizer);

	// Weka requires setInputFormat to be the last call before the filter is
	// applied, i.e. after every option has been configured.
	filter.setInputFormat(data);

	return weka.filters.Filter.useFilter(data, filter);
}
 
Example 4
Source Project: sentiment-analysis   Source File: SentimentAnalyser.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the {@code stwv} filter and configures it: lower-cased tokens, no
 * stoplist, no TF/IDF weighting, 2-gram tokenization, restricted to the
 * "first" attribute.
 */
private void initializeFilter(){
	// Tokenizer emitting 2-grams only (min size == max size == 2).
	NGramTokenizer bigrams = new NGramTokenizer();
	bigrams.setNGramMaxSize(2);
	bigrams.setNGramMinSize(2);

	stwv = new StringToWordVector();
	stwv.setAttributeIndices("first");
	stwv.setTokenizer(bigrams);
	stwv.setWordsToKeep(1000000000);
	stwv.setMinTermFreq(1);
	stwv.setLowerCaseTokens(true);
	stwv.setUseStoplist(false);
	stwv.setIDFTransform(false);
	stwv.setTFTransform(false);
}
 
Example 5
Source Project: sentiment-analysis   Source File: PolarityClassifier.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the {@code stwv} filter used for the representations and configures
 * it: lower-cased tokens, no stoplist, no TF/IDF weighting, 2-gram
 * tokenization.
 */
private void initialiseTextFilter(){
	// Tokenizer emitting 2-grams only (min size == max size == 2).
	NGramTokenizer bigrams = new NGramTokenizer();
	bigrams.setNGramMaxSize(2);
	bigrams.setNGramMinSize(2);

	stwv = new StringToWordVector();
	stwv.setTokenizer(bigrams);
	stwv.setWordsToKeep(1000000000);
	stwv.setMinTermFreq(1);
	stwv.setLowerCaseTokens(true);
	stwv.setUseStoplist(false);
	stwv.setIDFTransform(false);
	stwv.setTFTransform(false);
}
 
Example 6
Source Project: jMetal   Source File: DecisionTreeEstimator.java    License: MIT License 4 votes vote down vote up
/**
 * Classifies one objective of {@code testSolution} with a J48 tree trained on
 * the objectives of every solution in {@code solutionList}.
 *
 * <p>Each (solution, objective i) pair becomes one instance with three
 * attributes: the objective value (numeric), the label {@code VALUE_STRING + i}
 * (nominal) and {@code solution.toString() + i} (string). The string attribute
 * is converted with {@link StringToWordVector}; the training and the test set
 * go through the same filter so both share one attribute layout.
 *
 * @param index position of the test instance to classify; also the objective
 *     returned by the fallback
 * @param testSolution solution whose objectives form the test set
 * @return the value returned by {@code classifyInstance} for the selected test
 *     instance, or {@code testSolution.getObjective(index)} if any step throws
 */
public double doPrediction(int index,S testSolution) {
  double result = 0.0d;

  try {
    int numberOfObjectives = solutionList.get(0).getNumberOfObjectives();

    // Attributes: numeric objective value, nominal objective label, raw string.
    Attribute attr = new Attribute("my-numeric");

    ArrayList<String> myNomVals = new ArrayList<>();
    for (int i = 0; i < numberOfObjectives; i++) {
      myNomVals.add(VALUE_STRING+i);
    }
    Attribute attr1 = new Attribute(NOMINAL_STRING, myNomVals);

    // A null value list makes this a string attribute.
    Attribute attr2 = new Attribute(MY_STRING, (List<String>)null);

    ArrayList<Attribute> attrs = new ArrayList<>();
    attrs.add(attr);
    attrs.add(attr1);
    attrs.add(attr2);

    // Training set: one instance per (solution, objective) pair.
    Instances dataset = new Instances("my_dataset", attrs, 0);
    for (S solution : solutionList) {
      for (int i = 0; i < numberOfObjectives; i++) {
        double[] attValues = new double[dataset.numAttributes()];
        attValues[0] = solution.getObjective(i);
        attValues[1] = dataset.attribute(NOMINAL_STRING).indexOfValue(VALUE_STRING+i);
        attValues[2] = dataset.attribute(MY_STRING).addStringValue(solution.toString()+i);
        dataset.add(new DenseInstance(1.0, attValues));
      }
    }

    // Test set: one instance per objective of the solution under test.
    Instances datasetTest = new Instances("my_dataset_test", attrs, 0);
    for (int i = 0; i < numberOfObjectives; i++) {
      Instance test = new DenseInstance(3);
      test.setValue(attr, testSolution.getObjective(i));
      test.setValue(attr1, VALUE_STRING+i);
      test.setValue(attr2, testSolution.toString()+i);
      datasetTest.add(test);
    }

    // Preprocess strings (almost no classifier supports them).
    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(dataset);
    dataset = Filter.useFilter(dataset, filter);
    // Reuse the already-initialised filter so the test instances get the same
    // attribute layout as the data the classifier is trained on.
    datasetTest = Filter.useFilter(datasetTest, filter);

    // NOTE(review): assumes the filter leaves the nominal attribute at index 1 - confirm.
    dataset.setClassIndex(1);
    datasetTest.setClassIndex(1);

    // Build classifier
    Classifier classifier = new J48();
    classifier.buildClassifier(dataset);

    // Evaluate on the test set; the statistics themselves are not read here.
    Evaluation eval = new Evaluation(datasetTest);
    eval.evaluateModel(classifier, datasetTest);
    result = classifier.classifyInstance(datasetTest.get(index));
  } catch (Exception e) {
    // Best-effort fallback: any failure yields the solution's own objective value.
    result = testSolution.getObjective(index);
  }
  return result;
}
 
Example 7
Source Project: jMetal   Source File: DecisionTreeEstimator.java    License: MIT License 4 votes vote down vote up
/**
 * Classifies one decision variable of {@code testSolution} with a J48 tree
 * trained on the variables of every solution in {@code solutionList}.
 *
 * <p>Each (solution, variable i) pair becomes one instance with three
 * attributes: the variable value (numeric), the label {@code VALUE_STRING + i}
 * (nominal) and {@code solution.toString() + i} (string). The string attribute
 * is converted with {@link StringToWordVector}; the training and the test set
 * go through the same filter so both share one attribute layout.
 *
 * <p>Solutions are cast to {@code DoubleSolution} to read their variables.
 *
 * @param index position of the test instance to classify; also the variable
 *     returned by the fallback
 * @param testSolution solution whose variables form the test set
 * @return the value returned by {@code classifyInstance} for the selected test
 *     instance, or variable {@code index} of {@code testSolution} if any step
 *     inside the try block throws
 */
public double doPredictionVariable(int index,S testSolution) {
  double result = 0.0d;

  try {
    int numberOfVariables = solutionList.get(0).getNumberOfVariables();

    // Attributes: numeric variable value, nominal variable label, raw string.
    Attribute attr = new Attribute("my-numeric");

    ArrayList<String> myNomVals = new ArrayList<>();
    for (int i = 0; i < numberOfVariables; i++) {
      myNomVals.add(VALUE_STRING+i);
    }
    Attribute attr1 = new Attribute(NOMINAL_STRING, myNomVals);

    // A null value list makes this a string attribute.
    Attribute attr2 = new Attribute(MY_STRING, (List<String>)null);

    ArrayList<Attribute> attrs = new ArrayList<>();
    attrs.add(attr);
    attrs.add(attr1);
    attrs.add(attr2);

    // Training set: one instance per (solution, variable) pair.
    Instances dataset = new Instances("my_dataset", attrs, 0);
    for (S solution : solutionList) {
      for (int i = 0; i < numberOfVariables; i++) {
        double[] attValues = new double[dataset.numAttributes()];
        attValues[0] = ((DoubleSolution)solution).getVariable(i);
        attValues[1] = dataset.attribute(NOMINAL_STRING).indexOfValue(VALUE_STRING+i);
        attValues[2] = dataset.attribute(MY_STRING).addStringValue(solution.toString()+i);
        dataset.add(new DenseInstance(1.0, attValues));
      }
    }

    // Test set: one instance per variable of the solution under test.
    Instances datasetTest = new Instances("my_dataset_test", attrs, 0);
    for (int i = 0; i < numberOfVariables; i++) {
      Instance test = new DenseInstance(3);
      test.setValue(attr, ((DoubleSolution)testSolution).getVariable(i));
      test.setValue(attr1, VALUE_STRING+i);
      test.setValue(attr2, testSolution.toString()+i);
      datasetTest.add(test);
    }

    // Preprocess strings (almost no classifier supports them).
    StringToWordVector filter = new StringToWordVector();
    filter.setInputFormat(dataset);
    dataset = Filter.useFilter(dataset, filter);
    // Reuse the already-initialised filter so the test instances get the same
    // attribute layout as the data the classifier is trained on.
    datasetTest = Filter.useFilter(datasetTest, filter);

    // NOTE(review): assumes the filter leaves the nominal attribute at index 1 - confirm.
    dataset.setClassIndex(1);
    datasetTest.setClassIndex(1);

    // Build classifier
    Classifier classifier = new J48();
    classifier.buildClassifier(dataset);

    // Evaluate on the test set; the statistics themselves are not read here.
    Evaluation eval = new Evaluation(datasetTest);
    eval.evaluateModel(classifier, datasetTest);
    result = classifier.classifyInstance(datasetTest.get(index));
  } catch (Exception e) {
    // Best-effort fallback: any failure yields the solution's own variable value.
    result = ((DoubleSolution)testSolution).getVariable(index);
  }
  return result;
}