Java Code Examples for weka.core.Instances#randomize()

The following examples show how to use weka.core.Instances#randomize(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source File: AttributeSelection.java    From tsml with GNU General Public License v3.0 7 votes vote down vote up
/**
  * Cross-validates the attribute selection over m_numFolds folds. With
  * subset evaluators the number of times each attribute is selected over
  * the cross validation is reported. For attribute evaluators, the
  * average merit and average ranking + std deviation is reported for
  * each attribute.
  * <p>
  * A copy of the training instances is shuffled with a generator seeded
  * from m_seed. Unless the evaluator is one of the unsupervised kinds,
  * the copy is also stratified when its class attribute is nominal.
  *
  * @return the results of cross validation as a String
  * @exception Exception if an error occurs during cross validation
  */
 public String CrossValidateAttributes () throws Exception {
   final Random rng = new Random(m_seed);
   final Instances shuffled = new Instances(m_trainInstances);
   shuffled.randomize(rng);

   // Stratification needs a class, so it is skipped for unsupervised evaluators.
   final boolean unsupervised =
       m_ASEvaluator instanceof UnsupervisedSubsetEvaluator
       || m_ASEvaluator instanceof UnsupervisedAttributeEvaluator;
   if (!unsupervised && shuffled.classAttribute().isNominal()) {
     shuffled.stratify(m_numFolds);
   }

   // Run attribute selection on the training portion of every fold.
   for (int fold = 0; fold < m_numFolds; fold++) {
     selectAttributesCVSplit(shuffled.trainCV(m_numFolds, fold, rng));
   }

   return CVResultsString();
 }
 
Example 2
Source File: Bagging.java    From tsml with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Returns a training set for a particular iteration.
 * <p>
 * The bag is drawn by weighted resampling of m_data with a generator seeded
 * from m_Seed + iteration. When out-of-bag calculation is enabled the
 * in-bag flags for this iteration are recorded in m_inBag and the bag is
 * returned at full size. Otherwise, if the configured bag size is smaller
 * than the dataset, the bag is shuffled and truncated to that size.
 * 
 * @param iteration the number of the iteration for the requested training set.
 * @return the training set for the supplied iteration number
 * @throws Exception if something goes wrong when generating a training set.
 */
protected synchronized Instances getTrainingSet(int iteration) throws Exception {
  final int numInstances = m_data.numInstances();

  // Multiply in long: numInstances * percent overflows int once the dataset
  // exceeds roughly Integer.MAX_VALUE / 100 instances. Clamping to the
  // dataset size keeps the value in int range and does not change which
  // branch is taken below (truncation only happens when bagSize < numInstances).
  final long scaledSize = (long) numInstances * m_BagSizePercent / 100;
  final int bagSize = (int) Math.min(scaledSize, numInstances);

  final Random r = new Random(m_Seed + iteration);
  Instances bagData;

  // create the in-bag dataset
  if (m_CalcOutOfBag) {
    m_inBag[iteration] = new boolean[numInstances];
    bagData = m_data.resampleWithWeights(r, m_inBag[iteration]);
  } else {
    bagData = m_data.resampleWithWeights(r);
    if (bagSize < numInstances) {
      // Shuffle first so the retained prefix is a random subset of the bag.
      bagData.randomize(r);
      bagData = new Instances(bagData, 0, bagSize);
    }
  }

  return bagData;
}
 
Example 3
Source File: EvaluationUtils.java    From tsml with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Generate a bunch of predictions ready for processing, by performing a
 * cross-validation on the supplied dataset.
 * <p>
 * A copy of the data is shuffled with a generator seeded from m_Seed and,
 * for a nominal class with more than one fold, stratified. The predictions
 * of every fold are appended to one vector in fold order.
 *
 * @param classifier the Classifier to evaluate
 * @param data the dataset
 * @param numFolds the number of folds in the cross-validation.
 * @return the predictions collected over all folds
 * @exception Exception if an error occurs
 */
public FastVector getCVPredictions(Classifier classifier, 
                                   Instances data, 
                                   int numFolds) 
  throws Exception {

  FastVector predictions = new FastVector();
  Instances runInstances = new Instances(data);
  Random random = new Random(m_Seed);
  runInstances.randomize(random);
  if (runInstances.classAttribute().isNominal() && (numFolds > 1)) {
    runInstances.stratify(numFolds);
  }
  for (int fold = 0; fold < numFolds; fold++) {
    Instances train = runInstances.trainCV(numFolds, fold, random);
    Instances test = runInstances.testCV(numFolds, fold);
    FastVector foldPred = getTrainTestPredictions(classifier, train, test);
    predictions.appendElements(foldPred);
  }
  return predictions;
}
 
Example 4
Source File: InstanceTools.java    From tsml with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Draws a per-class random sub-sample of the data.
 * For every class value reported by the class counts, the matching bin of
 * instances is shuffled and its first (int)(proportion * count) instances
 * are copied to the result.
 *
 * @param data the instances to sample from
 * @param proportion the fraction of each class to keep
 * @param seed seed for the random generator used to shuffle each class bin
 * @return a new set of instances with the same header as data
 */
public static Instances subSampleFixedProportion(Instances data, double proportion, long seed){
   final Map<Double, Instances> instancesByClass = createClassInstancesMap(data);
   final ClassCounts classCounts = new TreeSetClassCounts(data);
   final Random rng = new Random(seed);

   // start from an empty set that shares the header of the input
   final Instances sample = new Instances(data, 0);

   for (Iterator<Double> it = classCounts.keySet().iterator(); it.hasNext(); ) {
       final double classValue = it.next();
       final int classSize = classCounts.get(classValue);
       final int toKeep = (int) (proportion * classSize);

       final Instances classBin = instancesByClass.get(classValue);
       classBin.randomize(rng);

       // the shuffled prefix of the bin is this class's share of the sample
       sample.addAll(classBin.subList(0, toKeep));
   }
   return sample;
}
 
Example 5
Source File: StatUtils.java    From meka with GNU General Public License v3.0 5 votes vote down vote up
/**
 * LEAD - Performs LEAD on dataset 'D', using BR with base classifier 'h', under random seed 'r'.
 * <br>
 * A shuffled copy of D is split 60/40 into train and test parts; BR is evaluated on that split and
 * the result is handed to LEAD2 together with the test part.
 * <br>
 * WARNING: changing this method will affect the performance of e.g., BCC -- on the other hand the
 * original BCC paper did not use LEAD, so don't worry.
 */
public static double[][] LEAD(Instances D, Classifier h, Random r)  throws Exception {
	final Instances shuffled = new Instances(D);
	shuffled.randomize(r);

	// first 60% (integer division) for training, the remainder for testing
	final int total = shuffled.numInstances();
	final int trainSize = total * 60 / 100;
	final Instances trainPart = new Instances(shuffled, 0, trainSize);
	final Instances testPart = new Instances(shuffled, trainPart.numInstances(), total - trainPart.numInstances());

	final BR binaryRelevance = new BR();
	binaryRelevance.setClassifier(h);
	final Result evaluation = Evaluation.evaluateModel((MultiLabelClassifier)binaryRelevance, trainPart, testPart, "PCut1", "1");
	return LEAD2(testPart, evaluation);
}
 
Example 6
Source File: ThresholdSelector.java    From tsml with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Collects the classifier predictions using the specified evaluation method.
 * <p>
 * EVAL_TRAINING_SET trains and tests on the supplied instances,
 * EVAL_CROSS_VALIDATION delegates to a cross-validation, and
 * EVAL_TUNED_SPLIT uses a single train/test fold of a shuffled, stratified
 * copy of the data.
 *
 * @param instances the set of <code>Instances</code> to generate
 * predictions for.
 * @param mode the evaluation mode.
 * @param numFolds the number of folds to use if not evaluating on the
 * full training set.
 * @return a <code>FastVector</code> containing the predictions.
 * @throws Exception if an error occurs generating the predictions.
 */
protected FastVector getPredictions(Instances instances, int mode, int numFolds) 
  throws Exception {

  final EvaluationUtils evalUtils = new EvaluationUtils();
  evalUtils.setSeed(m_Seed);

  if (mode == EVAL_TRAINING_SET) {
    return evalUtils.getTrainTestPredictions(m_Classifier, instances, instances);
  }
  if (mode == EVAL_CROSS_VALIDATION) {
    return evalUtils.getCVPredictions(m_Classifier, instances, numFolds);
  }
  if (mode != EVAL_TUNED_SPLIT) {
    throw new RuntimeException("Unrecognized evaluation mode");
  }

  // EVAL_TUNED_SPLIT: shuffle and stratify a private copy of the data.
  final Instances shuffled = new Instances(instances);
  final Random rng = new Random(m_Seed);
  shuffled.randomize(rng);
  shuffled.stratify(numFolds);

  // Take the first fold whose train and eval parts both pass
  // checkForInstance (i.e. both contain a positive instance). If no fold
  // qualifies, the split of the last fold tried is used.
  Instances trainPart = null;
  Instances evalPart = null;
  for (int fold = 0; fold < numFolds; fold++) {
    trainPart = shuffled.trainCV(numFolds, fold, rng);
    evalPart = shuffled.testCV(numFolds, fold);
    final boolean bothUsable = checkForInstance(trainPart) && checkForInstance(evalPart);
    if (bothUsable) {
      break;
    }
  }
  return evalUtils.getTrainTestPredictions(m_Classifier, trainPart, evalPart);
}
 
Example 7
Source File: WekaDeeplearning4jExamples.java    From wekaDeeplearning4j with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Trains and evaluates a Dl4jMlpClassifier backed by a zoo model on the
 * small plant-seedlings image dataset and prints the evaluation summary
 * and confusion matrix.
 * Note: despite the method name, the zoo model configured here is
 * KerasEfficientNet in its EFFICIENTNET_B1 variation.
 *
 * @throws Exception if loading, training or evaluation fails
 */
private static void dl4jResnet50() throws Exception {
        final String imageFolder = "src/test/resources/nominal/plant-seedlings-small";
        final int numFolds = 5;
        final int heldOutFold = 0;

        // Load the image metadata; attribute 1 is used as the class.
        final ImageDirectoryLoader directoryLoader = new ImageDirectoryLoader();
        directoryLoader.setInputDirectory(new File(imageFolder));
        final Instances dataset = directoryLoader.getDataSet();
        dataset.setClassIndex(1);

        final Dl4jMlpClassifier model = new Dl4jMlpClassifier();
        model.setNumEpochs(3);

        final KerasEfficientNet zooModel = new KerasEfficientNet();
        zooModel.setVariation(EfficientNet.VARIATION.EFFICIENTNET_B1);
        model.setZooModel(zooModel);

        final ImageInstanceIterator imageIterator = new ImageInstanceIterator();
        imageIterator.setImagesLocation(new File(imageFolder));
        model.setInstanceIterator(imageIterator);

        // Shuffle with a fixed seed, stratify, and hold out one of the folds.
        dataset.randomize(new Random(0));
        dataset.stratify(numFolds);
        final Instances trainSplit = dataset.trainCV(numFolds, heldOutFold);
        final Instances testSplit = dataset.testCV(numFolds, heldOutFold);

        // Fit on the training folds, then score the held-out fold.
        model.buildClassifier(trainSplit);
        final Evaluation evaluation = new Evaluation(testSplit);
        evaluation.evaluateModel(model, testSplit);

        // Report summary statistics and the confusion matrix.
        System.out.println(evaluation.toSummaryString());
        System.out.println(evaluation.toMatrixString());
    }
 
Example 8
Source File: Stacking.java    From tsml with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds the stacked classifier: the meta level is generated from a
 * shuffled (and, for a nominal class, stratified) copy of the training
 * data, after which all base classifiers are rebuilt on the full copy.
 *
 * @param data the training data to be used for generating the
 * stacked classifier.
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data) throws Exception {

  if (m_MetaClassifier == null) {
    throw new IllegalArgumentException("No meta classifier has been set");
  }

  // can classifier handle the data?
  getCapabilities().testWithFail(data);

  // keep the header only, as the reference format of the base data
  m_BaseFormat = new Instances(data, 0);

  // work on a copy from which instances with a missing class are removed
  final Instances cleaned = new Instances(data);
  cleaned.deleteWithMissingClass();

  final Random rng = new Random(m_Seed);
  cleaned.randomize(rng);
  final boolean nominalClass = cleaned.classAttribute().isNominal();
  if (nominalClass) {
    cleaned.stratify(m_NumFolds);
  }

  // Create meta level
  generateMetaLevel(cleaned, rng);

  // restart the executor pool because at the end of processing
  // a set of classifiers it gets shutdown to prevent the program
  // executing as a server
  super.buildClassifier(cleaned);

  // Rebuild all the base classifiers on the full training data
  buildClassifiers(cleaned);
}
 
Example 9
Source File: Ridor.java    From tsml with GNU General Public License v3.0 5 votes vote down vote up
/**
    * Builds a single rule learner with REP dealing with 2 classes.
    * This rule learner always tries to predict the class with label 
    * m_Class.
    * <p>
    * After validating the data, a shuffled and stratified copy is split
    * into a grow set (all folds but the last) and a prune set (the last
    * fold); the rule is grown on the former and pruned on the latter.
    *
    * @param instances the training data
    * @throws Exception if classifier can't be built successfully
    */
   public void buildClassifier(Instances instances) throws Exception {
     m_ClassAttribute = instances.classAttribute();
     if (!m_ClassAttribute.isNominal()) {
       throw new UnsupportedClassTypeException(" Only nominal class, please.");
     }
     if (instances.numClasses() != 2) {
       throw new Exception(" Only 2 classes, please.");
     }

     final Instances working = new Instances(instances);
     if (Utils.eq(working.sumOfWeights(), 0)) {
       throw new Exception(" No training data.");
     }

     // Anything left after this point has a known class label.
     working.deleteWithMissingClass();
     if (Utils.eq(working.sumOfWeights(), 0)) {
       throw new Exception(" The class labels of all the training data are missing.");
     }

     if (working.numInstances() < m_Folds) {
       throw new Exception(" Not enough data for REP.");
     }

     m_Antds = new FastVector();

     /* Split data into Grow and Prune: the last fold is the prune set */
     m_Random = new Random(m_Seed);
     working.randomize(m_Random);
     working.stratify(m_Folds);
     final int pruneFold = m_Folds - 1;
     final Instances growSet = working.trainCV(m_Folds, pruneFold, m_Random);
     final Instances pruneSet = working.testCV(m_Folds, pruneFold);

     grow(growSet);      // Build this rule

     prune(pruneSet);    // Prune this rule
   }
 
Example 10
Source File: WekaDeeplearning4jExamples.java    From wekaDeeplearning4j with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Fits a zoo-model based Dl4jMlpClassifier on four of five stratified
 * folds of the small plant-seedlings image dataset, evaluates it on the
 * remaining fold and prints the summary statistics and confusion matrix.
 * Note: although the method is called dl4jResnet50, the zoo model set on
 * the classifier is KerasEfficientNet (variation EFFICIENTNET_B1).
 *
 * @throws Exception if loading, training or evaluation fails
 */
private static void dl4jResnet50() throws Exception {
        final String datasetDir = "src/test/resources/nominal/plant-seedlings-small";
        final int foldCount = 5;
        final int testFold = 0;

        // Read the dataset description from the image directory.
        final ImageDirectoryLoader imageLoader = new ImageDirectoryLoader();
        imageLoader.setInputDirectory(new File(datasetDir));
        final Instances instances = imageLoader.getDataSet();
        instances.setClassIndex(1);

        // Configure the network: 3 epochs on top of the chosen zoo model.
        final Dl4jMlpClassifier network = new Dl4jMlpClassifier();
        network.setNumEpochs(3);

        final KerasEfficientNet efficientNet = new KerasEfficientNet();
        efficientNet.setVariation(EfficientNet.VARIATION.EFFICIENTNET_B1);
        network.setZooModel(efficientNet);

        final ImageInstanceIterator images = new ImageInstanceIterator();
        images.setImagesLocation(new File(datasetDir));
        network.setInstanceIterator(images);

        // Stratify and split the data (fixed seed for a repeatable split)
        final Random rng = new Random(0);
        instances.randomize(rng);
        instances.stratify(foldCount);
        final Instances trainFolds = instances.trainCV(foldCount, testFold);
        final Instances heldOut = instances.testCV(foldCount, testFold);

        // Build the classifier on the training data
        network.buildClassifier(trainFolds);

        // Evaluate the model on test data
        final Evaluation result = new Evaluation(heldOut);
        result.evaluateModel(network, heldOut);

        // Output some summary statistics
        System.out.println(result.toSummaryString());
        System.out.println(result.toMatrixString());
    }
 
Example 11
Source File: InstanceTools.java    From tsml with GNU General Public License v3.0 5 votes vote down vote up
/** 
 * Modified from Aaron's shapelet resampling code in development.ReasamplingExperiments. Used to resample
 * train and test instances while maintaining original train/test class distributions.
 * <p>
 * Train and test are pooled and grouped by class value. Each class bin is shuffled, then as many
 * instances as the original train set held for that class go to the new train set and the rest of
 * the bin goes to the new test set. A seed of 0 means "no resampling": copies of the inputs are returned.
 * 
 * @param train Input training instances
 * @param test Input test instances
 * @param seed Used to create reproducible folds by using a consistent seed value
 * @return Instances[] with two elements; [0] is the output training instances, [1] output test instances
 */
public static Instances[] resampleTrainAndTestInstances(Instances train, Instances test, long seed){
    if(seed==0){
        // Copies are returned for consistency with the resampled case; handing back
        // the originals caused a bug in diagnostics elsewhere.
        return new Instances[]{new Instances(train), new Instances(test)};
    }

    // pool train and test into a single set
    final Instances pooled = new Instances(train);
    pooled.addAll(test);

    final ClassCounts trainCounts = new TreeSetClassCounts(train);
    final Map<Double, Instances> binsByClass = createClassInstancesMap(pooled);
    final Random rng = new Random(seed);

    // empty result sets sharing the pooled header
    final Instances resampledTrain = new Instances(pooled, 0);
    final Instances resampledTest = new Instances(pooled, 0);

    for (Map.Entry<Double, Instances> entry : binsByClass.entrySet()) {
        final double classValue = entry.getKey();
        final int trainShare = trainCounts.get(classValue);
        final Instances classBin = entry.getValue();
        classBin.randomize(rng);

        // shuffled prefix -> train, remainder -> test
        resampledTrain.addAll(classBin.subList(0, trainShare));
        resampledTest.addAll(classBin.subList(trainShare, classBin.size()));
    }

    return new Instances[]{resampledTrain, resampledTest};
}
 
Example 12
Source File: StatUtils.java    From meka with GNU General Public License v3.0 5 votes vote down vote up
/**
 * LEAD - Performs LEAD on dataset 'D', using BR with base classifier 'h', under random seed 'r'.
 * <br>
 * A shuffled copy of D is split 60/40 into train and test parts, BR is evaluated on that split, and
 * the evaluation result is passed on together with the test part and 'MDType'.
 */
public static double[][] LEAD(Instances D, Classifier h, Random r, String MDType)  throws Exception {
	final Instances shuffled = new Instances(D);
	shuffled.randomize(r);

	// first 60% (integer division) for training, the remainder for testing
	final int total = shuffled.numInstances();
	final Instances trainPart = new Instances(shuffled, 0, total * 60 / 100);
	final int trainSize = trainPart.numInstances();
	final Instances testPart = new Instances(shuffled, trainSize, total - trainSize);

	final BR binaryRelevance = new BR();
	binaryRelevance.setClassifier(h);
	final Result evaluation = Evaluation.evaluateModel((MultiLabelClassifier)binaryRelevance, trainPart, testPart, "PCut1", "1");

	return LEAD(testPart, evaluation, MDType);
}
 
Example 13
Source File: ConjunctiveRule.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
  * Builds a single rule learner with REP dealing with nominal classes or
  * numeric classes.
  * For nominal classes, this rule learner predicts a distribution on
  * the classes.
  * For numeric classes, this learner predicts a single value.
  * <p>
  * When m_NumAntds is -1 the data is shuffled, stratified and split into
  * a grow set and a prune set (the last fold); otherwise the rule is
  * grown on all of the data and no pruning takes place.
  *
  * @param instances the training data
  * @throws Exception if classifier can't be built successfully
  */
 public void buildClassifier(Instances instances) throws Exception {
   // can classifier handle the data?
   getCapabilities().testWithFail(instances);

   // work on a copy without the instances whose class is missing
   final Instances working = new Instances(instances);
   working.deleteWithMissingClass();

   if (working.numInstances() < m_Folds) {
     throw new Exception("Not enough data for REP.");
   }

   // a numeric class is treated as a single "class"
   m_ClassAttribute = working.classAttribute();
   final boolean nominalClass = m_ClassAttribute.isNominal();
   m_NumClasses = nominalClass ? m_ClassAttribute.numValues() : 1;

   m_Antds = new FastVector();
   m_DefDstr = new double[m_NumClasses];
   m_Cnsqt = new double[m_NumClasses];
   m_Targets = new FastVector();
   m_Random = new Random(m_Seed);

   if (m_NumAntds == -1) {
     working.randomize(m_Random);

     // Split data into Grow and Prune
     working.stratify(m_Folds);
     final int pruneFold = m_Folds - 1;
     final Instances growSet = working.trainCV(m_Folds, pruneFold, m_Random);
     final Instances pruneSet = working.testCV(m_Folds, pruneFold);

     grow(growSet);      // Build this rule
     prune(pruneSet);    // Prune this rule
   } else {
     grow(working);
   }

   if (m_ClassAttribute.isNominal()) {
     Utils.normalize(m_Cnsqt);
     // the default distribution is only normalized when its sum is greater than zero
     if (Utils.gr(Utils.sum(m_DefDstr), 0)) {
       Utils.normalize(m_DefDstr);
     }
   }
 }
 
Example 14
Source File: Dl4jMlpTest.java    From wekaDeeplearning4j with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Trains a three-layer text CNN on a 2% slice of the IMDB data using the
 * Google News word vectors and runs a holdout evaluation.
 * The batch normalization layers are constructed but are not part of the
 * layer stack handed to the classifier.
 */
@Test
public void testTextCnnClassification() throws Exception {
  // Text iterator backed by pre-trained word vectors
  CnnTextEmbeddingInstanceIterator textIterator = new CnnTextEmbeddingInstanceIterator();
  textIterator.setTrainBatchSize(128);
  textIterator.setWordVectorLocation(DatasetLoader.loadGoogleNewsVectors());
  clf.setInstanceIterator(textIterator);

  // The embedding size is read off the first vocabulary entry
  textIterator.initialize();
  final WordVectors embeddings = textIterator.getWordVectors();
  final String firstWord = embeddings.vocab().wordAtIndex(0);
  int vectorSize = embeddings.getWordVector(firstWord).length;

  // Three convolutions that differ only in kernel height (4, 3, 2)
  ConvolutionLayer convHeight4 = new ConvolutionLayer();
  convHeight4.setKernelSize(new int[]{4, vectorSize});
  convHeight4.setNOut(10);
  convHeight4.setStride(new int[]{1, vectorSize});
  convHeight4.setConvolutionMode(ConvolutionMode.Same);
  convHeight4.setActivationFunction(new ActivationReLU());

  BatchNormalization norm1 = new BatchNormalization();

  ConvolutionLayer convHeight3 = new ConvolutionLayer();
  convHeight3.setKernelSize(new int[]{3, vectorSize});
  convHeight3.setNOut(10);
  convHeight3.setStride(new int[]{1, vectorSize});
  convHeight3.setConvolutionMode(ConvolutionMode.Same);
  convHeight3.setActivationFunction(new ActivationReLU());

  BatchNormalization norm2 = new BatchNormalization();

  ConvolutionLayer convHeight2 = new ConvolutionLayer();
  convHeight2.setKernelSize(new int[]{2, vectorSize});
  convHeight2.setNOut(10);
  convHeight2.setStride(new int[]{1, vectorSize});
  convHeight2.setConvolutionMode(ConvolutionMode.Same);
  convHeight2.setActivationFunction(new ActivationReLU());

  BatchNormalization norm3 = new BatchNormalization();

  GlobalPoolingLayer pooling = new GlobalPoolingLayer();

  OutputLayer output = new OutputLayer();

  // Layer stack: convolutions, global pooling, output
  clf.setLayers(convHeight4, convHeight3, convHeight2, pooling, output);
  clf.setCacheMode(CacheMode.MEMORY);
  final EpochListener epochListener = new EpochListener();
  epochListener.setN(1);
  clf.setIterationListener(epochListener);

  clf.setEarlyStopping(new EarlyStopping(10, 15));
  clf.setDebug(true);

  // Network configuration: L2 regularization plus dropout
  NeuralNetConfiguration netConfig = new NeuralNetConfiguration();
  netConfig.setL2(1e-3);
  final Dropout dropoutConfig = new Dropout();
  dropoutConfig.setP(0.2);
  netConfig.setDropout(dropoutConfig);
  clf.setNeuralNetConfiguration(netConfig);

  // Data: shuffle with a fixed seed, then remove 98% of the instances
  final Instances imdb = DatasetLoader.loadImdb();
  imdb.randomize(new Random(42));
  RemovePercentage removeFilter = new RemovePercentage();
  removeFilter.setInputFormat(imdb);
  removeFilter.setPercentage(98);
  final Instances reduced = Filter.useFilter(imdb, removeFilter);

  TestUtil.holdout(clf, reduced);
}
 
Example 15
Source File: CDTClassifierEvaluation.java    From NLIWOD with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Cross-validates the PSt classifier on the QALD-6 log data and prints, per fold and averaged over
 * all folds, the macro precision, recall and F-measure obtained when each question is routed to the
 * system the classifier scores highest, next to the F-measure of always asking one fixed system.
 *
 * @param args unused
 * @throws Exception if the data cannot be read or the classifier fails
 */
public static void main(String[] args) throws Exception {
	/*
	 * For multilabel classification:
	 */

	// load the data; try-with-resources makes sure the file handle is released
	Path datapath = Paths.get("./src/main/resources/old/Qald6Logs.arff");
	Instances data;
	try (BufferedReader reader = new BufferedReader(new FileReader(datapath.toString()))) {
		ArffReader arff = new ArffReader(reader);
		data = arff.getData();
	}
	data.setClassIndex(6);

	// randomize data
	long seed = System.currentTimeMillis();
	int folds = 100;

	// baseline: the single system that is always asked
	String qasystem = "KWGAnswer";

	Random rand = new Random(seed);
	Instances randData = new Instances(data);
	randData.randomize(rand);
	ArrayList<String> systems = Lists.newArrayList("KWGAnswer", "NbFramework", "PersianQA", "SemGraphQA", "UIQA_withoutManualEntries", "UTQA_English");
	final int numSystems = systems.size();

	// perform cross-validation; primitive accumulators avoid boxing on every update
	double foldavep = 0.0;
	double foldaver = 0.0;
	double foldavef = 0.0;
	double foldsys = 0.0;

	for (int n = 0; n < folds; n++) {
		Instances train = randData.trainCV(folds, n);
		Instances test = randData.testCV(folds, n);
		// build and evaluate classifier
		PSt pst = new PSt();
		pst.buildClassifier(train);
		float ave_p = 0;
		float ave_r = 0;
		float sysp = 0;
		float sysr = 0;

		for (int j = 0; j < test.size(); j++) {
			Instance ins = test.get(j);
			double[] confidences = pst.distributionForInstance(ins);

			// pick the label with the highest confidence
			int argmax = -1;
			double max = -1;
			for (int i = 0; i < numSystems; i++) {
				if (confidences[i] > max) {
					max = confidences[i];
					argmax = i;
				}
			}
			// labels are mapped onto the system list in reverse order
			String sys2ask = systems.get(numSystems - argmax - 1);

			// NOTE(review): j is the position inside the shuffled test fold; presumably the
			// per-system lists are indexed by original question order -- verify the lookup.
			ave_p += Float.parseFloat(loadSystemP(sys2ask).get(j));
			ave_r += Float.parseFloat(loadSystemR(sys2ask).get(j));
			sysp += Float.parseFloat(loadSystemP(qasystem).get(j));
			// Baseline recall must come from the baseline system; the original read it
			// from sys2ask, mixing the routed system into the baseline F-measure.
			sysr += Float.parseFloat(loadSystemR(qasystem).get(j));
		}
		double p = ave_p / test.size();
		double r = ave_r / test.size();
		double syspave = sysp / test.size();
		double sysrave = sysr / test.size();
		// F-measure is defined as 0 when precision and recall are both 0; without the
		// guard 0/0 yields NaN, which would turn every fold average into NaN.
		double sysfmeasure = (sysrave + syspave) == 0 ? 0 : 2 * sysrave * syspave / (sysrave + syspave);
		System.out.println(" RESULT FOR FOLD " + n);
		System.out.println("macro P : " + p);
		System.out.println("macro R : " + r);
		double fmeasure = (p + r) == 0 ? 0 : 2 * p * r / (p + r);
		System.out.println("macro F : " + fmeasure + '\n');
		foldavep += p / folds;
		foldaver += r / folds;
		foldavef += fmeasure / folds;
		foldsys += sysfmeasure / folds;
	}
	System.out.println(" RESULT FOR CV ");
	System.out.println("macro aveP : " + foldavep);
	System.out.println("macro aveR : " + foldaver);
	System.out.println("macro aveF : " + foldavef);
	System.out.println("macro aveF " + qasystem + " : " + foldsys);
}
 
Example 16
Source File: PropositionalToMultiInstance.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Signify that this batch of input to the filter is finished. 
 * If the filter requires all instances prior to filtering,
 * output() may now be called to retrieve the filtered instances.
 * <p>
 * The buffered input is sorted on attribute 0 (the bag ID) and runs of
 * instances sharing a bag ID are grouped into one bag each, which is
 * handed to addBag together with the bag's class value and summed weight.
 * The output is optionally shuffled before being pushed to the output queue.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 */
public boolean batchFinished() {

  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  Instances input = getInputFormat();
  input.sort(0);   // make sure that bagID is sorted
  Instances output = getOutputFormat();
  // header of the relational attribute (index 1) that holds a bag's instances
  Instances bagInsts = output.attribute(1).relation();
  // single scratch instance, refilled on every iteration of the loop below
  Instance inst = new DenseInstance(bagInsts.numAttributes());
  inst.setDataset(bagInsts);

  // state of the bag currently being assembled, seeded from the first instance
  // NOTE(review): instance(0) is read unconditionally -- presumably the input
  // is never empty here; confirm against callers
  double bagIndex   = input.instance(0).value(0);
  double classValue = input.instance(0).classValue(); 
  double bagWeight  = 0.0;

  // Convert pending input instances
  for(int i = 0; i < input.numInstances(); i++) {
    double currentBagIndex = input.instance(i).value(0);

    // copy the propositional instance value, except the bagIndex and the class value
    // (input attributes 1 .. numAttributes-2 land in inst at 0 .. numAttributes-3;
    // assumes the class is the last input attribute -- TODO confirm)
    for (int j = 0; j < input.numAttributes() - 2; j++) 
      inst.setValue(j, input.instance(i).value(j + 1));
    inst.setWeight(input.instance(i).weight());

    if (currentBagIndex == bagIndex){
      // still inside the current bag
      bagInsts.add(inst);
      bagWeight += inst.weight();
    }
    else{
      // bag ID changed: emit the finished bag, then start a new one with this instance
      addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight);

      bagInsts   = bagInsts.stringFreeStructure();  
      bagInsts.add(inst);
      bagIndex   = currentBagIndex;
      classValue = input.instance(i).classValue();
      bagWeight  = inst.weight();
    }
  }

  // reach the last instance, create and add the last bag
  addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight);

  // optional shuffle of the bags, seeded from getSeed()
  if (getRandomize())
    output.randomize(new Random(getSeed()));
  
  // queue every output instance for retrieval
  for (int i = 0; i < output.numInstances(); i++)
    push(output.instance(i));
  
  // Free memory
  flushInput();

  m_NewBatch = true;
  m_FirstBatchDone = true;
  
  return (numPendingOutput() != 0);
}
 
Example 17
Source File: CVParameterSelection.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Generates the classifier.
 * <p>
 * With no parameters to optimise, the base classifier is simply built on
 * the shuffled training data with its initial options. Otherwise the
 * parameters are searched by cross-validation and the base classifier is
 * rebuilt on the training data with the best options found.
 *
 * @param instances set of instances serving as training data 
 * @throws Exception if the classifier has not been generated successfully
 */
public void buildClassifier(Instances instances) throws Exception {

  // can classifier handle the data?
  getCapabilities().testWithFail(instances);

  // work on a copy without the instances whose class is missing
  final Instances cleaned = new Instances(instances);
  cleaned.deleteWithMissingClass();

  final boolean hasOptions = m_Classifier instanceof OptionHandler;
  if (!hasOptions) {
    throw new IllegalArgumentException("Base classifier should be OptionHandler.");
  }

  m_InitOptions = ((OptionHandler)m_Classifier).getOptions();
  m_BestPerformance = -99;
  m_NumAttributes = cleaned.numAttributes();

  final Random rng = new Random(m_Seed);
  cleaned.randomize(rng);
  // size of the training portion of the first fold
  m_TrainFoldSize = cleaned.trainCV(m_NumFolds, 0).numInstances();

  // Nothing to optimise: build with the initial options and stop here
  final boolean nothingToTune = m_CVParams.size() == 0;
  if (nothingToTune) {
     m_Classifier.buildClassifier(cleaned);
     m_BestClassifierOptions = m_InitOptions;
     return;
  }

  if (cleaned.classAttribute().isNominal()) {
    cleaned.stratify(m_NumFolds);
  }
  m_BestClassifierOptions = null;

  // Set up m_ClassifierOptions -- take getOptions() and remove
  // those being optimised.
  m_ClassifierOptions = ((OptionHandler)m_Classifier).getOptions();
  for (int p = 0; p < m_CVParams.size(); p++) {
    final CVParameter tuned = (CVParameter)m_CVParams.elementAt(p);
    Utils.getOption(tuned.m_ParamChar, m_ClassifierOptions);
  }
  findParamsByCrossValidation(0, cleaned, rng);

  // Apply a copy of the best options and build the final model
  final String [] bestOptions = (String [])m_BestClassifierOptions.clone();
  ((OptionHandler)m_Classifier).setOptions(bestOptions);
  m_Classifier.buildClassifier(cleaned);
}
 
Example 18
Source File: RaceSearch.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Searches the attribute subset space by racing cross validation
 * errors of competing subsets
 * <p>
 * The evaluator must be a supervised, hold-out, error-based subset
 * evaluator. The race itself is delegated to the routine matching
 * m_raceType; for any other race type the result is null.
 *
 * @param ASEval the attribute evaluator to guide the search
 * @param data the training instances.
 * @return an array (not necessarily ordered) of selected attribute indexes
 * @throws Exception if the search can't be completed
 */
public int[] search (ASEvaluation ASEval, Instances data)
  throws Exception {
  // --- evaluator must satisfy all four requirements ---
  final boolean isSubsetEvaluator = ASEval instanceof SubsetEvaluator;
  if (!isSubsetEvaluator) {
    throw  new Exception(ASEval.getClass().getName() 
                         + " is not a " 
                         + "Subset evaluator! (RaceSearch)");
  }

  final boolean isUnsupervised = ASEval instanceof UnsupervisedSubsetEvaluator;
  if (isUnsupervised) {
    throw new Exception("Can't use an unsupervised subset evaluator "
                        +"(RaceSearch).");
  }

  final boolean isHoldOut = ASEval instanceof HoldOutSubsetEvaluator;
  if (!isHoldOut) {
    throw new Exception("Must use a HoldOutSubsetEvaluator, eg. "
                        +"weka.attributeSelection.ClassifierSubsetEval "
                        +"(RaceSearch)");
  }

  final boolean isErrorBased = ASEval instanceof ErrorBasedMeritEvaluator;
  if (!isErrorBased) {
    throw new Exception("Only error based subset evaluators can be used, "
                        +"eg. weka.attributeSelection.ClassifierSubsetEval "
                        +"(RaceSearch)");
  }

  // --- work on a copy without the instances whose class is missing ---
  m_Instances = new Instances(data);
  m_Instances.deleteWithMissingClass();
  if (m_Instances.numInstances() == 0) {
    throw new Exception("All train instances have missing class! (RaceSearch)");
  }
  if (m_rankingRequested && m_numToSelect > m_Instances.numAttributes()-1) {
    throw new Exception("More attributes requested than exist in the data "
                        +"(RaceSearch).");
  }

  m_theEvaluator = (HoldOutSubsetEvaluator)ASEval;
  m_numAttribs = m_Instances.numAttributes();
  m_classIndex = m_Instances.classIndex();

  if (m_rankingRequested) {
    // one row per attribute other than the class
    m_rankedAtts = new double[m_numAttribs-1][2];
    m_rankedSoFar = 0;
  }

  // leave-one-out uses one fold per instance, anything else uses 10 folds
  m_numFolds = (m_xvalType == LEAVE_ONE_OUT) ? m_Instances.numInstances() : 10;

  final Random rng = new Random(1); // fixed seed; arguably this should be a parameter
  m_Instances.randomize(rng);

  // --- dispatch on the configured race type ---
  int [] selected = null;
  if (m_raceType == FORWARD_RACE || m_raceType == BACKWARD_RACE) {
    selected = hillclimbRace(m_Instances, rng);
  } else if (m_raceType == SCHEMATA_RACE) {
    selected = schemataRace(m_Instances, rng);
  } else if (m_raceType == RANK_RACE) {
    selected = rankRace(m_Instances, rng);
  }

  return selected;
}
 
Example 19
Source File: Dl4jMlpTest.java    From wekaDeeplearning4j with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Holdout test of a text CNN: three convolution layers over word
 * embeddings, global pooling and an output layer, trained on 2% of the
 * shuffled IMDB data.
 * The batch normalization layers are instantiated but left out of the
 * layer stack set on the classifier.
 */
@Test
public void testTextCnnClassification() throws Exception {
  // Iterator that turns text into sequences of word embeddings
  CnnTextEmbeddingInstanceIterator embeddingIter = new CnnTextEmbeddingInstanceIterator();
  embeddingIter.setTrainBatchSize(128);
  embeddingIter.setWordVectorLocation(DatasetLoader.loadGoogleNewsVectors());
  clf.setInstanceIterator(embeddingIter);

  // Look up the embedding dimension via the first word of the vocabulary
  embeddingIter.initialize();
  final WordVectors vectors = embeddingIter.getWordVectors();
  final double[] sampleVector = vectors.getWordVector(vectors.vocab().wordAtIndex(0));
  int vectorSize = sampleVector.length;

  // First convolution: kernel height 4
  ConvolutionLayer firstConv = new ConvolutionLayer();
  firstConv.setKernelSize(new int[]{4, vectorSize});
  firstConv.setNOut(10);
  firstConv.setStride(new int[]{1, vectorSize});
  firstConv.setConvolutionMode(ConvolutionMode.Same);
  firstConv.setActivationFunction(new ActivationReLU());

  BatchNormalization firstNorm = new BatchNormalization();

  // Second convolution: kernel height 3
  ConvolutionLayer secondConv = new ConvolutionLayer();
  secondConv.setKernelSize(new int[]{3, vectorSize});
  secondConv.setNOut(10);
  secondConv.setStride(new int[]{1, vectorSize});
  secondConv.setConvolutionMode(ConvolutionMode.Same);
  secondConv.setActivationFunction(new ActivationReLU());

  BatchNormalization secondNorm = new BatchNormalization();

  // Third convolution: kernel height 2
  ConvolutionLayer thirdConv = new ConvolutionLayer();
  thirdConv.setKernelSize(new int[]{2, vectorSize});
  thirdConv.setNOut(10);
  thirdConv.setStride(new int[]{1, vectorSize});
  thirdConv.setConvolutionMode(ConvolutionMode.Same);
  thirdConv.setActivationFunction(new ActivationReLU());

  BatchNormalization thirdNorm = new BatchNormalization();

  GlobalPoolingLayer globalPooling = new GlobalPoolingLayer();

  OutputLayer outputLayer = new OutputLayer();

  // Assemble the network and its training set-up
  clf.setLayers(firstConv, secondConv, thirdConv, globalPooling, outputLayer);
  clf.setCacheMode(CacheMode.MEMORY);
  final EpochListener listener = new EpochListener();
  listener.setN(1);
  clf.setIterationListener(listener);

  clf.setEarlyStopping(new EarlyStopping(10, 15));
  clf.setDebug(true);

  // Regularization: L2 penalty and dropout
  NeuralNetConfiguration configuration = new NeuralNetConfiguration();
  configuration.setL2(1e-3);
  final Dropout dropoutLayer = new Dropout();
  dropoutLayer.setP(0.2);
  configuration.setDropout(dropoutLayer);
  clf.setNeuralNetConfiguration(configuration);

  // Data: fixed-seed shuffle, then drop 98% of the instances
  final Instances reviews = DatasetLoader.loadImdb();
  reviews.randomize(new Random(42));
  RemovePercentage percentageFilter = new RemovePercentage();
  percentageFilter.setInputFormat(reviews);
  percentageFilter.setPercentage(98);
  final Instances subset = Filter.useFilter(reviews, percentageFilter);

  TestUtil.holdout(clf, subset);
}
 
Example 20
Source File: Sampling.java    From tsml with GNU General Public License v3.0 2 votes vote down vote up
/** 
 * Randomize the dataset.
 * The instances are shuffled in place with a freshly created, unseeded
 * {@link Random}, so the resulting order differs between calls.
 *
 * @param data the dataset to shuffle; it is modified directly
 * @return the same dataset object that was passed in, now shuffled
 */
public static Instances random(Instances data) {
	final Random rng = new Random();
	data.randomize(rng);
	return data;
}