Java Code Examples for weka.core.Instances#sort()

The following examples show how to use weka.core.Instances#sort(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BinC45Split.java    From tsml with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Creates a C4.5-type split on the attribute selected by m_attIndex.
 *
 * The stored split statistics are reset first. A nominal attribute is
 * passed straight to handleEnumeratedAttribute; for any other attribute
 * the data is sorted on that attribute and then passed to
 * handleNumericAttribute.
 *
 * @param trainInstances the training data; sort() is called on this
 *                       object when the split attribute is not nominal
 * @exception Exception if something goes wrong
 */
public void buildClassifier(Instances trainInstances)
     throws Exception {

  // Start from a clean slate: no subsets, no split point, no gain.
  m_numSubsets = 0;
  m_splitPoint = Double.MAX_VALUE;
  m_infoGain = 0;
  m_gainRatio = 0;

  // Nominal attributes need no ordering, so deal with them right away.
  if (trainInstances.attribute(m_attIndex).isNominal()) {
    handleEnumeratedAttribute(trainInstances);
    return;
  }

  // Everything else: order the data on the split attribute before
  // handing it to the numeric handler.
  trainInstances.sort(trainInstances.attribute(m_attIndex));
  handleNumericAttribute(trainInstances);
}
 
Example 2
Source File: NBTreeSplit.java    From tsml with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Creates a NBTree-type split on the attribute selected by m_attIndex.
 * Assumes that none of the class values is missing.
 *
 * The error count starts at zero, or at the value reported by the global
 * naive Bayes model when one is set. The complexity index is the number
 * of attribute values for a nominal attribute and 2 otherwise.
 *
 * @param trainInstances the training data; sort() is called on this
 *                       object when the split attribute is not nominal
 * @exception Exception if something goes wrong
 */
public void buildClassifier(Instances trainInstances)
     throws Exception {

  // Reset per-split state.
  m_numSubsets = 0;
  m_splitPoint = Double.MAX_VALUE;
  m_errors = 0;
  if (m_globalNB != null) {
    m_errors = m_globalNB.getErrors();
  }

  // Non-nominal case first: two-way split on data ordered by the attribute.
  if (!trainInstances.attribute(m_attIndex).isNominal()) {
    m_complexityIndex = 2;
    trainInstances.sort(trainInstances.attribute(m_attIndex));
    handleNumericAttribute(trainInstances);
    return;
  }

  // Nominal case: complexity index is the number of attribute values.
  m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
  handleEnumeratedAttribute(trainInstances);
}
 
Example 3
Source File: C45Split.java    From tsml with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Creates a C4.5-type split on the attribute selected by m_attIndex.
 * Assumes that none of the class values is missing.
 *
 * For a nominal attribute both m_complexityIndex and m_index are set to
 * the number of attribute values; otherwise they are set to 2 and 0
 * respectively and the data is sorted on the attribute first.
 *
 * @param trainInstances the training data; sort() is called on this
 *                       object when the split attribute is not nominal
 * @exception Exception if something goes wrong
 */
public void buildClassifier(Instances trainInstances)
     throws Exception {

  // Reset per-split state.
  m_numSubsets = 0;
  m_splitPoint = Double.MAX_VALUE;
  m_infoGain = 0;
  m_gainRatio = 0;

  // Non-nominal case first: two-way split on data ordered by the attribute.
  if (!trainInstances.attribute(m_attIndex).isNominal()) {
    m_complexityIndex = 2;
    m_index = 0;
    trainInstances.sort(trainInstances.attribute(m_attIndex));
    handleNumericAttribute(trainInstances);
    return;
  }

  // Nominal case: one candidate branch per attribute value.
  m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
  m_index = m_complexityIndex;
  handleEnumeratedAttribute(trainInstances);
}
 
Example 4
Source File: Discretize.java    From tsml with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Set cutpoints for a single attribute using MDL.
 *
 * The data is sorted on the attribute, the position of the first instance
 * with a missing value for it is located, and cut points are computed by
 * cutPointsForSubset over the instances before that position.
 *
 * @param index the index of the attribute to set cutpoints for
 * @param data the data to work with; sort() is called on this object
 */
protected void calculateCutPointsByMDL(int index, Instances data) {

  // Order the instances by the attribute being discretized.
  data.sort(data.attribute(index));

  // Advance to the first instance whose value is missing; ends at
  // numInstances() when no value is missing.
  // NOTE(review): relies on sort() placing missing values last - confirm.
  int firstMissing = 0;
  while (firstMissing < data.numInstances()
         && !data.instance(firstMissing).isMissing(index)) {
    firstMissing++;
  }

  m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing);
}
 
Example 5
Source File: Sampling.java    From tsml with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Reorder the dataset by class size, largest class first.
 *
 * Note that {@code data} itself is sorted by its class attribute as a
 * side effect of this call.
 *
 * @param data the dataset to reorder; must have a nominal class attribute
 *             (the per-class counts are read from nominalCounts)
 * @return a new dataset holding the instances of {@code data} grouped by
 *         class, with the classes emitted from the last to the first
 *         entry of the count-sorted order
 */
public static Instances orderByLargestClass(Instances data) {
	Instances newData = new Instances(data, data.numInstances());
	
	// get the number of class in the data
	int nbClass = data.numClasses();
	int[] instancePerClass = new int[nbClass];
	int[] labels = new int[nbClass];
	int[] classIndex = new int[nbClass];
	
	// sort the data base on its class, so each class forms one contiguous run
	data.sort(data.classAttribute());
	
	// The class statistics do not change inside the loop below, so compute
	// them once instead of once per class.
	int[] classCounts = data.attributeStats(data.classIndex()).nominalCounts;
	
	// get the number of instances per class and the offset at which each
	// class starts in the sorted data
	for (int i = 0; i < nbClass; i++) {
		instancePerClass[i] = classCounts[i];
		labels[i] = i;
		if (i > 0) {
			classIndex[i] = classIndex[i-1] + instancePerClass[i-1];
		}
	}
	
	// NOTE(review): assumes QuickSort.sort orders instancePerClass ascending
	// and applies the same permutation to labels, so that after the call
	// instancePerClass[i] is the size of class labels[i] - confirm.
	QuickSort.sort(instancePerClass, labels);
	
	// walk the sorted order backwards and copy each class run
	for (int i = nbClass-1; i >= 0; i--) {
		for (int j = 0; j < instancePerClass[i]; j++) {
			newData.add(data.instance(classIndex[labels[i]] + j));
		}
	}
	
	return newData;
}
 
Example 6
Source File: Segmenter.java    From gsn with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds a SegmentedClassifier on the given data and records its errors.
 *
 * The base classifier is obtained from Tools.getClassifierById(model) and
 * wrapped together with a DummyFilter. After training, the data is sorted
 * on attribute 0 and the result of Tools.get_errors is stored in
 * Pred_errors.
 *
 * @param i the data to train and evaluate on; sort(0) is called on it
 * @param seg the segment array passed to the SegmentedClassifier constructor
 * @return the trained segmented classifier
 * @throws Exception if filter setup, training or error computation fails
 */
public SegmentedClassifier computeErrors(Instances i,Double[] seg) throws Exception{
	Classifier baseClassifier = Tools.getClassifierById(model);

	Filter inputFilter = new DummyFilter();
	inputFilter.setInputFormat(i);

	SegmentedClassifier segmented = new SegmentedClassifier(baseClassifier, 1, seg, inputFilter);
	segmented.buildClassifier(i);

	// Errors are computed on the data ordered by its first attribute.
	i.sort(0);
	Pred_errors = Tools.get_errors(segmented, i);

	return segmented;
}
Example 7
Source File: Sampling.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Reorder the data by compactness of each class using Euclidean distance.
 *
 * For every class a compactness score is computed as the average, over the
 * instances of that class, of the squared distance to the class mean minus
 * the squared difference in the first array entry. Classes are then emitted
 * from the last to the first entry of the compactness-sorted order.
 *
 * Note that {@code data} itself is sorted by its class attribute as a
 * side effect of this call.
 *
 * @param data the dataset to reorder; must have a nominal class attribute
 *             (the per-class counts are read from nominalCounts)
 * @return a new dataset holding the instances of {@code data} grouped by class
 */
public static Instances orderByCompactClass(Instances data) {
	Instances newData = new Instances(data, data.numInstances());
	
	// get the number of class in the data
	int nbClass = data.numClasses();
	int[] instancePerClass = new int[nbClass];
	int[] labels = new int[nbClass];
	int[] classIndex = new int[nbClass];
	double[] compactness = new double[nbClass];
	
	// sort the data base on its class, so each class forms one contiguous run
	data.sort(data.classAttribute());
	
	// The class statistics do not change inside the loop below, so compute
	// them once instead of once per class.
	int[] classCounts = data.attributeStats(data.classIndex()).nominalCounts;
	
	int start = 0;
	for (int i = 0; i < nbClass; i++) {
		// size of this class and offset of its run in the sorted data
		instancePerClass[i] = classCounts[i];
		labels[i] = i;
		if (i > 0) {
			classIndex[i] = classIndex[i-1] + instancePerClass[i-1];
		}
		
		// Collect the rows of this class. Only the outer array is allocated
		// here because every row is replaced by toDoubleArray()'s result.
		int end = start + instancePerClass[i];
		double[][] dataPerClass = new double[instancePerClass[i]][];
		for (int j = start; j < end; j++) {
			dataPerClass[j - start] = data.instance(j).toDoubleArray();
		}
		
		double[] mean = arithmeticMean(dataPerClass);
		double d = 0;
		for (int j = 0; j < instancePerClass[i]; j++) {
			// Squared distance to the mean, with the contribution of array
			// entry 0 taken out again.
			// NOTE(review): presumably entry 0 is the class value and
			// euclideanDistance returns the unsquared distance - confirm.
			double temp = euclideanDistance(mean, dataPerClass[j]);
			temp *= temp;
			temp -= (mean[0] - dataPerClass[j][0]) * (mean[0] - dataPerClass[j][0]);
			d += temp;
		}
		// NOTE(review): yields NaN for a class without instances.
		compactness[i] = d / instancePerClass[i];
		start = end;
	}
	
	// NOTE(review): assumes QuickSort.sort orders compactness ascending and
	// applies the same permutation to labels - confirm.
	QuickSort.sort(compactness, labels);
	
	// instancePerClass and classIndex are still in original class order, so
	// both are looked up through labels.
	for (int i = nbClass-1; i >= 0; i--) {
		for (int j = 0; j < instancePerClass[labels[i]]; j++) {
			newData.add(data.instance(classIndex[labels[i]] + j));
		}
	}
	
	return newData;
}
 
Example 8
Source File: ResidualSplit.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
  * Selects split point for numeric attribute.
  *
  * Candidate split points are the midpoints between consecutive distinct
  * values of the attribute in a sorted copy of m_data. Each candidate is
  * assigned to m_splitPoint in turn and scored with entropyGain(); the
  * candidate with the highest gain is finally stored in m_splitPoint.
  *
  * @return true if a split point was selected, false if no candidate
  *         produced a gain greater than -Double.MAX_VALUE
  * @throws Exception if scoring a candidate fails
  */
 protected boolean getSplitPoint() throws Exception{

   // Room for the candidate split points.
   double[] candidates = new double[m_numInstances];
   int candidateCount = 0;

   // Work on a sorted copy so that m_data keeps its order.
   Instances ordered = new Instances(m_data);
   ordered.sort(ordered.attribute(m_attIndex));

   // A candidate lies halfway between each pair of neighbouring values
   // that differ.
   double previous = ordered.instance(0).value(m_attIndex);
   for (int next = 1; next < m_numInstances; next++) {
     double value = ordered.instance(next).value(m_attIndex);
     if (!Utils.eq(value, previous)) {
       candidates[candidateCount++] = (previous + value) / 2.0;
     }
     previous = value;
   }

   // Score every candidate and remember the first one with the highest gain.
   int bestSplit = -1;
   double bestGain = -Double.MAX_VALUE;
   for (int k = 0; k < candidateCount; k++) {
     m_splitPoint = candidates[k];
     double gain = entropyGain();
     if (gain > bestGain) {
       bestGain = gain;
       bestSplit = k;
     }
   }

   if (bestSplit < 0) {
     return false;
   }

   m_splitPoint = candidates[bestSplit];
   return true;
 }
 
Example 9
Source File: IsotonicRegression.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Does the actual regression.
 *
 * Fits a step function of the given attribute to the class values by
 * repeatedly pooling adjacent blocks, evaluates the result on the same
 * data, and keeps it in m_attribute / m_cuts / m_values only if its root
 * mean squared error is lower than m_minMsq.
 *
 * @param attribute the attribute to regress on
 * @param insts the data to fit and evaluate on; sort() is called on it
 * @param ascending true to require increasing block averages along the
 *                  attribute, false to require decreasing ones
 * @throws Exception if evaluating the fitted model fails
 */
protected void regress(Attribute attribute, Instances insts, boolean ascending) 
  throws Exception {

  // Sort values according to current attribute
  insts.sort(attribute);
  
  // Initialize arrays
  // values[k]  : sum of class values in block k
  // weights[k] : sum of instance weights in block k
  // cuts[k]    : boundary between block k and block k + 1
  // NOTE(review): instance(0) and the size numInstances() - 1 below
  // require at least one instance - confirm callers guarantee this.
  double[] values = new double[insts.numInstances()];
  double[] weights = new double[insts.numInstances()];
  double[] cuts = new double[insts.numInstances() - 1];
  int size = 0;
  values[0] = insts.instance(0).classValue();
  weights[0] = insts.instance(0).weight();
  for (int i = 1; i < insts.numInstances(); i++) {
    // A strictly larger attribute value starts a new block; the cut is
    // the midpoint between the two neighbouring values.
    if (insts.instance(i).value(attribute) >
        insts.instance(i - 1).value(attribute)) {
      cuts[size] = (insts.instance(i).value(attribute) +
                    insts.instance(i - 1).value(attribute)) / 2;
      size++;
    }
    values[size] += insts.instance(i).classValue();
    weights[size] += insts.instance(i).weight();
  }
  // size was the index of the last block; turn it into the block count
  size++;
  
  // While there is a pair of adjacent violators
  boolean violators;
  do {
    violators = false;
    
    // Initialize arrays
    double[] tempValues = new double[size];
    double[] tempWeights = new double[size];
    double[] tempCuts = new double[size - 1];
    
    // Merge adjacent violators
    int newSize = 0;
    tempValues[0] = values[0];
    tempWeights[0] = weights[0];
    for (int j = 1; j < size; j++) {
      // Block j is kept separate only if its ratio values/weights is
      // strictly greater (ascending) or strictly smaller (descending)
      // than that of the block currently being built.
      if ((ascending && (values[j] / weights[j] > 
                         tempValues[newSize] / tempWeights[newSize])) ||
          (!ascending && (values[j] / weights[j] < 
                          tempValues[newSize] / tempWeights[newSize]))) {
        tempCuts[newSize] = cuts[j - 1];
        newSize++;
        tempValues[newSize] = values[j];
        tempWeights[newSize] = weights[j];
      } else {
        // Otherwise pool block j into the current block and schedule
        // another pass.
        tempWeights[newSize] += weights[j];
        tempValues[newSize] += values[j];
        violators = true;
      }
    }
    newSize++;
    
    // Copy references
    values = tempValues;
    weights = tempWeights;
    cuts = tempCuts;
    size = newSize;
  } while (violators);
  
  // Compute actual predictions
  // (each block's sum of class values divided by its sum of weights)
  for (int i = 0; i < size; i++) {
    values[i] /= weights[i];
  }
  
  // Backup best instance variables
  Attribute attributeBackedup = m_attribute;
  double[] cutsBackedup = m_cuts;
  double[] valuesBackedup = m_values;
  
  // Set instance variables to values computed for this attribute
  // (the evaluation below is run on this object, so the candidate model
  // has to be installed first)
  m_attribute = attribute;
  m_cuts = cuts;
  m_values = values;
  
  // Compute sum of squared errors
  // (the value actually read is the root mean squared error, despite the
  // variable name msq)
  Evaluation eval = new Evaluation(insts);
  eval.evaluateModel(this, insts);
  double msq = eval.rootMeanSquaredError();
  
  // Check whether this is the best attribute
  if (msq < m_minMsq) {
    m_minMsq = msq;
  } else {
    // Not an improvement: restore the previously best model
    m_attribute = attributeBackedup;
    m_cuts = cutsBackedup;
    m_values = valuesBackedup;
  }
}
 
Example 10
Source File: Discretize.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
  * Set cutpoints for a single attribute using equal-frequency binning.
  *
  * Works on a sorted copy of the input format's data. The target weight
  * per interval is either getDesiredWeightOfInstancesPerInterval() (when
  * positive) or the total non-missing weight divided by m_NumBins. The
  * resulting cut points are stored in m_CutPoints[index], or null is
  * stored when no cut point was found.
  *
  * @param index the index of the attribute to set cutpoints for
  */
 protected void calculateCutPointsByEqualFrequencyBinning(int index) {

   // Copy data so that it can be sorted
   Instances data = new Instances(getInputFormat());

   // Sort input data
   data.sort(index);

   // Compute weight of instances without missing values
   // NOTE(review): stopping at the first missing value relies on sort()
   // placing missing values last - confirm.
   double sumOfWeights = 0;
   for (int i = 0; i < data.numInstances(); i++) {
     if (data.instance(i).isMissing(index)) {
       break;
     } else {
       sumOfWeights += data.instance(i).weight();
     }
   }

   // Size the cut point array for the selected mode. The array is only
   // allocated here, so m_NumBins is not touched in desired-weight mode.
   double freq;
   double[] cutPoints;
   if (getDesiredWeightOfInstancesPerInterval() > 0) {
     freq = getDesiredWeightOfInstancesPerInterval();
     cutPoints = new double[(int)(sumOfWeights / freq)];
   } else {
     freq = sumOfWeights / m_NumBins;
     cutPoints = new double[m_NumBins - 1];
   }

   // Compute break points
   // counter   : weight accumulated since the last cut point
   // last      : value of counter at the last potential break point
   // lastIndex : index of that potential break point, -1 if none
   double counter = 0, last = 0;
   int cpindex = 0, lastIndex = -1;
   for (int i = 0; i < data.numInstances() - 1; i++) {

     // Stop if value missing
     if (data.instance(i).isMissing(index)) {
       break;
     }
     counter += data.instance(i).weight();
     sumOfWeights -= data.instance(i).weight();

     // Do we have a potential breakpoint?
     if (data.instance(i).value(index) <
         data.instance(i + 1).value(index)) {

       // Have we passed the ideal size?
       if (counter >= freq) {

         // Is this break point worse than the last one?
         if (((freq - last) < (counter - freq)) && (lastIndex != -1)) {
           cutPoints[cpindex] = (data.instance(lastIndex).value(index) +
                                 data.instance(lastIndex + 1).value(index)) / 2;
           counter -= last;
           last = counter;
           lastIndex = i;
         } else {
           cutPoints[cpindex] = (data.instance(i).value(index) +
                                 data.instance(i + 1).value(index)) / 2;
           counter = 0;
           last = 0;
           lastIndex = -1;
         }
         cpindex++;

         // Spread the remaining weight evenly over the remaining intervals
         freq = (sumOfWeights + counter) / ((cutPoints.length + 1) - cpindex);
       } else {
         lastIndex = i;
         last = counter;
       }
     }
   }

   // Check whether there was another possibility for a cut point
   if ((cpindex < cutPoints.length) && (lastIndex != -1)) {
     cutPoints[cpindex] = (data.instance(lastIndex).value(index) +
                           data.instance(lastIndex + 1).value(index)) / 2;
     cpindex++;
   }

   // Did we find any cutpoints?
   if (cpindex == 0) {
     m_CutPoints[index] = null;
   } else {
     // Keep only the cut points that were actually filled in
     double[] cp = new double[cpindex];
     System.arraycopy(cutPoints, 0, cp, 0, cpindex);
     m_CutPoints[index] = cp;
   }
 }
 
Example 11
Source File: PropositionalToMultiInstance.java    From tsml with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Signify that this batch of input to the filter is finished.
 * If the filter requires all instances prior to filtering,
 * output() may now be called to retrieve the filtered instances.
 *
 * The buffered input is sorted on attribute 0 (the bag ID), consecutive
 * instances with the same bag ID are collected into one bag, and each
 * completed bag is handed to addBag. The resulting output instances are
 * optionally randomized and then pushed to the output queue.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 */
public boolean batchFinished() {

  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  // Order the input by bag ID so that each bag forms one contiguous run.
  Instances input = getInputFormat();
  input.sort(0);

  Instances output = getOutputFormat();
  Instances bagInsts = output.attribute(1).relation();

  // Scratch instance that is refilled for every input row.
  Instance member = new DenseInstance(bagInsts.numAttributes());
  member.setDataset(bagInsts);

  // State of the bag currently being assembled.
  double bagIndex   = input.instance(0).value(0);
  double classValue = input.instance(0).classValue();
  double bagWeight  = 0.0;

  int row = 0;
  while (row < input.numInstances()) {
    Instance source = input.instance(row);
    double currentBagIndex = source.value(0);

    // Copy everything between the bag ID (first) and the last attribute,
    // shifted down by one position.
    for (int col = 1; col < input.numAttributes() - 1; col++) {
      member.setValue(col - 1, source.value(col));
    }
    member.setWeight(source.weight());

    if (currentBagIndex != bagIndex) {
      // Bag ID changed: emit the finished bag and start a new one
      // with this row as its first member.
      addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight);

      bagInsts   = bagInsts.stringFreeStructure();
      bagInsts.add(member);
      bagIndex   = currentBagIndex;
      classValue = source.classValue();
      bagWeight  = member.weight();
    } else {
      // Same bag as before: just append.
      bagInsts.add(member);
      bagWeight += member.weight();
    }
    row++;
  }

  // The bag that was still open when the input ran out.
  addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight);

  if (getRandomize()) {
    output.randomize(new Random(getSeed()));
  }

  for (int k = 0; k < output.numInstances(); k++) {
    push(output.instance(k));
  }

  // Free memory
  flushInput();

  m_NewBatch = true;
  m_FirstBatchDone = true;

  return (numPendingOutput() != 0);
}
 
Example 12
Source File: SubSample.java    From gsn with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Sub-samples the data after sorting it on attribute m_index.
 *
 * When the data holds more than m_ratio instances, only the instances at
 * 1-based positions that are multiples of m_ratio are kept; otherwise a
 * full copy is returned.
 *
 * @param instances the data to sub-sample; sort() is called on this object
 * @return a copy of the sorted data, thinned out as described above
 * @throws Exception declared by the overridden method
 */
@Override
protected Instances process(Instances instances) throws Exception {

	instances.sort(m_index);
	Instances output = new Instances(instances);

	// Too few instances to thin out: hand back the sorted copy as is.
	if (instances.numInstances() <= m_ratio) {
		return output;
	}

	// Walk from the back so deletions do not shift positions still to be
	// visited; position is 1-based, the index to delete is position - 1.
	for (int position = output.numInstances(); position > 0; position--) {
		if (position % m_ratio != 0) {
			output.delete(position - 1);
		}
	}

	return output;
}