weka.core.AttributeStats Java Examples

The following examples show how to use weka.core.AttributeStats. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: Cobweb.java From tsml with GNU General Public License v3.0

5 votes

/**
 * Folds the supplied instance into the per-attribute statistics, or takes
 * its contribution back out again.
 *
 * @param updateInstance the instance whose values and weight are applied
 * @param delete true to remove the instance's contribution from the
 * statistics, false to add it
 */
protected void updateStats(Instance updateInstance,
                           boolean delete) {

  // Allocate one statistics holder per attribute on first use.
  if (m_attStats == null) {
    m_attStats = new AttributeStats[m_numAttributes];
    for (int att = 0; att < m_numAttributes; att++) {
      AttributeStats holder = new AttributeStats();
      m_attStats[att] = holder;
      if (m_clusterInstances.attribute(att).isNominal()) {
        holder.nominalCounts =
          new int[m_clusterInstances.attribute(att).numValues()];
      } else {
        holder.numericStats = new Stats();
      }
    }
  }

  // Signed weight: negative when the instance is being removed.
  final double weight = updateInstance.weight();
  final double delta = delete ? (-1.0 * weight) : weight;

  for (int att = 0; att < m_numAttributes; att++) {
    if (updateInstance.isMissing(att)) {
      continue; // missing values contribute nothing
    }
    final double value = updateInstance.value(att);
    if (m_clusterInstances.attribute(att).isNominal()) {
      // Compound assignment narrows the double delta back to int.
      m_attStats[att].nominalCounts[(int) value] += delta;
      m_attStats[att].totalCount += delta;
    } else if (delete) {
      m_attStats[att].numericStats.subtract(value, weight);
    } else {
      m_attStats[att].numericStats.add(value, weight);
    }
  }

  m_totalInstances += delta;
}

Example #2

Source File: Chopper.java From collective-classification-weka-package with GNU General Public License v3.0

5 votes

/**
 * Builds the classifier: determines the class distribution of the training
 * set, computes how many test instances are added per iteration, and then
 * runs the fold loop, rebuilding the training set and the classifier in
 * each fold until all folds are done or the cut-off fold is reached.
 * 
 * @throws Exception	if something goes wrong
 */
@Override
protected void build() throws Exception {
  AttributeStats        stats;
  int                   i;
  
  // determine class distribution (two slots: the first two class labels)
  m_ClassDistribution = new double[2];
  stats = m_Trainset.attributeStats(m_Trainset.classIndex());
  for (i = 0; i < 2; i++) {
    // The counts are integral, so promote to double before dividing;
    // otherwise the ratio is truncated to 0 or 1. An empty total yields 0
    // instead of a division by zero.
    if (stats.totalCount > 0)
      m_ClassDistribution[i] =   (double) stats.nominalCounts[i] 
                               / (double) stats.totalCount;
    else
      m_ClassDistribution[i] = 0;
  }

  // the number of instances added to the training set in each iteration
  m_InstancesPerIteration =   (double) m_Testset.numInstances() 
                            / getFolds();
  if (getDebug())
    System.out.println("InstancesPerIteration: " + m_InstancesPerIteration);

  // build classifier
  m_Random = new Random(getSeed());
  // note: the loop runs from fold 0 up to and including getFolds()
  for (i = 0; i <= getFolds(); i++) {
    if (getVerbose() || getDebug()) {
      if (getCutOff() > 0)
        System.out.println(   "\nFold " + i + "/" + getFolds() 
                            + " (CutOff at " + getCutOff() + ")");
      else
        System.out.println("\nFold " + i + "/" + getFolds());
    }
    buildTrainSet(i);
    buildClassifier();
    
    // cutoff of folds reached? (fold 0 never triggers the cut-off)
    if ( (i > 0) && (i == getCutOff()) )
      break;
  }
}

Example #3

Source File: DecisionTreeNode.java From collective-classification-weka-package with GNU General Public License v3.0

5 votes

/**
 * Derives the class probabilities from the class-label counts of the
 * supplied data and stores them in this node.
 * 
 * @param data	the data whose class attribute supplies the counts
 */
public void setClassProbabilities(Instances data) {
  final AttributeStats classStats = data.attributeStats(data.classIndex());
  final int[] labelCounts = classStats.nominalCounts;
  final int labelTotal = Utils.sum(labelCounts);

  // Publish the array first, then fill it in through the local alias.
  final double[] probs = new double[data.classAttribute().numValues()];
  m_ClassProbs = probs;

  int label = 0;
  while (label < probs.length) {
    probs[label] = (double) labelCounts[label] / (double) labelTotal;
    label++;
  }
}

Example #4

Source File: CollectiveInstances.java From collective-classification-weka-package with GNU General Public License v3.0

5 votes

/**
 * Assigns random class labels to a range of instances, drawing the first
 * class label with the relative frequency it has in the training set and
 * the second class label otherwise.
 *
 * @param train       the training instances supplying the class
 *                    distribution
 * @param instances   the instances whose labels are initialized
 * @param from        index of the first instance to initialize
 * @param count       the number of instances to initialize
 * @return            the initialized instances (the same object as
 *                    <code>instances</code>)
 * @throws Exception  if something goes wrong
 */
public Instances initializeLabels(Instances train, Instances instances,
                                  int from, int count)
  throws Exception {

  final int end = from + count;

  // reset flip count
  m_FlippedLabels = 0;

  // explicitly mark every label in the range as missing
  for (int idx = from; idx < end; idx++) {
    instances.instance(idx).setClassMissing();
  }

  // share of the first class label in the training data
  final AttributeStats trainStats = train.attributeStats(train.classIndex());
  final double firstClassShare =
      (double) trainStats.nominalCounts[0] / (double) trainStats.totalCount;

  // set labels: one random draw per instance decides between label 0 and 1
  final Attribute classAttribute =
      instances.attribute(instances.classIndex());
  for (int idx = from; idx < end; idx++) {
    final int label = (m_Random.nextDouble() < firstClassShare) ? 0 : 1;
    instances.instance(idx).setClassValue(classAttribute.value(label));
  }

  return instances;
}

Example #5

Source File: CobWeb.java From moa with GNU General Public License v3.0

5 votes

/**
 * Applies the supplied instance to the attribute statistics, either adding
 * its weighted values or removing them again.
 *
 * @param updateInstance the instance used for the update
 * @param delete true if the instance's values are to be removed from the
 * statistics, false if they are to be added
 */
protected void updateStats(Instance updateInstance,
        boolean delete) {

    // Lazily create one statistics record per attribute.
    if (m_attStats == null) {
        m_attStats = new AttributeStats[m_numAttributes];
        int slot = 0;
        while (slot < m_numAttributes) {
            m_attStats[slot] = new AttributeStats();
            final boolean nominal = m_clusterInstances.attribute(slot).isNominal();
            if (!nominal) {
                m_attStats[slot].numericStats = new Stats();
            } else {
                m_attStats[slot].nominalCounts =
                        new int[m_clusterInstances.attribute(slot).numValues()];
            }
            slot++;
        }
    }

    for (int attr = 0; attr < m_numAttributes; attr++) {
        if (updateInstance.isMissing(attr)) {
            continue; // nothing to record for a missing value
        }
        final double value = updateInstance.value(attr);

        if (!m_clusterInstances.attribute(attr).isNominal()) {
            // Numeric attribute: delegate to the running statistics.
            if (delete) {
                m_attStats[attr].numericStats.subtract(value,
                        updateInstance.weight());
            } else {
                m_attStats[attr].numericStats.add(value, updateInstance.weight());
            }
            continue;
        }

        // Nominal attribute: the signed weight is narrowed to int on update.
        final double change;
        if (delete) {
            change = -1.0 * updateInstance.weight();
        } else {
            change = updateInstance.weight();
        }
        m_attStats[attr].nominalCounts[(int) value] += change;
        m_attStats[attr].totalCount += change;
    }

    if (delete) {
        m_totalInstances += (-1.0 * updateInstance.weight());
    } else {
        m_totalInstances += (updateInstance.weight());
    }
}

Example #6

Source File: Apriori.java From tsml with GNU General Public License v3.0

4 votes

/**
 * Drops every attribute whose values are all missing. While scanning, it
 * also tracks the count of the most frequent attribute value and, if the
 * upper bound for minimum support is still 1.0, lowers that bound to the
 * corresponding fraction of instances.
 * 
 * @param instances the instances to scan
 * @return the filtered instances, or the input itself when no column had
 *         to be removed
 * @throws Exception if something goes wrong
 */
protected Instances removeMissingColumns(Instances instances)
    throws Exception {

  final int numInstances = instances.numInstances();
  final StringBuffer indices = new StringBuffer();
  int removed = 0;
  int highestCount = 0;

  for (int att = 0; att < instances.numAttributes(); att++) {
    final AttributeStats as = instances.attributeStats(att);

    if (m_upperBoundMinSupport == 1.0 && highestCount != numInstances) {
      // see if we can decrease this by looking for the most frequent value
      final int[] counts = as.nominalCounts;
      highestCount = Math.max(highestCount, counts[Utils.maxIndex(counts)]);
    }

    if (as.missingCount != numInstances) {
      continue;
    }

    // Column is entirely missing: record its 1-based index.
    if (indices.length() > 0) {
      indices.append(",");
    }
    indices.append(att + 1);
    removed++;
  }

  if (m_verbose) {
    System.err.println("Removed : " + removed
        + " columns with all missing " + "values.");
  }

  if (m_upperBoundMinSupport == 1.0 && highestCount != numInstances) {
    m_upperBoundMinSupport = (double) highestCount / (double) numInstances;
    if (m_verbose) {
      System.err.println("Setting upper bound min support to : "
          + m_upperBoundMinSupport);
    }
  }

  // Nothing to remove: hand back the original data untouched.
  if (indices.length() == 0) {
    return instances;
  }

  final Remove remover = new Remove();
  remover.setAttributeIndices(indices.toString());
  remover.setInvertSelection(false);
  remover.setInputFormat(instances);
  return Filter.useFilter(instances, remover);
}

Example #7

Source File: RemoveUseless.java From tsml with GNU General Public License v3.0

4 votes

/**
 * Signals that the current batch of input is complete. On the first batch
 * the attributes to drop are determined (all-missing, constant, or nominal
 * with too many distinct values; the class attribute is never dropped), a
 * Remove filter is set up for them, and the buffered instances are pushed
 * through it to the output queue.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input format is defined
 * @throws Exception if the batch cannot be processed
 */
public boolean batchFinished() throws Exception {

  if (getInputFormat() == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  if (m_removeFilter == null) {

    // establish attributes to remove from first batch
    Instances toFilter = getInputFormat();
    int[] candidates = new int[toFilter.numAttributes()];
    int dropCount = 0;

    for (int att = 0; att < toFilter.numAttributes(); att++) {
      if (att == toFilter.classIndex()) {
        continue; // skip class
      }
      AttributeStats stats = toFilter.attributeStats(att);

      // all values missing, or constant attribute
      boolean useless = stats.missingCount == toFilter.numInstances()
        || stats.distinctCount < 2;

      if (!useless && toFilter.attribute(att).isNominal()) {
        // nominal attributes that vary too much are dropped as well
        double variancePercent = (double) stats.distinctCount
          / (double) (stats.totalCount - stats.missingCount) * 100.0;
        useless = variancePercent > m_maxVariancePercentage;
      }

      if (useless) {
        candidates[dropCount++] = att;
      }
    }

    // trim the candidate buffer to the indices actually collected
    int[] finalAttsToDelete = new int[dropCount];
    System.arraycopy(candidates, 0, finalAttsToDelete, 0, dropCount);

    m_removeFilter = new Remove();
    m_removeFilter.setAttributeIndicesArray(finalAttsToDelete);
    m_removeFilter.setInvertSelection(false);
    m_removeFilter.setInputFormat(toFilter);

    for (int row = 0; row < toFilter.numInstances(); row++) {
      m_removeFilter.input(toFilter.instance(row));
    }
    m_removeFilter.batchFinished();

    Instances outputDataset = m_removeFilter.getOutputFormat();

    // restore old relation name to hide attribute filter stamp
    outputDataset.setRelationName(toFilter.relationName());

    setOutputFormat(outputDataset);

    // drain the Remove filter into this filter's output queue
    for (Instance processed = m_removeFilter.output();
         processed != null;
         processed = m_removeFilter.output()) {
      processed.setDataset(outputDataset);
      push(processed);
    }
  }
  flushInput();

  m_NewBatch = true;
  return (numPendingOutput() != 0);
}

Example #8

Source File: RemoveFrequentValues.java From tsml with GNU General Public License v3.0

4 votes

/**
 * Determines the attribute values to retain; it is always at least 1 and
 * at most the number of labels of the selected attribute. The set of
 * retained values is reset on every call, and stays empty when no data is
 * supplied or the attribute has no labels.
 * 
 * @param inst the Instances to determine the values from which are kept;
 *             may be null, in which case no values are retained
 */
public void determineValues(Instances inst) {
   int					i;
   AttributeStats		stats;
   int					attIdx;
   int					numLabels;
   int					min;
   int					max;
   int					count;
   int[]				sorted;

   // init names
   m_Values = new HashSet();
   
   // must be checked before inst is dereferenced below
   if (inst == null)
      return;
   
   m_AttIndex.setUpper(inst.numAttributes() - 1);
   attIdx = m_AttIndex.getIndex();
   
   stats     = inst.attributeStats(attIdx);
   numLabels = stats.nominalCounts.length;
   
   // no labels at all -> nothing to retain (avoids indexing an empty array)
   if (numLabels == 0)
      return;
   
   // number of values to retain
   if (m_Invert)
      count = numLabels - m_NumValues;
   else
      count = m_NumValues;
   // out of bounds? -> fix
   if (count < 1)
      count = 1;  // at least one value!
   if (count > numLabels)
      count = numLabels;  // at max the existing values
   
   // determine min/max occurrences on a sorted copy, so the per-label counts
   // stay in label order and need not be recomputed
   sorted = stats.nominalCounts.clone();
   Arrays.sort(sorted);
   if (m_LeastValues) {
      min = sorted[0];
      max = sorted[count - 1];
   }
   else {
      min = sorted[numLabels - count];
      max = sorted[numLabels - 1];
   }
   
   // add values if they are inside min/max (incl. borders) and not more than count
   for (i = 0; i < numLabels; i++) {
      if ( (stats.nominalCounts[i] >= min) && (stats.nominalCounts[i] <= max) && (m_Values.size() < count) )
         m_Values.add(inst.attribute(attIdx).value(i));
   }
}

Example #9

Source File: CollectiveForest.java From collective-classification-weka-package with GNU General Public License v3.0

4 votes

/**
 * Performs initialization and building: determines the number of features
 * to select, the class distribution of the training set and the number of
 * test instances added per iteration, builds the ranked list of test
 * instances, and then runs the fold loop until all folds are done or the
 * cut-off fold is reached.
 * 
 * @throws Exception	if something goes wrong
 */
@Override
protected void build() throws Exception {
  AttributeStats        stats;
  int                   i;
  
  // determine number of features to be selected
  // (falls back to log2(#attributes) + 1 when no positive number is set)
  m_KValue = getNumFeatures();
  if (m_KValue < 1) 
    m_KValue = (int) Utils.log2(m_Trainset.numAttributes()) + 1;

  // determine class distribution (two slots: the first two class labels)
  m_ClassDistribution = new double[2];
  stats = m_Trainset.attributeStats(m_Trainset.classIndex());
  for (i = 0; i < 2; i++) {
    // The counts are integral, so promote to double before dividing;
    // otherwise the ratio is truncated to 0 or 1.
    if (stats.totalCount > 0)
      m_ClassDistribution[i] =   (double) stats.nominalCounts[i] 
                               / (double) stats.totalCount;
    else
      m_ClassDistribution[i] = 0;
  }

  // the number of instances added to the training set in each iteration
  m_InstancesPerIteration =   (double) m_Testset.numInstances() 
                            / getFolds();
  if (getDebug())
    System.out.println("InstancesPerIteration: " + m_InstancesPerIteration);

  // build list of sorted test instances
  m_List = new RankedList(m_Testset, m_ClassDistribution);

  // build classifier
  m_Random = new Random(getSeed());
  // note: the loop runs from fold 0 up to and including getFolds()
  for (i = 0; i <= getFolds(); i++) {
    if (getVerbose()) {
      if (getCutOff() > 0)
        System.out.println(   "\nFold " + i + "/" + getFolds() 
                            + " (CutOff at " + getCutOff() + ")");
      else
        System.out.println("\nFold " + i + "/" + getFolds());
    }
    buildTrainSet(i);
    buildClassifier();
    
    // cutoff of folds reached? (fold 0 never triggers the cut-off)
    if ( (i > 0) && (i == getCutOff()) )
      break;
  }
}