Java Code Examples for weka.core.Utils#log2()

The following examples show how to use weka.core.Utils#log2(). They are taken from open-source projects; the project and source file are noted above each example.
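For reference, Utils.log2(double) computes the base-2 logarithm of its argument, and Weka additionally exposes a public constant field Utils.log2 holding ln(2), which Example 4 below uses directly. A minimal sketch, assuming Weka is on the classpath (the class name Log2Demo is ours, not from any of the projects below):

import weka.core.Utils;

public class Log2Demo {
  public static void main(String[] args) {
    // Utils.log2(double) is the base-2 logarithm
    System.out.println(Utils.log2(8.0));   // 3.0
    System.out.println(Utils.log2(10.0));  // ~3.3219
    // Utils.log2 (the field) is the constant ln(2); dividing a
    // natural logarithm by it converts the value to bits
    System.out.println(Math.log(10.0) / Utils.log2);  // ~3.3219
  }
}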
Example 1
Source File: Evaluation.java    From tsml with GNU General Public License v3.0
/**
 * Calculate the entropy of the prior distribution.
 * 
 * @return the entropy of the prior distribution
 * @throws Exception if the class is not nominal
 */
public final double priorEntropy() throws Exception {

  if (!m_ClassIsNominal) {
    throw new Exception("Can't compute entropy of class prior: "
        + "class numeric!");
  }

  if (m_NoPriors)
    return Double.NaN;

  double entropy = 0;
  for (int i = 0; i < m_NumClasses; i++) {
    entropy -= m_ClassPriors[i] / m_ClassPriorsSum
        * Utils.log2(m_ClassPriors[i] / m_ClassPriorsSum);
  }
  return entropy;
}
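This is the Shannon entropy of the prior class distribution: H = -sum_i p_i*log2(p_i), with p_i = m_ClassPriors[i] / m_ClassPriorsSum.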
 
Example 2
Source File: Id3.java    From tsml with GNU General Public License v3.0
/**
 * Computes the entropy of a dataset.
 * 
 * @param data the data for which entropy is to be computed
 * @return the entropy of the data's class distribution
 * @throws Exception if computation fails
 */
private double computeEntropy(Instances data) throws Exception {

  double [] classCounts = new double[data.numClasses()];
  Enumeration instEnum = data.enumerateInstances();
  while (instEnum.hasMoreElements()) {
    Instance inst = (Instance) instEnum.nextElement();
    classCounts[(int) inst.classValue()]++;
  }
  double entropy = 0;
  for (int j = 0; j < data.numClasses(); j++) {
    if (classCounts[j] > 0) {
      entropy -= classCounts[j] * Utils.log2(classCounts[j]);
    }
  }
  entropy /= (double) data.numInstances();
  return entropy + Utils.log2(data.numInstances());
}
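The final two lines rely on the identity -sum_j (c_j/N)*log2(c_j/N) = log2(N) - (1/N)*sum_j c_j*log2(c_j), where c_j are the class counts and N = data.numInstances(). This lets the loop accumulate over raw counts and apply the normalization and the log2(N) correction once at the end.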
 
Example 3
Source File: CollectiveTree.java    From collective-classification-weka-package with GNU General Public License v3.0
/**
 * Initialization, building and possible iterations happen here.
 *
 * @throws Exception	if something goes wrong
 */
@Override
protected void build() throws Exception {
  // determine number of features to be selected
  m_KValue = getNumFeatures();
  if (m_KValue < 1)
    m_KValue = (int) Utils.log2(m_Trainset.numAttributes()) + 1;

  // Make sure K value is in range
  if (m_KValue > m_Trainset.numAttributes() - 1)
    m_KValue = m_Trainset.numAttributes() - 1;

  // build classifier
  m_Random = m_Trainset.getRandomNumberGenerator(getSeed());
  buildClassifier();
}
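The fallback (int) Utils.log2(m_Trainset.numAttributes()) + 1 is the usual random-forest default for the number of randomly selected features per split: for example, with 100 attributes log2(100) ≈ 6.64, so m_KValue becomes 7.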
 
Example 4
Source File: Evaluation.java    From tsml with GNU General Public License v3.0
/**
 * Updates stats for conditional density estimator based on current test
 * instance.
 * 
 * @param classifier the conditional density estimator
 * @param classMissing the instance for which density is to be computed,
 *          without a class value
 * @param classValue the class value of this instance
 * @throws Exception if density could not be computed successfully
 */
protected void updateStatsForConditionalDensityEstimator(
    ConditionalDensityEstimator classifier, Instance classMissing,
    double classValue) throws Exception {

  if (m_PriorEstimator == null) {
    setNumericPriorsFromBuffer();
  }
  m_SumSchemeEntropy -= classifier.logDensity(classMissing, classValue)
      * classMissing.weight() / Utils.log2;
  m_SumPriorEntropy -= m_PriorEstimator.logDensity(classValue)
      * classMissing.weight() / Utils.log2;
}
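Note that Utils.log2 here is the constant field (ln 2), not the log2() method: logDensity() returns a natural logarithm, and dividing by ln(2) converts it to base 2 via log2(x) = ln(x)/ln(2), so both entropy sums are accumulated in bits.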
 
Example 5
Source File: InfoGainSplitMetric.java    From tsml with GNU General Public License v3.0
@Override
public double getMetricRange(Map<String, WeightMass> preDist) {

  int numClasses = preDist.size();
  if (numClasses < 2) {
    numClasses = 2;
  }

  return Utils.log2(numClasses);
}
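log2(numClasses) is the largest entropy a distribution over numClasses outcomes can have (attained when all classes are equally likely), so it is the natural range for an information-gain split metric; clamping numClasses to at least 2 keeps the range at a minimum of one bit.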
 
Example 6
Source File: BFTree.java    From tsml with GNU General Public License v3.0
/**
 * Compute and return entropy for a given distribution of a node.
 * 
 * @param dist 	the class distribution of the node
 * @param total 	the sum of the class distribution
 * @return 		entropy of the class distributions
 */
protected double computeEntropy(double[] dist, double total) {
  if (total==0) return 0;
  double entropy = 0;
  for (int i=0; i<dist.length; i++) {
    if (dist[i]!=0) entropy -= dist[i]/total * Utils.log2(dist[i]/total);
  }
  return entropy;
}
 
Example 7
Source File: RuleStats.java    From tsml with GNU General Public License v3.0
/** 
 * The description length of the theory for a given rule. Computed as:<br>
 *                 0.5* [||k||+ S(t, k, k/t)]<br>
 * where k is the number of antecedents of the rule; t is the total number of
 * possible antecedents that could appear in a rule; ||k|| is the universal
 * prior for k, log2*(k); and S(t,k,p) = -k*log2(p)-(t-k)*log2(1-p)
 * is the subset encoding length.<p>
 *
 * For details see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param index the index of the given rule (assuming correct)
 * @return the theory DL, weighted if weight != 1.0
 */
public double theoryDL(int index){

  double k = ((Rule)m_Ruleset.elementAt(index)).size();

  if(k == 0)
    return 0.0;

  double tdl = Utils.log2(k);
  if(k > 1)                         // Approximation
    tdl += 2.0 * Utils.log2(tdl);   // of log2 star
  tdl += subsetDL(m_Total, k, k/m_Total);
  //System.out.println("!!!theory: "+MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl);
  return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
}
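As a worked example of the log2* approximation: for a rule with k = 3 antecedents, tdl starts at log2(3) ≈ 1.585, then adds 2*log2(1.585) ≈ 1.329, giving roughly 2.91 bits for ||k|| before subsetDL adds the cost of identifying which 3 of the m_Total candidate antecedents appear in the rule.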
 
Example 8
Source File: RuleStats.java    From tsml with GNU General Public License v3.0
/** 
 * The description length of data given the parameters of the data
 * based on the ruleset. <p>
 * For details see Quinlan: "MDL and categorical theories (Continued)", ML95.<p>
 *
 * @param expFPOverErr expected FP/(FP+FN)
 * @param cover coverage
 * @param uncover uncoverage
 * @param fp False Positive
 * @param fn False Negative
 * @return the description length
 */
public static double dataDL(double expFPOverErr, double cover,
                            double uncover, double fp, double fn){
  double totalBits = Utils.log2(cover+uncover+1.0); // how many data?
  double coverBits, uncoverBits; // What's the error?
  double expErr;                 // Expected FP or FN

  if(Utils.gr(cover, uncover)){
    expErr = expFPOverErr*(fp+fn);
    coverBits = subsetDL(cover, fp, expErr/cover);
    uncoverBits = Utils.gr(uncover, 0.0) ?
      subsetDL(uncover, fn, fn/uncover) : 0.0;
  }
  else{
    expErr = (1.0-expFPOverErr)*(fp+fn);
    coverBits = Utils.gr(cover, 0.0) ?
      subsetDL(cover, fp, fp/cover) : 0.0;
    uncoverBits = subsetDL(uncover, fn, expErr/uncover);
  }

  /*
    System.err.println("!!!cover: " + cover + "|uncover" + uncover +
    "|coverBits: "+coverBits+"|uncBits: "+ uncoverBits+
    "|FPRate: "+expFPOverErr + "|expErr: "+expErr+
    "|fp: "+fp+"|fn: "+fn+"|total: "+totalBits);
  */
  return (totalBits + coverBits + uncoverBits);
}
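The leading term log2(cover+uncover+1.0) is the cost of stating how many instances there are (the "how many data?" comment), while the subsetDL terms (see Example 15) encode which of the covered instances are false positives and which of the uncovered ones are false negatives.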
 
Example 9
Source File: TunedRandomForest.java    From tsml with GNU General Public License v3.0
@Override
public void setParametersFromIndex(int x) {
    tuneParameters = false;
    // Three paras, evenly distributed, 1 to maxPerPara.
    // Note that if maxPerPara > numFeaturesInProblem we have a problem,
    // so an exception will be thrown later.
    paras = new int[3];
    if (x < 1 || x > maxPerPara * maxPerPara * maxPerPara) // Error, invalid range
        throw new UnsupportedOperationException(
            "ERROR parameter index " + x + " out of range for TunedRandomForest");
    int numLevelsIndex = (x - 1) / (maxPerPara * maxPerPara);
    int numFeaturesIndex = ((x - 1) / maxPerPara) % maxPerPara;
    int numTreesIndex = x % maxPerPara;
    // Need to know the number of attributes
    if (numFeaturesInProblem == 0)
        throw new RuntimeException("Error in TunedRandomForest in setParametersFromIndex: "
            + "the number of attributes is unknown; call setNumFeaturesInProblem first");
    // Para 1. Maximum tree depth, m_MaxDepth
    if (numLevelsIndex == 0)
        paras[0] = 0;
    else
        paras[0] = numLevelsIndex * (numFeaturesInProblem / maxPerPara);
    // Para 2. Num features
    if (numFeaturesIndex == 0)
        paras[1] = (int) Math.sqrt(numFeaturesInProblem);
    else if (numFeaturesIndex == 1)
        paras[1] = (int) Utils.log2(numFeaturesInProblem) + 1;
    else
        paras[1] = ((numFeaturesIndex - 1) * numFeaturesInProblem) / maxPerPara;
    // Para 3. Num trees
    if (numTreesIndex == 0)
        paras[2] = 10; // Weka default
    else
        paras[2] = 100 * numTreesIndex;
    setMaxDepth(paras[0]);
    setNumFeaturesForEachTree(paras[1]);
    setNumTrees(paras[2]);
    if (m_Debug)
        System.out.println("Index =" + x + " Num Features =" + numFeaturesInProblem
            + " Max Depth=" + paras[0] + " Num Features =" + paras[1]
            + " Num Trees =" + paras[2]);
}
 
Example 10
Source File: TSF.java    From tsml with GNU General Public License v3.0
/**
 * Parses a given list of options to set the parameters of the classifier.
 * We use this for the tuning mechanism, setting parameters through setOptions.
 <!-- options-start -->
 * Valid options are: <p/>
 * <pre> -T
 * Number of trees.</pre>
 * 
 * <pre> -I
 * Number of intervals to fit.</pre>
 * 
 <!-- options-end -->
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception{
/*  System.out.print("TSF para sets ");
    for (String str:options)
        System.out.print(","+str);
    System.out.print("\n");
*/
    String numTreesString = Utils.getOption('T', options);

    if (numTreesString.length() != 0) {
        numClassifiers = Integer.parseInt(numTreesString);
    }

    String numFeaturesString = Utils.getOption('I', options);
    // Options here are a double between 0 and 1 (proportion of features),
    // a text string sqrt or log, or an integer number
    if (numFeaturesString.length() != 0){
        try{
            if(numFeaturesString.equals("sqrt"))
                numIntervalsFinder = (numAtts) -> (int)(Math.sqrt(numAtts));
            else if(numFeaturesString.equals("log"))
                numIntervalsFinder = (numAtts) -> (int) Utils.log2(numAtts) + 1;
            else{
                double d = Double.parseDouble(numFeaturesString);
                if(d <= 0)
                    throw new Exception("proportion of features out of range 0 to 1");
                if(d <= 1)
                    numIntervalsFinder = (numAtts) -> (int)(d*numAtts);
                else
                    numIntervalsFinder = (numAtts) -> (int)(d);
            }
        }catch(Exception e){
            System.err.print(" Error: invalid parameter passed to TSF setOptions"
                + " for number of intervals. Setting to default. ");
            System.err.print("Value: " + numFeaturesString
                + ". Permissible values: sqrt, log, or a double in range 0...1");
            numIntervalsFinder = (numAtts) -> (int)(Math.sqrt(numAtts));
        }
    }
    else
        numIntervalsFinder = (numAtts) -> (int)(Math.sqrt(numAtts));
}
 
Example 11
Source File: C45Split.java    From tsml with GNU General Public License v3.0
/**
 * Returns coding cost for split (used in rule learner).
 */
public final double codingCost() {

  return Utils.log2(m_index);
}
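m_index here counts the split points of the C4.5 split, so log2(m_index) is simply the number of bits needed to identify which split point was chosen.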
 
Example 12
Source File: Discretize.java    From tsml with GNU General Public License v3.0
/**
 * Test using Kononenko's MDL criterion.
 *
 * @param priorCounts the class counts prior to splitting
 * @param bestCounts the class counts of the best split, per subset
 * @param numInstances the number of instances
 * @param numCutPoints the number of candidate cut points
 * @return true if the split is acceptable
 */
private boolean KononenkosMDL(double[] priorCounts,
                              double[][] bestCounts,
                              double numInstances,
                              int numCutPoints) {

  double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
  double before, after;
  int numClassesTotal;

  // Number of classes occurring in the set
  numClassesTotal = 0;
  for (int i = 0; i < priorCounts.length; i++) {
    if (priorCounts[i] > 0) {
      numClassesTotal++;
    }
  }

  // Encode distribution prior to split
  distPrior = SpecialFunctions.log2Binomial(numInstances + numClassesTotal - 1,
                                            numClassesTotal - 1);

  // Encode instances prior to split.
  instPrior = SpecialFunctions.log2Multinomial(numInstances, priorCounts);

  before = instPrior + distPrior;

  // Encode distributions and instances after split.
  for (int i = 0; i < bestCounts.length; i++) {
    sum = Utils.sum(bestCounts[i]);
    distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1,
                                               numClassesTotal - 1);
    instAfter += SpecialFunctions.log2Multinomial(sum, bestCounts[i]);
  }

  // Coding cost after split
  after = Utils.log2(numCutPoints) + distAfter + instAfter;

  // Check if split is to be accepted
  return (before > after);
}
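The split is accepted only when before > after, i.e. when the post-split encoding, including the log2(numCutPoints) bits needed to transmit which cut point was used, is strictly cheaper than encoding the data without the split.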
 
Example 13
Source File: Discretize.java    From tsml with GNU General Public License v3.0
/**
 * Test using Fayyad and Irani's MDL criterion.
 *
 * @param priorCounts the class counts prior to splitting
 * @param bestCounts the class counts of the best split, per subset
 * @param numInstances the number of instances
 * @param numCutPoints the number of candidate cut points
 * @return true if the split is acceptable
 */
private boolean FayyadAndIranisMDL(double[] priorCounts,
                                   double[][] bestCounts,
                                   double numInstances,
                                   int numCutPoints) {

  double priorEntropy, entropy, gain;
  double entropyLeft, entropyRight, delta;
  int numClassesTotal, numClassesRight, numClassesLeft;

  // Compute entropy before split.
  priorEntropy = ContingencyTables.entropy(priorCounts);

  // Compute entropy after split.
  entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);

  // Compute information gain.
  gain = priorEntropy - entropy;

  // Number of classes occurring in the set
  numClassesTotal = 0;
  for (int i = 0; i < priorCounts.length; i++) {
    if (priorCounts[i] > 0) {
      numClassesTotal++;
    }
  }

  // Number of classes occurring in the left subset
  numClassesLeft = 0;
  for (int i = 0; i < bestCounts[0].length; i++) {
    if (bestCounts[0][i] > 0) {
      numClassesLeft++;
    }
  }

  // Number of classes occurring in the right subset
  numClassesRight = 0;
  for (int i = 0; i < bestCounts[1].length; i++) {
    if (bestCounts[1][i] > 0) {
      numClassesRight++;
    }
  }

  // Entropy of the left and the right subsets
  entropyLeft = ContingencyTables.entropy(bestCounts[0]);
  entropyRight = ContingencyTables.entropy(bestCounts[1]);

  // Compute terms for MDL formula
  delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) -
    (((double) numClassesTotal * priorEntropy) -
     (numClassesRight * entropyRight) -
     (numClassesLeft * entropyLeft));

  // Check if split is to be accepted
  return (gain > (Utils.log2(numCutPoints) + delta) / (double) numInstances);
}
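Here the split is kept only when the information gain beats the per-instance MDL penalty (log2(numCutPoints) + delta) / numInstances, where delta accounts for the extra class distributions that the two subsets introduce.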
 
Example 14
Source File: CollectiveForest.java    From collective-classification-weka-package with GNU General Public License v3.0
/**
 * Initialization, building and possible iterations happen here.
 * 
 * @throws Exception	if something goes wrong
 */
@Override
protected void build() throws Exception {
  AttributeStats        stats;
  int                   i;
  
  // determine number of features to be selected
  m_KValue = getNumFeatures();
  if (m_KValue < 1) 
    m_KValue = (int) Utils.log2(m_Trainset.numAttributes()) + 1;

  // determine class distribution
  m_ClassDistribution = new double[2];
  stats = m_Trainset.attributeStats(m_Trainset.classIndex());
  for (i = 0; i < 2; i++) {
    if (stats.totalCount > 0)
      m_ClassDistribution[i] = stats.nominalCounts[i] / stats.totalCount;
    else
      m_ClassDistribution[i] = 0;
  }

  // the number of instances added to the training set in each iteration
  m_InstancesPerIteration =   (double) m_Testset.numInstances() 
                            / getFolds();
  if (getDebug())
    System.out.println("InstancesPerIteration: " + m_InstancesPerIteration);

  // build list of sorted test instances
  m_List = new RankedList(m_Testset, m_ClassDistribution);

  // build classifier
  m_Random = new Random(getSeed());
  for (i = 0; i <= getFolds(); i++) {
    if (getVerbose()) {
      if (getCutOff() > 0)
        System.out.println(   "\nFold " + i + "/" + getFolds() 
                            + " (CutOff at " + getCutOff() + ")");
      else
        System.out.println("\nFold " + i + "/" + getFolds());
    }
    buildTrainSet(i);
    buildClassifier();
    
    // cutoff of folds reached?
    if ( (i > 0) && (i == getCutOff()) )
      break;
  }
}
 
Example 15
Source File: RuleStats.java    From tsml with GNU General Public License v3.0
/**
 * Subset description length: <br>
 * S(t,k,p) = -k*log2(p)-(t-k)*log2(1-p)
 *
 * For details see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param t the number of elements in a known set
 * @param k the number of elements in a subset
 * @param p the expected proportion of subset known by recipient
 * @return the subset description length
 */
public static double subsetDL(double t, double k, double p){
  double rt = Utils.gr(p, 0.0) ? (- k*Utils.log2(p)) : 0.0;
  rt -= (t-k)*Utils.log2(1-p);
  return rt;
}
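For example, with t = 10, k = 3 and p = 0.3: S = -3*log2(0.3) - 7*log2(0.7) ≈ 3*1.737 + 7*0.515 ≈ 8.81 bits.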
 
Example 16
Source File: PMILexiconExpander.java    From AffectiveTweets with GNU General Public License v3.0
@Override
protected Instances process(Instances instances) throws Exception {

  Instances result = getOutputFormat();

  this.calculateWordCounts(instances);

  String[] sortedWords = this.wordInfo.keySet().toArray(new String[0]);
  Arrays.sort(sortedWords);

  for (String word : sortedWords) {
    WordCount wordCount = this.wordInfo.get(word);

    if (wordCount.posCount + wordCount.negCount >= this.minFreq) {

      double posProb = wordCount.posCount / posCount;
      double negProb = wordCount.negCount / negCount;
      double semanticOrientation = Utils.log2(posProb) - Utils.log2(negProb);

      double[] values = new double[result.numAttributes()];

      int wordNameIndex = result.attribute("WORD_NAME").index();
      values[wordNameIndex] = result.attribute(wordNameIndex).addStringValue(word);

      values[result.numAttributes() - 1] = semanticOrientation;

      Instance inst = new DenseInstance(1, values);
      inst.setDataset(result);
      result.add(inst);
    }
  }

  return result;
}
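The quantity log2(posProb) - log2(negProb) equals log2(posProb/negProb), so the semantic orientation is a log-odds score in bits: positive when the word occurs proportionally more often in positive tweets than in negative ones, and negative otherwise.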