Java Code Examples for org.apache.commons.math.stat.descriptive.DescriptiveStatistics#addValue()

The following examples show how to use org.apache.commons.math.stat.descriptive.DescriptiveStatistics#addValue() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: HeatMapTask.java    From mzmine3 with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Scales each column of the peak list in place by dividing every value by
 * that column's standard deviation, computed over the finite values only.
 * Columns with zero standard deviation (constant columns) are left unscaled.
 *
 * @param peakList matrix of peak values, indexed [column][row]; modified in place
 */
private void scale(double[][] peakList) {
  DescriptiveStatistics stdDevStats = new DescriptiveStatistics();

  for (int col = 0; col < peakList.length; col++) {
    stdDevStats.clear();
    // Accumulate only finite values so NaN/Infinity do not poison the std dev.
    for (int row = 0; row < peakList[col].length; row++) {
      double value = peakList[col][row];
      if (!Double.isInfinite(value) && !Double.isNaN(value)) {
        stdDevStats.addValue(value);
      }
    }

    double stdDev = stdDevStats.getStandardDeviation();

    // stdDev is loop-invariant, so check it once instead of per element.
    if (stdDev != 0) {
      for (int row = 0; row < peakList[col].length; row++) {
        peakList[col][row] = peakList[col][row] / stdDev;
      }
    }
  }
}
 
Example 2
Source File: StatUtilsTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Run with 77 random values, assuming that the outcome has a mean of 0 and a standard deviation of 1 with a
 * precision of 1E-10.
 */

@Test
public void testNormalize2() {
    // create a sample with 77 random values in [0, 1)
    int length = 77;
    double sample[] = new double[length];
    for (int i = 0; i < length; i++) {
        sample[i] = Math.random();
    }
    // normalize (standardize) this sample
    double standardizedSample[] = StatUtils.normalize(sample);

    DescriptiveStatistics stats = new DescriptiveStatistics();
    // Add the data from the array
    for (int i = 0; i < length; i++) {
        stats.addValue(standardizedSample[i]);
    }
    // the calculations do have a limited precision
    double distance = 1E-10;
    // check the mean and standard deviation
    Assert.assertEquals(0.0, stats.getMean(), distance);
    Assert.assertEquals(1.0, stats.getStandardDeviation(), distance);

}
 
Example 3
Source File: CollectionUtils.java    From argument-reasoning-comprehension-task with Apache License 2.0 6 votes vote down vote up
/**
 * Converts a {@link Frequency} table into a {@link DescriptiveStatistics}
 * instance by adding each numeric value once per its recorded count.
 * Non-numeric values are skipped entirely.
 *
 * @param frequency frequency table to convert
 * @return statistics over all numeric occurrences in {@code frequency}
 */
public static DescriptiveStatistics frequencyToStatistics(Frequency frequency)
{
    Iterator<Comparable<?>> comparableIterator = frequency.valuesIterator();
    DescriptiveStatistics result = new DescriptiveStatistics();
    while (comparableIterator.hasNext()) {
        Comparable<?> next = comparableIterator.next();

        // Check the type once, outside the count loop: non-numbers contribute
        // nothing, so there is no point iterating over their count.
        if (next instanceof Number) {
            double value = ((Number) next).doubleValue();
            long count = frequency.getCount(next);

            // Use a long loop counter: getCount returns long, and the original
            // "int i < long count" comparison could loop forever for huge counts.
            for (long i = 0; i < count; i++) {
                result.addValue(value);
            }
        }
    }

    return result;
}
 
Example 4
Source File: StatUtils.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Normalize (standardize) the series, so in the end it is having a mean of 0 and a standard deviation of 1.
 *
 * @param sample Sample to normalize.
 * @return normalized (standardized) sample.
 * @since 2.2
 */
public static double[] normalize(final double[] sample) {
    // Accumulate the raw data to obtain its mean and standard deviation.
    final DescriptiveStatistics stats = new DescriptiveStatistics();
    for (final double value : sample) {
        stats.addValue(value);
    }

    final double mean = stats.getMean();
    final double standardDeviation = stats.getStandardDeviation();

    // Map each value to its z-score: z = (x - mean) / standardDeviation.
    final double[] standardizedSample = new double[sample.length];
    for (int i = 0; i < standardizedSample.length; i++) {
        standardizedSample[i] = (sample[i] - mean) / standardDeviation;
    }
    return standardizedSample;
}
 
Example 5
Source File: SimpleRPNDHeterogeneousReductionStumpExperimentRunner.java    From AILibs with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Runs a single one-step reduction experiment, aggregates the MCCV results
 * (error rate and training time), and writes min/max/mean/std-dev summary
 * statistics back to the experiment record.
 *
 * @param exp experiment to conduct and update
 * @throws Exception if the experiment run or the update fails
 */
public void conductExperiment(final MySQLReductionExperiment exp) throws Exception {
	List<Map<String, Object>> mccvResults = Util.conductSingleOneStepReductionExperiment(exp.getExperiment());
	DescriptiveStatistics errorRate = new DescriptiveStatistics();
	DescriptiveStatistics runtime = new DescriptiveStatistics();
	for (Map<String, Object> result : mccvResults) {
		// NOTE(review): these casts assume "errorRate" is stored as Double and
		// "trainTime" as Long — a mismatch throws ClassCastException; confirm.
		errorRate.addValue((double) result.get("errorRate"));
		runtime.addValue((long) result.get("trainTime"));
	}

	/* prepare values for experiment update */
	Map<String, Object> values = new HashMap<>();
	values.put(ERROR_RATE_MIN, errorRate.getMin());
	values.put(ERROR_RATE_MAX, errorRate.getMax());
	values.put(ERROR_RATE_MEAN, errorRate.getMean());
	values.put(ERROR_RATE_STD, errorRate.getStandardDeviation());
	values.put(RUNTIME_MIN, runtime.getMin());
	values.put(RUNTIME_MAX, runtime.getMax());
	values.put(RUNTIME_MEAN, runtime.getMean());
	values.put(RUNTIME_STD, runtime.getStandardDeviation());
	this.updateExperiment(exp, values);
}
 
Example 6
Source File: StatUtilsTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Normalizes 77 random values and verifies that the result has a mean of 0
 * and a standard deviation of 1, up to a precision of 1E-10.
 */

@Test
public void testNormalize2() {
    final int length = 77;
    final double[] sample = new double[length];
    for (int i = 0; i < length; i++) {
        sample[i] = Math.random();
    }

    // standardize the sample
    final double[] standardizedSample = StatUtils.normalize(sample);

    // aggregate the standardized values
    final DescriptiveStatistics stats = new DescriptiveStatistics();
    for (final double value : standardizedSample) {
        stats.addValue(value);
    }

    // allow for limited floating-point precision
    final double distance = 1E-10;
    Assert.assertEquals(0.0, stats.getMean(), distance);
    Assert.assertEquals(1.0, stats.getStandardDeviation(), distance);

}
 
Example 7
Source File: StatUtils.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Normalize (standardize) the series, so in the end it is having a mean of 0 and a standard deviation of 1.
 *
 * @param sample Sample to normalize.
 * @return normalized (standardized) sample.
 * @since 2.2
 */
public static double[] normalize(final double[] sample) {
    DescriptiveStatistics stats = new DescriptiveStatistics();

    // Add the data from the series to stats
    for (int i = 0; i < sample.length; i++) {
        stats.addValue(sample[i]);
    }

    // Compute mean and standard deviation
    double mean = stats.getMean();
    double standardDeviation = stats.getStandardDeviation();

    // initialize the standardizedSample, which has the same length as the sample
    double[] standardizedSample = new double[sample.length];

    // NOTE(review): if the sample is constant, standardDeviation is 0 and the
    // division below yields NaN/Infinity — confirm callers never pass such input.
    for (int i = 0; i < sample.length; i++) {
        // z = (x - mean) / standardDeviation
        standardizedSample[i] = (sample[i] - mean) / standardDeviation;
    }
    return standardizedSample;
}
 
Example 8
Source File: HeatMapTask.java    From mzmine2 with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Divides every value in each column of the peak list by that column's
 * standard deviation (computed over the finite values only), in place.
 *
 * @param peakList matrix of peak values, indexed [column][row]; modified in place
 */
private void scale(double[][] peakList) {
  DescriptiveStatistics stdDevStats = new DescriptiveStatistics();

  for (int col = 0; col < peakList.length; col++) {
    stdDevStats.clear();

    // Only finite entries contribute to the standard deviation.
    for (double value : peakList[col]) {
      if (!Double.isNaN(value) && !Double.isInfinite(value)) {
        stdDevStats.addValue(value);
      }
    }

    double stdDev = stdDevStats.getStandardDeviation();

    for (int row = 0; row < peakList[col].length; row++) {
      if (stdDev != 0) {
        peakList[col][row] = peakList[col][row] / stdDev;
      }
    }
  }
}
 
Example 9
Source File: StatUtilsTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Run with 77 random values, assuming that the outcome has a mean of 0 and a standard deviation of 1 with a
 * precision of 1E-10.
 */

public void testNormalize2() {
    // create a sample with 77 random values in [0, 1)
    int length = 77;
    double sample[] = new double[length];
    for (int i = 0; i < length; i++) {
        sample[i] = Math.random();
    }
    // normalize (standardize) this sample
    double standardizedSample[] = StatUtils.normalize(sample);

    DescriptiveStatistics stats = new DescriptiveStatistics();
    // Add the data from the array
    for (int i = 0; i < length; i++) {
        stats.addValue(standardizedSample[i]);
    }
    // the calculations do have a limited precision
    double distance = 1E-10;
    // check the mean and standard deviation
    assertEquals(0.0, stats.getMean(), distance);
    assertEquals(1.0, stats.getStandardDeviation(), distance);

}
 
Example 10
Source File: StatUtils.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Normalize (standardize) the series, so in the end it is having a mean of 0 and a standard deviation of 1.
 *
 * @param sample Sample to normalize.
 * @return normalized (standardized) sample.
 * @since 2.2
 */
public static double[] normalize(final double[] sample) {
    final int n = sample.length;

    // accumulate the raw data
    final DescriptiveStatistics stats = new DescriptiveStatistics();
    for (int i = 0; i < n; i++) {
        stats.addValue(sample[i]);
    }

    final double mean = stats.getMean();
    final double standardDeviation = stats.getStandardDeviation();

    // map every value to its z-score relative to the sample's own moments
    final double[] standardizedSample = new double[n];
    for (int i = 0; i < n; i++) {
        standardizedSample[i] = (sample[i] - mean) / standardDeviation;
    }
    return standardizedSample;
}
 
Example 11
Source File: ExpressionFigure.java    From biojava with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Populates the figure with survival data and derives the plot bounds:
 * X spans the entry indices, Y spans the (floored/ceiled) range of the
 * given meta-data variable, and the variable's mean is recorded.
 *
 * @param title    chart title lines
 * @param _siList  survival entries to display (copied into this figure)
 * @param variable name of the meta-data variable plotted on the Y axis
 */
public void setSurvivalInfo(ArrayList<String> title, ArrayList<SurvivalInfo> _siList, String variable) {
	this.siList = new ArrayList<SurvivalInfo>();
	this.title = title;
	this.variable = variable;

	minX = 0.0;
	maxX = (double) _siList.size();
	DescriptiveStatistics ds = new DescriptiveStatistics();
	for (SurvivalInfo si : _siList) {
		this.siList.add(si);
		String v = si.getOriginalMetaData(variable);
		// NOTE(review): parseDouble throws NumberFormatException for
		// non-numeric meta-data — confirm upstream validation.
		Double value = Double.parseDouble(v);
		ds.addValue(value);
		if (maxTime == null || maxTime < si.getTime()) {
			maxTime = si.getTime();
		}

	}
	SurvivalInfoValueComparator sivc = new SurvivalInfoValueComparator(variable);
	Collections.sort(this.siList, sivc);
	mean = ds.getMean();
	// Snap the Y range to whole numbers. The original dead stores
	// (minY = 0.0, maxY = null) and the redundant (double) casts on
	// Math.floor/Math.ceil (which already return double) are removed.
	minY = Math.floor(ds.getMin());
	maxY = Math.ceil(ds.getMax());


	this.repaint();
}
 
Example 12
Source File: APARegionStatistics.java    From Juicebox with MIT License 5 votes vote down vote up
/**
 * Flattens the given matrix into a {@link DescriptiveStatistics} instance
 * containing every entry.
 *
 * @param x matrix of values
 * @return statistics over all entries of {@code x}
 */
public static DescriptiveStatistics statistics(double[][] x) {
    DescriptiveStatistics stats = new DescriptiveStatistics();
    for (int i = 0; i < x.length; i++) {
        for (int j = 0; j < x[i].length; j++) {
            stats.addValue(x[i][j]);
        }
    }
    return stats;
}
 
Example 13
Source File: APARegionStatistics.java    From JuiceboxLegacy with MIT License 5 votes vote down vote up
/**
 * Builds a {@link DescriptiveStatistics} containing every entry of the
 * given matrix.
 *
 * @param x matrix of values
 * @return statistics over all entries of {@code x}
 */
public static DescriptiveStatistics statistics(double[][] x) {
    DescriptiveStatistics stats = new DescriptiveStatistics();
    for (double[] row : x)
        for (double val : row)
            stats.addValue(val);
    return stats;
}
 
Example 14
Source File: MySQLEnsembleOfSimpleOneStepReductionsExperimentRunner.java    From AILibs with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Runs an ensemble-of-one-step-reductions experiment, aggregates the MCCV
 * results, and stores the mean error rate in the experiment record.
 *
 * @param exp experiment to conduct and update
 * @throws Exception if the experiment run or the update fails
 */
public void conductExperiment(final MySQLEnsembleOfSimpleOneStepReductionsExperiment exp) throws Exception {
	List<Map<String, Object>> mccvResults = Util.conductEnsembleOfOneStepReductionsExperiment(exp.getExperiment());
	DescriptiveStatistics errorRate = new DescriptiveStatistics();
	DescriptiveStatistics runtime = new DescriptiveStatistics();
	for (Map<String, Object> result : mccvResults) {
		errorRate.addValue((double) result.get(KEY_ERROR_RATE));
		runtime.addValue((long) result.get("trainTime"));
	}

	/* prepare values for experiment update */
	// NOTE(review): runtime statistics are collected but never stored here —
	// confirm dropping them from the update is intentional.
	Map<String, Object> values = new HashMap<>();
	values.put(KEY_ERROR_RATE, errorRate.getMean());
	this.updateExperiment(exp, values);
}
 
Example 15
Source File: Step2dGoldReasonStatistics.java    From argument-reasoning-comprehension-task with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the gold-standard reason corpus from a fixed gzipped XML file, prints
 * the number of arguments, and reports a frequency table plus sum/mean/std-dev
 * statistics over the number of premises per argument.
 */
public static void main(String[] args)
        throws IOException
{
    File inputFile = new File(
            "mturk/annotation-task/data/32-reasons-batch-0001-5000-2026args-gold.xml.gz");

    // read all arguments from the original corpus
    List<StandaloneArgument> arguments = XStreamSerializer
            .deserializeArgumentListFromXML(inputFile);
    System.out.println("Arguments: " + arguments.size());

    Frequency frequency = new Frequency();
    DescriptiveStatistics statistics = new DescriptiveStatistics();

    for (StandaloneArgument argument : arguments) {
        JCas jCas = argument.getJCas();
        Collection<Premise> premises = JCasUtil.select(jCas, Premise.class);

        // record the premise count both as a frequency-table entry and as a
        // sample for the sum/mean/std-dev statistics printed below
        frequency.addValue(premises.size());
        statistics.addValue(premises.size());
    }

    System.out.println(frequency);
    System.out.println(statistics.getSum());
    System.out.println(statistics.getMean());
    System.out.println(statistics.getStandardDeviation());
}
 
Example 16
Source File: Step3dExportGistToXMIFiles.java    From argument-reasoning-comprehension-task with Apache License 2.0 5 votes vote down vote up
/**
 * Copies reason annotations (gist only) from the two input files, writes a
 * meta-data CSV, exports every argument as an XMI file via a DKPro pipeline,
 * and prints premise statistics.
 *
 * @param argumentsWithReasonsFile input file with reason annotations
 * @param argumentsWithGistFile    input file with gist annotations
 * @param outputDir                directory for metadata.csv and the XMI files
 * @throws Exception if reading, copying, or the UIMA pipeline fails
 */
private static void export(File argumentsWithReasonsFile, File argumentsWithGistFile,
        File outputDir)
        throws Exception
{
    List<StandaloneArgument> arguments = ExportHelper.copyReasonAnnotationsWithGistOnly(
            argumentsWithReasonsFile, argumentsWithGistFile);

    String metaDataCSV = ExportHelper.exportMetaDataToCSV(arguments);
    FileUtils.write(new File(outputDir, "metadata.csv"), metaDataCSV, "utf-8");

    Frequency premisesFrequency = new Frequency();
    DescriptiveStatistics premisesStatistics = new DescriptiveStatistics();

    // and export them all as XMI files using standard DKPro pipeline
    // NOTE(review): the engine description is rebuilt on every iteration;
    // hoisting it out of the loop looks equivalent — confirm before changing.
    for (StandaloneArgument argument : arguments) {
        JCas jCas = argument.getJCas();
        SimplePipeline.runPipeline(jCas, AnalysisEngineFactory.createEngineDescription(
                XmiWriter.class,
                XmiWriter.PARAM_TARGET_LOCATION, outputDir,
                XmiWriter.PARAM_USE_DOCUMENT_ID, true,
                XmiWriter.PARAM_OVERWRITE, true
        ));

        // count all premises
        int count = JCasUtil.select(jCas, Premise.class).size();
        premisesStatistics.addValue(count);
        premisesFrequency.addValue(count);
    }

    System.out.println("Premises total: " + premisesStatistics.getSum());
    System.out.println("Argument: " + arguments.size());
    System.out.println(premisesFrequency);
}
 
Example 17
Source File: GLSMultipleLinearRegressionTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() throws Exception {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance

    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }

    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }

    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();

    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(errorMeans, cov,
     1.0e-12 * cov.getNorm(), rawGenerator);

    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.X.copy();

    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());

    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();

    // Generate Y vectors for 10000 models, estimate OLS and GLS for each, and
    // verify that the GLS estimates are, on average, closer to the true beta
    // (the previous comment said "OLS estimates are better", contradicting the
    // assertions below).
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {

        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).getData();

        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();

        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();

        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);

    }

    // Verify that GLS is on average more efficient, lower variance
    // NOTE(review): the assert keyword is a no-op unless the JVM runs with -ea;
    // JUnit's Assert methods would make these checks unconditional.
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());
}
 
Example 18
Source File: GLSMultipleLinearRegressionTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() throws Exception {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance

    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }

    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }

    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();

    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(errorMeans, cov,
     1.0e-12 * cov.getNorm(), rawGenerator);

    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.X.copy();

    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());

    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();

    // Generate Y vectors for 10000 models, estimate OLS and GLS for each, and
    // verify that the GLS estimates are, on average, closer to the true beta
    // (the previous comment said "OLS estimates are better", contradicting the
    // assertions below).
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {

        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).getData();

        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();

        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();

        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);

    }

    // Verify that GLS is on average more efficient, lower variance
    // NOTE(review): the assert keyword is a no-op unless the JVM runs with -ea;
    // JUnit's Assert methods would make these checks unconditional.
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());
}
 
Example 19
Source File: Step7aOriginalWarrantHITCreator.java    From argument-reasoning-comprehension-task with Apache License 2.0 4 votes vote down vote up
/**
 * Validates that every container carries a unique ID, computes score
 * statistics and a (debug-only) scatter plot, and returns only the
 * containers whose logic score is at least 0.68.
 */
@Override
protected List<ReasonClaimWarrantContainer> preFilterList(
        List<ReasonClaimWarrantContainer> list)
{
    // Fail fast on duplicate IDs. Set.add returns false when the element is
    // already present, so one call replaces the contains()+add() pair.
    Set<String> uniqueIDs = new TreeSet<>();
    for (ReasonClaimWarrantContainer claimWarrantContainer : list) {
        String id = claimWarrantContainer.getReasonClaimWarrantId();
        if (!uniqueIDs.add(id)) {
            throw new IllegalStateException("Repeated ID: " + id);
        }
    }

    // do some statistics first
    DescriptiveStatistics statHard = new DescriptiveStatistics();
    DescriptiveStatistics statLogic = new DescriptiveStatistics();

    XYSeriesCollection dataset = new XYSeriesCollection();
    XYSeries series = new XYSeries("Random");
    for (ReasonClaimWarrantContainer container : list) {
        series.add(container.getLogicScore(), container.getHardScore());
        statHard.addValue(container.getHardScore());
        statLogic.addValue(container.getLogicScore());
    }
    dataset.addSeries(series);

    //        System.out.println(statHard);
    //        System.out.println(statLogic);

    // NOTE(review): the chart below is built but never shown (setVisible is
    // commented out); it is kept as-is for debugging purposes.
    JFrame jFrame = new JFrame();
    // This will create the dataset
    // based on the dataset we create the chart
    JFreeChart chart = ChartFactory
            .createScatterPlot("title", "Logic", "Hard", dataset, PlotOrientation.HORIZONTAL,
                    false, false, false);
    // we put the chart into a panel
    ChartPanel chartPanel = new ChartPanel(chart);
    // default size
    chartPanel.setPreferredSize(new java.awt.Dimension(500, 270));
    // add it to our application
    jFrame.setContentPane(chartPanel);

    //        jFrame.pack();
    //        jFrame.setVisible(true);

    // ok, take only those with logic >= 0.68
    return list.stream()
            .filter(reasonClaimWarrantContainer -> reasonClaimWarrantContainer.getLogicScore()
                    >= 0.68).collect(Collectors.toList());
}
 
Example 20
Source File: GLSMultipleLinearRegressionTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() throws Exception {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance

    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }

    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }

    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();

    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(errorMeans, cov,
     1.0e-12 * cov.getNorm(), rawGenerator);

    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.X.copy();

    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());

    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();

    // Generate Y vectors for 10000 models, estimate OLS and GLS for each, and
    // verify that the GLS estimates are, on average, closer to the true beta
    // (the previous comment said "OLS estimates are better", contradicting the
    // assertions below).
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {

        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).getData();

        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();

        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();

        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);

    }

    // Verify that GLS is on average more efficient, lower variance
    // NOTE(review): the assert keyword is a no-op unless the JVM runs with -ea;
    // JUnit's Assert methods would make these checks unconditional.
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());
}