org.apache.commons.math3.stat.correlation.Covariance Java Examples

The following examples show how to use org.apache.commons.math3.stat.correlation.Covariance. They are taken from open-source projects; the source file, originating project, and license are noted above each example.
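Before the project examples, here is a minimal self-contained sketch of the two most common usage patterns: the pairwise covariance(double[], double[]) method and the column-wise covariance matrix built from a data matrix. The input values are made up for illustration.

import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.correlation.Covariance;

public class CovarianceDemo {
    public static void main(String[] args) {
        double[] x = {1.0, 2.0, 3.0, 4.0, 5.0};
        double[] y = {2.0, 1.9, 4.2, 3.8, 5.1};

        // Pairwise covariance; the no-argument constructor defaults to the
        // bias-corrected (divide by n-1) estimate.
        double covXY = new Covariance().covariance(x, y);
        System.out.println("cov(x, y) = " + covXY);

        // Column-wise covariance matrix: rows are observations, columns are
        // variables (here 5 observations of 2 variables). The constructor
        // requires at least 2 rows and 2 columns.
        double[][] data = {
            {1.0, 2.0}, {2.0, 1.9}, {3.0, 4.2}, {4.0, 3.8}, {5.0, 5.1}
        };
        RealMatrix cov = new Covariance(data).getCovarianceMatrix();
        System.out.println("covariance matrix = " + cov);
    }
}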
Example #1
Source File: TestDoubleCovarianceSampAggregation.java    From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 1) {
        return null;
    }
    return new Covariance().covariance(constructDoublePrimitiveArray(start + 5, length), constructDoublePrimitiveArray(start, length), true);
}
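The third argument selects bias correction: true divides the summed cross-products by n-1 (the sample covariance that a covar_samp aggregation returns), false divides by n. A small sketch of the relationship, with made-up inputs:

import org.apache.commons.math3.stat.correlation.Covariance;

public class BiasCorrectionDemo {
    public static void main(String[] args) {
        double[] x = {1, 2, 3, 4};
        double[] y = {2, 4, 5, 9};
        int n = x.length;

        Covariance c = new Covariance();
        double samp = c.covariance(x, y, true);   // divide by n-1
        double pop  = c.covariance(x, y, false);  // divide by n

        // The two estimates differ only by the factor (n-1)/n.
        System.out.println(samp * (n - 1) / n + " equals " + pop);
    }
}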
 
Example #2
Source File: MultivariateNormalDistributionTest.java    From astor with GNU General Public License v2.0
/**
 * Test the accuracy of sampling from the distribution.
 */
@Test
public void testSampling() {
    final double[] mu = { -1.5, 2 };
    final double[][] sigma = { { 2, -1.1 },
                               { -1.1, 2 } };
    final MultivariateNormalDistribution d = new MultivariateNormalDistribution(mu, sigma);
    d.reseedRandomGenerator(50);

    final int n = 500000;

    final double[][] samples = d.sample(n);
    final int dim = d.getDimension();
    final double[] sampleMeans = new double[dim];

    for (int i = 0; i < samples.length; i++) {
        for (int j = 0; j < dim; j++) {
            sampleMeans[j] += samples[i][j];
        }
    }

    final double sampledValueTolerance = 1e-2;
    for (int j = 0; j < dim; j++) {
        sampleMeans[j] /= samples.length;
        Assert.assertEquals(mu[j], sampleMeans[j], sampledValueTolerance);
    }

    final double[][] sampleSigma = new Covariance(samples).getCovarianceMatrix().getData();
    for (int i = 0; i < dim; i++) {
        for (int j = 0; j < dim; j++) {
            Assert.assertEquals(sigma[i][j], sampleSigma[i][j], sampledValueTolerance);
        }
    }
}
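Note that new Covariance(samples) treats each of the 500,000 rows as one observation and each of the two columns as a variable, so sampleSigma comes back as a 2x2 matrix that is directly comparable to sigma.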
 
Example #3
Source File: TestRealCovarianceSampAggregation.java    From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 1) {
        return null;
    }
    return (float) new Covariance().covariance(constructDoublePrimitiveArray(start + 5, length), constructDoublePrimitiveArray(start, length), true);
}
 
Example #4
Source File: CorrelationExample.java    From Java-Data-Analysis with MIT License
static double rho(double[][] data) {
    Variance v = new Variance();
    double varX = v.evaluate(data[0]);
    double sigX = Math.sqrt(varX);
    double varY = v.evaluate(data[1]);
    double sigY = Math.sqrt(varY);
    Covariance c = new Covariance(data);
    double sigXY = c.covariance(data[0], data[1]);
    return sigXY/(sigX*sigY);
}
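Two side notes on this method. First, passing data to the Covariance constructor builds a (here unused) covariance matrix over the columns; the no-argument constructor would suffice, since only the pairwise covariance(x, y) call is used. Second, commons-math ships the same computation as PearsonsCorrelation; a minimal sketch with toy data that should agree with rho up to floating-point error:

import org.apache.commons.math3.stat.correlation.PearsonsCorrelation;

public class RhoCheck {
    public static void main(String[] args) {
        double[][] data = {
            {1.0, 2.0, 3.0, 4.0, 5.0},    // x values
            {2.0, 1.9, 4.2, 3.8, 5.1}     // y values
        };
        // PearsonsCorrelation computes cov(x, y) / (sigmaX * sigmaY)
        // internally, so this should match rho(data).
        double r = new PearsonsCorrelation().correlation(data[0], data[1]);
        System.out.println("r = " + r);
    }
}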
 
Example #5
Source File: TestDoubleCovariancePopAggregation.java    From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 0) {
        return null;
    }
    if (length == 1) {
        return 0.;
    }
    Covariance covariance = new Covariance();
    return covariance.covariance(constructDoublePrimitiveArray(start + 5, length), constructDoublePrimitiveArray(start, length), false);
}
 
Example #6
Source File: TestRealCovariancePopAggregation.java    From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 0) {
        return null;
    }
    if (length == 1) {
        return 0.f;
    }
    Covariance covariance = new Covariance();
    return (float) covariance.covariance(constructDoublePrimitiveArray(start + 5, length), constructDoublePrimitiveArray(start, length), false);
}
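These two population variants pass false as the bias-correction flag, so Covariance divides the summed cross-products by n rather than n-1, matching covar_pop semantics; they also pin the single-observation case to 0 instead of null, since the population covariance of one point is zero.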
 
Example #7
Source File: MultivariateNormalMixtureExpectationMaximization.java    From astor with GNU General Public License v2.0
/**
 * Helper method to create a multivariate normal mixture model which can be
 * used to initialize {@link #fit(MixtureMultivariateNormalDistribution)}.
 *
 * This method uses the data supplied to the constructor to try to determine
 * a good mixture model at which to start the fit, but it is not guaranteed
 * to supply a model which will find the optimal solution or even converge.
 *
 * @param data Data to estimate distribution
 * @param numComponents Number of components for estimated mixture
 * @return Multivariate normal mixture model estimated from the data
 * @throws NumberIsTooLargeException if {@code numComponents} is greater
 * than the number of data rows.
 * @throws NumberIsTooSmallException if {@code numComponents < 2}.
 * @throws NotStrictlyPositiveException if data has less than 2 rows
 * @throws DimensionMismatchException if rows of data have different numbers
 *             of columns
 */
public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
                                                             final int numComponents)
    throws NotStrictlyPositiveException,
           DimensionMismatchException {
    if (data.length < 2) {
        throw new NotStrictlyPositiveException(data.length);
    }
    if (numComponents < 2) {
        throw new NumberIsTooSmallException(numComponents, 2, true);
    }
    if (numComponents > data.length) {
        throw new NumberIsTooLargeException(numComponents, data.length, true);
    }

    final int numRows = data.length;
    final int numCols = data[0].length;

    // sort the data
    final DataRow[] sortedData = new DataRow[numRows];
    for (int i = 0; i < numRows; i++) {
        sortedData[i] = new DataRow(data[i]);
    }
    Arrays.sort(sortedData);

    // uniform weight for each bin
    final double weight = 1d / numComponents;

    // components of mixture model to be created
    final List<Pair<Double, MultivariateNormalDistribution>> components =
            new ArrayList<Pair<Double, MultivariateNormalDistribution>>(numComponents);

    // create a component based on data in each bin
    for (int binIndex = 0; binIndex < numComponents; binIndex++) {
        // minimum index (inclusive) from sorted data for this bin
        final int minIndex = (binIndex * numRows) / numComponents;

        // maximum index (exclusive) from sorted data for this bin
        final int maxIndex = ((binIndex + 1) * numRows) / numComponents;

        // number of data records that will be in this bin
        final int numBinRows = maxIndex - minIndex;

        // data for this bin
        final double[][] binData = new double[numBinRows][numCols];

        // mean of each column for the data in this bin
        final double[] columnMeans = new double[numCols];

        // populate bin and create component
        for (int i = minIndex, iBin = 0; i < maxIndex; i++, iBin++) {
            for (int j = 0; j < numCols; j++) {
                final double val = sortedData[i].getRow()[j];
                columnMeans[j] += val;
                binData[iBin][j] = val;
            }
        }

        MathArrays.scaleInPlace(1d / numBinRows, columnMeans);

        // covariance matrix for this bin
        final double[][] covMat
            = new Covariance(binData).getCovarianceMatrix().getData();
        final MultivariateNormalDistribution mvn
            = new MultivariateNormalDistribution(columnMeans, covMat);

        components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn));
    }

    return new MixtureMultivariateNormalDistribution(components);
}
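A sketch of how this helper is typically wired into the EM fit. The data array and the two-component choice are made up for illustration; on such a tiny sample the fit is only a smoke test:

import org.apache.commons.math3.distribution.MixtureMultivariateNormalDistribution;
import org.apache.commons.math3.distribution.fitting.MultivariateNormalMixtureExpectationMaximization;

public class MixtureFitDemo {
    public static void main(String[] args) {
        // Toy two-dimensional data forming two loose clusters.
        double[][] data = {
            {-1.0, -1.1}, {-0.9, -1.0}, {-1.2, -0.8},
            { 3.0,  3.1}, { 2.9,  3.3}, { 3.2,  2.9}
        };
        MultivariateNormalMixtureExpectationMaximization em =
                new MultivariateNormalMixtureExpectationMaximization(data);
        // estimate(...) builds the initial mixture as shown above; fit(...) runs EM.
        MixtureMultivariateNormalDistribution initial =
                MultivariateNormalMixtureExpectationMaximization.estimate(data, 2);
        em.fit(initial);
        System.out.println("log-likelihood: " + em.getLogLikelihood());
    }
}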
 
Example #8
Source File: GLSMultipleLinearRegressionTest.java    From astor with GNU General Public License v2.0
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance
    
    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }
    
    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }
    
    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();
      
    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(errorMeans, cov,
     1.0e-12 * cov.getNorm(), rawGenerator);
    
    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.getX().copy();
    
    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());
    
    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();
    
    // Generate Y vectors for 10000 models, estimate GLS and OLS, and
    // verify that the GLS estimates are, on average, closer to the true beta
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {
        
        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).toArray();
        
        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();
        
        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();
        
        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);
        
    }
    
    // Verify that GLS is on average more efficient, lower variance
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());  
}
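Covariance appears here only to estimate the error covariance from the 1,000 simulated error vectors; that matrix then seeds both the CorrelatedRandomVectorGenerator and the GLS model. Also note the plain Java assert statements at the end: they are no-ops unless the JVM runs with -ea, so under a default test runner these final checks would be silently skipped (Assert.assertTrue would enforce them unconditionally).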
 
Example #9
Source File: MultivariateNormalMixtureExpectationMaximization.java    From astor with GNU General Public License v2.0
/**
 * Helper method to create a multivariate normal mixture model which can be
 * used to initialize {@link #fit(MixtureMultivariateRealDistribution)}.
 *
 * This method uses the data supplied to the constructor to try to determine
 * a good mixture model at which to start the fit, but it is not guaranteed
 * to supply a model which will find the optimal solution or even converge.
 *
 * @param data Data to estimate distribution
 * @param numComponents Number of components for estimated mixture
 * @return Multivariate normal mixture model estimated from the data
 * @throws NumberIsTooLargeException if {@code numComponents} is greater
 * than the number of data rows.
 * @throws NumberIsTooSmallException if {@code numComponents < 2}.
 * @throws NotStrictlyPositiveException if data has less than 2 rows
 * @throws DimensionMismatchException if rows of data have different numbers
 *             of columns
 * @see #fit
 */
public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
                                                             final int numComponents)
    throws NotStrictlyPositiveException,
           DimensionMismatchException {
    if (data.length < 2) {
        throw new NotStrictlyPositiveException(data.length);
    }
    if (numComponents < 2) {
        throw new NumberIsTooSmallException(numComponents, 2, true);
    }
    if (numComponents > data.length) {
        throw new NumberIsTooLargeException(numComponents, data.length, true);
    }

    final int numRows = data.length;
    final int numCols = data[0].length;

    // sort the data
    final DataRow[] sortedData = new DataRow[numRows];
    for (int i = 0; i < numRows; i++) {
        sortedData[i] = new DataRow(data[i]);
    }
    Arrays.sort(sortedData);

    final int totalBins = numComponents;

    // uniform weight for each bin
    final double weight = 1d / totalBins;

    // components of mixture model to be created
    final List<Pair<Double, MultivariateNormalDistribution>> components =
            new ArrayList<Pair<Double, MultivariateNormalDistribution>>();

    // create a component based on data in each bin
    for (int binNumber = 1; binNumber <= totalBins; binNumber++) {
        // minimum index from sorted data for this bin
        final int minIndex
            = (int) FastMath.max(0,
                                 FastMath.floor((binNumber - 1) * numRows / totalBins));

        // maximum index from sorted data for this bin
        final int maxIndex
            = (int) FastMath.ceil(binNumber * numRows / numComponents) - 1;

        // number of data records that will be in this bin
        final int numBinRows = maxIndex - minIndex + 1;

        // data for this bin
        final double[][] binData = new double[numBinRows][numCols];

        // mean of each column for the data in this bin
        final double[] columnMeans = new double[numCols];

        // populate bin and create component
        for (int i = minIndex, iBin = 0; i <= maxIndex; i++, iBin++) {
            for (int j = 0; j < numCols; j++) {
                final double val = sortedData[i].getRow()[j];
                columnMeans[j] += val;
                binData[iBin][j] = val;
            }
        }

        MathArrays.scaleInPlace(1d / numBinRows, columnMeans);

        // covariance matrix for this bin
        final double[][] covMat
            = new Covariance(binData).getCovarianceMatrix().getData();
        final MultivariateNormalDistribution mvn
            = new MultivariateNormalDistribution(columnMeans, covMat);

        components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn));
    }

    return new MixtureMultivariateNormalDistribution(components);
}
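This variant derives the bin boundaries from 1-based bin numbers with FastMath.floor and FastMath.ceil, but both calls receive the result of Java integer division (for example binNumber * numRows / numComponents), so the floor and ceil are no-ops; compare the simpler inclusive/exclusive index arithmetic in Example #7 above.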
 
Example #20
Source File: GLSMultipleLinearRegressionTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance
    
    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }
    
    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }
    
    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();
      
    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(errorMeans, cov,
     1.0e-12 * cov.getNorm(), rawGenerator);
    
    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.getX().copy();
    
    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());
    
    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();
    
    // Generate Y vectors for 10000 models, estimate GLS and OLS and
    // Verify that OLS estimates are better
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {
        
        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).toArray();
        
        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();
        
        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();
        
        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);
        
    }
    
    // Verify that GLS is on average more efficient, lower variance
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());  
}
 
Example #21
Source File: MultivariateNormalMixtureExpectationMaximization.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Helper method to create a multivariate normal mixture model which can be
 * used to initialize {@link #fit(MixtureMultivariateNormalDistribution)}.
 *
 * This method uses the data supplied to the constructor to try to determine
 * a good mixture model at which to start the fit, but it is not guaranteed
 * to supply a model which will find the optimal solution or even converge.
 *
 * @param data Data to estimate distribution
 * @param numComponents Number of components for estimated mixture
 * @return Multivariate normal mixture model estimated from the data
 * @throws NumberIsTooLargeException if {@code numComponents} is greater
 * than the number of data rows.
 * @throws NumberIsTooSmallException if {@code numComponents < 2}.
 * @throws NotStrictlyPositiveException if data has less than 2 rows
 * @throws DimensionMismatchException if rows of data have different numbers
 *             of columns
 */
public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
                                                             final int numComponents)
    throws NotStrictlyPositiveException,
           DimensionMismatchException {
    if (data.length < 2) {
        throw new NotStrictlyPositiveException(data.length);
    }
    if (numComponents < 2) {
        throw new NumberIsTooSmallException(numComponents, 2, true);
    }
    if (numComponents > data.length) {
        throw new NumberIsTooLargeException(numComponents, data.length, true);
    }

    final int numRows = data.length;
    final int numCols = data[0].length;

    // sort the data
    final DataRow[] sortedData = new DataRow[numRows];
    for (int i = 0; i < numRows; i++) {
        sortedData[i] = new DataRow(data[i]);
    }
    Arrays.sort(sortedData);

    // uniform weight for each bin
    final double weight = 1d / numComponents;

    // components of mixture model to be created
    final List<Pair<Double, MultivariateNormalDistribution>> components =
            new ArrayList<Pair<Double, MultivariateNormalDistribution>>(numComponents);

    // create a component based on data in each bin
    for (int binIndex = 0; binIndex < numComponents; binIndex++) {
        // minimum index (inclusive) from sorted data for this bin
        final int minIndex = (binIndex * numRows) / numComponents;

        // maximum index (exclusive) from sorted data for this bin
        final int maxIndex = ((binIndex + 1) * numRows) / numComponents;

        // number of data records that will be in this bin
        final int numBinRows = maxIndex - minIndex;

        // data for this bin
        final double[][] binData = new double[numBinRows][numCols];

        // mean of each column for the data in the this bin
        final double[] columnMeans = new double[numCols];

        // populate bin and create component
        for (int i = minIndex, iBin = 0; i < maxIndex; i++, iBin++) {
            for (int j = 0; j < numCols; j++) {
                final double val = sortedData[i].getRow()[j];
                columnMeans[j] += val;
                binData[iBin][j] = val;
            }
        }

        MathArrays.scaleInPlace(1d / numBinRows, columnMeans);

        // covariance matrix for this bin
        final double[][] covMat
            = new Covariance(binData).getCovarianceMatrix().getData();
        final MultivariateNormalDistribution mvn
            = new MultivariateNormalDistribution(columnMeans, covMat);

        components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn));
    }

    return new MixtureMultivariateNormalDistribution(components);
}
 
Example #22
Source File: GLSMultipleLinearRegressionTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance
    
    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }
    
    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }
    
    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();
      
    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(errorMeans, cov,
     1.0e-12 * cov.getNorm(), rawGenerator);
    
    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.getX().copy();
    
    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());
    
    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();
    
    // Generate Y vectors for 10000 models, estimate GLS and OLS and
    // Verify that OLS estimates are better
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {
        
        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).toArray();
        
        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();
        
        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();
        
        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);
        
    }
    
    // Verify that GLS is on average more efficient, lower variance
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());  
}
 
Example #23
Source File: GLSMultipleLinearRegressionTest.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance
    
    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }
    
    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }
    
    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();
      
    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(errorMeans, cov,
     1.0e-12 * cov.getNorm(), rawGenerator);
    
    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.getX().copy();
    
    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());
    
    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();
    
    // Generate Y vectors for 10000 models, estimate GLS and OLS and
    // Verify that OLS estimates are better
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {
        
        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).toArray();
        
        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();
        
        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();
        
        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);
        
    }
    
    // Verify that GLS is on average more efficient, lower variance
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());  
}
 
Example #24
Source File: MultivariateNormalMixtureExpectationMaximization.java    From astor with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Helper method to create a multivariate normal mixture model which can be
 * used to initialize {@link #fit(MixtureMultivariateNormalDistribution)}.
 *
 * This method uses the data supplied to the constructor to try to determine
 * a good mixture model at which to start the fit, but it is not guaranteed
 * to supply a model which will find the optimal solution or even converge.
 *
 * @param data Data to estimate distribution
 * @param numComponents Number of components for estimated mixture
 * @return Multivariate normal mixture model estimated from the data
 * @throws NumberIsTooLargeException if {@code numComponents} is greater
 * than the number of data rows.
 * @throws NumberIsTooSmallException if {@code numComponents < 2}.
 * @throws NotStrictlyPositiveException if data has less than 2 rows
 * @throws DimensionMismatchException if rows of data have different numbers
 *             of columns
 */
public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
                                                             final int numComponents)
    throws NotStrictlyPositiveException,
           DimensionMismatchException {
    if (data.length < 2) {
        throw new NotStrictlyPositiveException(data.length);
    }
    if (numComponents < 2) {
        throw new NumberIsTooSmallException(numComponents, 2, true);
    }
    if (numComponents > data.length) {
        throw new NumberIsTooLargeException(numComponents, data.length, true);
    }

    final int numRows = data.length;
    final int numCols = data[0].length;

    // sort the data
    final DataRow[] sortedData = new DataRow[numRows];
    for (int i = 0; i < numRows; i++) {
        sortedData[i] = new DataRow(data[i]);
    }
    Arrays.sort(sortedData);

    // uniform weight for each bin
    final double weight = 1d / numComponents;

    // components of mixture model to be created
    final List<Pair<Double, MultivariateNormalDistribution>> components =
            new ArrayList<Pair<Double, MultivariateNormalDistribution>>(numComponents);

    // create a component based on data in each bin
    for (int binIndex = 0; binIndex < numComponents; binIndex++) {
        // minimum index (inclusive) from sorted data for this bin
        final int minIndex = (binIndex * numRows) / numComponents;

        // maximum index (exclusive) from sorted data for this bin
        final int maxIndex = ((binIndex + 1) * numRows) / numComponents;

        // number of data records that will be in this bin
        final int numBinRows = maxIndex - minIndex;

        // data for this bin
        final double[][] binData = new double[numBinRows][numCols];

        // mean of each column for the data in this bin
        final double[] columnMeans = new double[numCols];

        // populate bin and create component
        for (int i = minIndex, iBin = 0; i < maxIndex; i++, iBin++) {
            for (int j = 0; j < numCols; j++) {
                final double val = sortedData[i].getRow()[j];
                columnMeans[j] += val;
                binData[iBin][j] = val;
            }
        }

        MathArrays.scaleInPlace(1d / numBinRows, columnMeans);

        // covariance matrix for this bin
        final double[][] covMat
            = new Covariance(binData).getCovarianceMatrix().getData();
        final MultivariateNormalDistribution mvn
            = new MultivariateNormalDistribution(columnMeans, covMat);

        components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn));
    }

    return new MixtureMultivariateNormalDistribution(components);
}
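 
A minimal usage sketch for the method above, with made-up two-component data (the variable names are illustrative, not from the original source): estimate() produces the initial binned model and fit() refines it by expectation-maximization.

    double[][] data = {
        { -1.0, -0.9 }, { -1.2, -1.1 }, { -0.8, -1.0 },
        {  2.1,  2.0 }, {  1.9,  2.2 }, {  2.0,  1.8 }
    };
    MultivariateNormalMixtureExpectationMaximization em =
        new MultivariateNormalMixtureExpectationMaximization(data);
    // seed EM with the binned estimate, then fit
    em.fit(MultivariateNormalMixtureExpectationMaximization.estimate(data, 2));
    MixtureMultivariateNormalDistribution fitted = em.getFittedModel();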
 
Example #25
Source File: StrategyFilter.java    From iMetrica with GNU General Public License v3.0 4 votes vote down vote up
public static double[] maximizeSharpe(double[][] data, int n_basket, int nobs, int nonneg)
   {
   
      int i,j; 
      double[] means = new double[n_basket];
      double sum=0;
      RealVector sol; 
      double[] w = new double[n_basket]; 
       
      // mean return of each asset (column means of the returns matrix)
      for(i=0;i<n_basket;i++)
      {
       sum=0;
       for(j=0;j<nobs;j++)
       {
        sum = sum + data[j][i];
       }
       means[i] = sum/nobs;
      } 
       
      RealVector m = new ArrayRealVector(means, false);
      Covariance covComp = new Covariance(data);
       
      RealMatrix rm = covComp.getCovarianceMatrix();
      
       
      // solve Sigma * w = mu; the solution is proportional to the maximum-Sharpe weights
      try
      { 
        DecompositionSolver solver = new QRDecomposition(rm).getSolver();
        sol = solver.solve(m);
        w = sol.toArray(); 
      }
      catch(SingularMatrixException sme) 
      {
       // covariance matrix singular: fall back to uniform weights
       w = new double[n_basket]; 
       for(i=0;i<n_basket;i++) {w[i] = 1.0/n_basket;}
      }
      
      // apply the non-negativity policy, then renormalize the weights to sum to one
      double sumw = 0;
      for(i=0;i<w.length;i++) 
      {
        if(nonneg == 1)
        {if(w[i] < 0) {w[i] = 1.0/n_basket;}}
        else if(nonneg == 2)
        {w[i] = Math.abs(w[i]);}
        
        sumw = sumw + w[i]; 
      }
       
      for(i=0;i<w.length;i++) {w[i] = w[i]/sumw;}
    
      return w; 
  }
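 
The core step in maximizeSharpe is solving Sigma * w = mu, whose solution is proportional to the unconstrained maximum-Sharpe (tangency) weights. A self-contained sketch of just that step, with made-up daily returns:

    double[][] returns = {
        {  0.010,  0.020 }, {  0.000, -0.010 }, {  0.020,  0.010 },
        { -0.010,  0.000 }, {  0.010,  0.010 }
    };
    RealMatrix sigma = new Covariance(returns).getCovarianceMatrix();
    RealVector mu = new ArrayRealVector(new double[] { 0.006, 0.006 });
    // unnormalized weights; rescale so they sum to one, as the method above does
    RealVector w = new QRDecomposition(sigma).getSolver().solve(mu);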
 
Example #26
Source File: EvolutionPanel.java    From iMetrica with GNU General Public License v3.0 4 votes vote down vote up
public void fuseStrategies()
{
  if(n_saved_perf > 0)
  {
  
    int i,j,k; int npos = 0;
    int n_basket = n_saved_perf+1;
    int min_obs = mdfaEvolutionCanvas.min_obs;
    double[][] data = new double[min_obs][n_basket];
    double[] w = new double[n_basket];
    double[] target = new double[min_obs];
    //fill with current strategy first
    for(i=0;i<min_obs;i++)
    { 
      data[min_obs - 1 - i][0] = performances[performances.length - 1 - i].getReturn();        
    }
    
    for(k=0;k<n_saved_perf;k++)
    { 
     JInvestment[] temp = portfolio_invest.get(k);
     for(i=0;i<min_obs;i++)
     { 
      data[min_obs - 1 - i][k+1] = temp[temp.length - 1 - i].getReturn();
     } 
    }

    double[] means = new double[n_basket];
    double sum=0;
    RealVector sol; 
     
    for(i=0;i<n_basket;i++)
    {
     sum=0;
     for(j=0;j<min_obs;j++)
     {sum = sum + data[j][i];}
     means[i] = sum/min_obs;
    } 
     
    RealVector m = new ArrayRealVector(means, false);
    Covariance covComp = new Covariance(data);
    RealMatrix rm = covComp.getCovarianceMatrix();  

    if(uniformWeightsCheck.isSelected())
    {
      for(i=0;i<n_basket;i++) {w[i] = 1.0/n_basket;}
    }
    else if(maxSharpeWeightsCheck.isSelected())
    {
    
     try
     { 
      DecompositionSolver solver = new QRDecomposition(rm).getSolver();
      sol = solver.solve(m);
      w = sol.toArray(); 
     }
     catch(SingularMatrixException sme) 
     {
       System.out.println("Matrix singular: setting weights to uniform"); 
       w = new double[n_basket]; 
       for(i=0;i<n_basket;i++) {w[i] = 1.0/n_basket;}
     }
    
     double sumw = 0;
     for(i=0;i<w.length;i++) 
     {
      if(w[i] < 0) {w[i] = 1.0/n_basket;}       
      sumw = sumw + w[i]; 
     }
     for(i=0;i<w.length;i++) {w[i] = w[i]/sumw;}
    }   
    
    for(i=0;i<min_obs;i++)
    {
      sum = 0;
      for(k=0;k<n_basket;k++)
      {sum = sum + data[i][k]*w[k];}
      target[i] = sum;  
      if(target[i] > 0) {npos++;}
    }
    
    // annualized Sharpe ratio of the fused strategy (assuming 250 trading days per year)
    double[] mstd = mean_std(target); 
    sharpe_ratio = Math.sqrt(250)*mstd[0]/mstd[1];    
    double[] cum_port_returns = cumsum(target,min_obs); 

    max_drawdown = computeDrawdown(cum_port_returns);
   
    if(realrets) 
    {cum_port_returns = cumsum(target,target.length);}     
    
    double bRatio = (double)npos/min_obs;      
    
    mdfaEvolutionCanvas.addAggregate(cum_port_returns, df2.format(sharpe_ratio) + ", " + df2.format(max_drawdown) + ", " + df.format(bRatio));
    
  }
}
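 
The Sharpe ratio above is annualized as sqrt(250) * mean / std of the daily fused returns (mean_std and cumsum are project helpers). An equivalent sketch using Commons Math's DescriptiveStatistics, assuming target holds daily returns and mean_std uses the sample standard deviation:

    DescriptiveStatistics stats = new DescriptiveStatistics();
    for (double r : target) {
        stats.addValue(r);
    }
    // getStandardDeviation() is the bias-corrected (n - 1) sample standard deviation
    double annualizedSharpe = Math.sqrt(250) * stats.getMean() / stats.getStandardDeviation();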
 
Example #27
Source File: GmmSemi.java    From orbit-image-analysis with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Helper method to create a multivariate normal mixture model which can be
 * used to initialize {@link #fit(MixtureMultivariateNormalDistribution)}.
 *
 * This method uses the supplied data to try to determine a good mixture
 * model at which to start the fit, but it is not guaranteed to supply a
 * model which will find the optimal solution or even converge.
 *
 * @param data Data to estimate distribution
 * @param numComponents Number of components for estimated mixture
 * @return Multivariate normal mixture model estimated from the data
 * @throws NumberIsTooLargeException if {@code numComponents} is greater
 * than the number of data rows.
 * @throws NumberIsTooSmallException if {@code numComponents < 2}.
 * @throws NotStrictlyPositiveException if data has fewer than 2 rows
 * @throws DimensionMismatchException if rows of data have different numbers
 *             of columns
 */
public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
                                                             final int numComponents)
    throws NotStrictlyPositiveException,
           DimensionMismatchException {
    if (data.length < 2) {
        throw new NotStrictlyPositiveException(data.length);
    }
    if (numComponents < 2) {
        throw new NumberIsTooSmallException(numComponents, 2, true);
    }
    if (numComponents > data.length) {
        throw new NumberIsTooLargeException(numComponents, data.length, true);
    }

    final int numRows = data.length;
    final int numCols = data[0].length;

    // sort the data
    final DataRow[] sortedData = new DataRow[numRows];
    for (int i = 0; i < numRows; i++) {
        sortedData[i] = new DataRow(data[i]);
    }
    Arrays.sort(sortedData);

    // uniform weight for each bin
    final double weight = 1d / numComponents;

    // components of mixture model to be created
    final List<Pair<Double, MultivariateNormalDistribution>> components =
            new ArrayList<Pair<Double, MultivariateNormalDistribution>>(numComponents);

    // create a component based on data in each bin
    for (int binIndex = 0; binIndex < numComponents; binIndex++) {
        // minimum index (inclusive) from sorted data for this bin
        final int minIndex = (binIndex * numRows) / numComponents;

        // maximum index (exclusive) from sorted data for this bin
        final int maxIndex = ((binIndex + 1) * numRows) / numComponents;

        // number of data records that will be in this bin
        final int numBinRows = maxIndex - minIndex;

        // data for this bin
        final double[][] binData = new double[numBinRows][numCols];

        // mean of each column for the data in this bin
        final double[] columnMeans = new double[numCols];

        // populate bin and create component
        for (int i = minIndex, iBin = 0; i < maxIndex; i++, iBin++) {
            for (int j = 0; j < numCols; j++) {
                final double val = sortedData[i].getRow()[j];
                columnMeans[j] += val;
                binData[iBin][j] = val;
            }
        }

        MathArrays.scaleInPlace(1d / numBinRows, columnMeans);

        // covariance matrix for this bin
        final double[][] covMat
            = new Covariance(binData).getCovarianceMatrix().getData();
        final MultivariateNormalDistribution mvn
            = new MultivariateNormalDistribution(columnMeans, covMat);

        components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn));
    }

    return new MixtureMultivariateNormalDistribution(components);
}
 
Example #28
Source File: CovarianceTest.java    From Java-Data-Science-Cookbook with MIT License 4 votes vote down vote up
public void calculateCov(double[] x, double[] y){
	// biasCorrected = false: population covariance, dividing by n rather than n - 1
	double covariance = new Covariance().covariance(x, y, false);
	System.out.println(covariance);
}
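 
The third argument selects the divisor used by Covariance. A small worked sketch with made-up data, where the sum of cross-deviations is 10:

    double[] x = { 1, 2, 3, 4 };
    double[] y = { 2, 4, 6, 8 };
    double sample = new Covariance().covariance(x, y, true);       // 10 / (4 - 1) = 3.333...
    double population = new Covariance().covariance(x, y, false);  // 10 / 4 = 2.5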
 
Example #29
Source File: StatsUtil.java    From MeteoInfo with GNU Lesser General Public License v3.0 3 votes vote down vote up
/**
 * Computes covariance of two arrays.
 *
 * @param x X data
 * @param y Y data
 * @param bias If true, the returned value is the bias-corrected sample
 * covariance (divides by n - 1); if false, the population covariance
 * (divides by n)
 * @return The covariance
 */
public static double covariance(Array x, Array y, boolean bias){
    double[] xd = (double[]) ArrayUtil.copyToNDJavaArray_Double(x);
    double[] yd = (double[]) ArrayUtil.copyToNDJavaArray_Double(y);
    double r = new Covariance().covariance(xd, yd, bias);
    return r;
}
 
Example #30
Source File: Matrix.java    From buffer_bci with GNU General Public License v3.0 2 votes vote down vote up
/**
 * Covariance of the rows of the matrix (the matrix is transposed before the
 * column-wise Covariance computation)
 *
 * @return covariance matrix with size rows x rows
 */
public Matrix covariance() {
    Covariance cov = new Covariance(this.transpose(), true);
    return new Matrix(cov.getCovarianceMatrix());
}
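 
For comparison, a minimal sketch using the Covariance(RealMatrix, boolean) constructor directly on an illustrative matrix: it computes the covariance of the columns of whatever matrix it is given, which is why the method above transposes first.

    RealMatrix m = MatrixUtils.createRealMatrix(new double[][] {
        { 1.0, 2.0 },
        { 2.0, 1.0 },
        { 3.0, 3.0 }
    });
    RealMatrix colCov = new Covariance(m, true).getCovarianceMatrix();  // 2 x 2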