org.apache.mahout.math.jet.random.AbstractContinousDistribution Java Examples

The following examples show how to use org.apache.mahout.math.jet.random.AbstractContinousDistribution. They are drawn from two open-source projects, streaminer and t-digest; each example is headed by its source file, project, and license. (The spelling "Continous" is not a typo on this page: it is the actual name of the Mahout class.)
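
All of the examples follow the same pattern: AbstractContinousDistribution leaves nextDouble() for subclasses to implement, so a custom sampler only needs to subclass it, usually anonymously, and return one sample per call. Here is a minimal, self-contained sketch of that pattern, distilled from Examples #1 and #5 below; the class name MixtureDemo and the fixed seed are illustrative only.

import java.util.Random;

import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
import org.apache.mahout.math.jet.random.Normal;
import org.apache.mahout.math.jet.random.Uniform;

public class MixtureDemo {
    public static void main(String[] args) {
        final Random gen = new Random(42);

        // draw half the samples from a tight normal spike, half from a wide uniform
        AbstractContinousDistribution mix = new AbstractContinousDistribution() {
            final AbstractContinousDistribution normal = new Normal(0, 1e-5, gen);
            final AbstractContinousDistribution uniform = new Uniform(-1, 1, gen);

            @Override
            public double nextDouble() {
                return gen.nextDouble() < 0.5 ? uniform.nextDouble() : normal.nextDouble();
            }
        };

        for (int i = 0; i < 5; i++) {
            System.out.printf("%.6f%n", mix.nextDouble());
        }
    }
}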
Example #1
Source File: TDigestTest.java    From streaminer with Apache License 2.0
@Test
public void testNarrowNormal() {
    // this mixture of a uniform and normal distribution has a very narrow peak which is centered
    // near the median.  Our system should be scale invariant and work well regardless.
    final Random gen = RandomUtils.getRandom();
    AbstractContinousDistribution mix = new AbstractContinousDistribution() {
        AbstractContinousDistribution normal = new Normal(0, 1e-5, gen);
        AbstractContinousDistribution uniform = new Uniform(-1, 1, gen);

        @Override
        public double nextDouble() {
            double x;
            if (gen.nextDouble() < 0.5) {
                x = uniform.nextDouble();
            } else {
                x = normal.nextDouble();
            }
            return x;
        }
    };

    for (int i = 0; i < repeats(); i++) {
        runTest(mix, 100, new double[]{0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 0.999}, "mixture", false, gen);
    }
}
 
Example #2
Source File: TDigestTest.java    From streaminer with Apache License 2.0
@Test
public void testSequentialPoints() {
    Random gen = RandomUtils.getRandom();
    for (int i = 0; i < repeats(); i++) {
        runTest(new AbstractContinousDistribution() {
            double base = 0;

            @Override
            public double nextDouble() {
                base += Math.PI * 1e-5;
                return base;
            }
        }, 100, new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999},
                "sequential", true, gen);
    }
}
 
Example #3
Source File: TDigestTest.java    From streaminer with Apache License 2.0
private void compare(AbstractContinousDistribution gen, String tag, long scale, Random rand) {
    for (double compression : new double[]{2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000}) {
        QDigest qd = new QDigest(compression);
        TDigest dist = new TDigest(compression, rand);
        List<Double> data = Lists.newArrayList();
        for (int i = 0; i < 100000; i++) {
            double x = gen.nextDouble();
            dist.add(x);
            qd.offer((long) (x * scale));
            data.add(x);
        }
        dist.compress();
        Collections.sort(data);

        for (double q : new double[]{0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.99, 0.999}) {
            double x1 = dist.quantile(q);
            double x2 = (double) qd.getQuantile(q) / scale;
            double e1 = cdf(x1, data) - q;
            double e2 = cdf(x2, data) - q;
            // columns: tag, compression, q, t-digest error, Q-digest error, t-digest bytes, Q-digest bytes
            System.out.printf("%s\t%.0f\t%.8f\t%.10g\t%.10g\t%d\t%d\n", tag, compression, q, e1, e2, dist.smallByteSize(), QDigest.serialize(qd).length);
        }
    }
}
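
Note that cdf(x, data) above is a private helper defined elsewhere in TDigestTest and not reproduced on this page; it computes the empirical CDF of x over the collected samples. A plausible reconstruction, assuming the usual convention of giving samples exactly equal to x half weight, looks like this:

// hypothetical reconstruction of TDigestTest's cdf(x, data) helper
private double cdf(final double x, List<Double> data) {
    int below = 0;      // samples strictly less than x
    int atOrBelow = 0;  // samples less than or equal to x
    for (Double v : data) {
        if (v < x) {
            below++;
        }
        if (v <= x) {
            atOrBelow++;
        }
    }
    // ties at x count half, the standard mid-distribution convention
    return (below + atOrBelow) / 2.0 / data.size();
}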
 
Example #4
Source File: ComparisonTest.java    From t-digest with Apache License 2.0
private void compareQD(PrintWriter out, AbstractContinousDistribution gen, String tag, long scale) {
    for (double compression : new double[]{10, 20, 50, 100, 200, 500, 1000, 2000}) {
        QDigest qd = new QDigest(compression);
        TDigest dist = new MergingDigest(compression);
        double[] data = new double[100000];
        for (int i = 0; i < 100000; i++) {
            double x = gen.nextDouble();
            dist.add(x);
            qd.offer((long) (x * scale));
            data[i] = x;
        }
        dist.compress();
        Arrays.sort(data);

        for (double q : new double[]{1e-5, 1e-4, 0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999, 0.9999, 0.99999}) {
            double x1 = dist.quantile(q);
            double x2 = (double) qd.getQuantile(q) / scale;
            double e1 = Dist.cdf(x1, data) - q;
            double e2 = Dist.cdf(x2, data) - q;
            out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n", tag, compression, q, e1, e2, dist.smallByteSize(), QDigest.serialize(qd).length);
        }
    }
}
 
Example #5
Source File: TDigestTest.java    From t-digest with Apache License 2.0
@Test
public void testNarrowNormal() {
    // this mixture of a uniform and normal distribution has a very narrow peak which is centered
    // near the median.  Our system should be scale invariant and work well regardless.
    final Random gen = getRandom();
    AbstractContinousDistribution mix = new AbstractContinousDistribution() {
        final AbstractContinousDistribution normal = new Normal(0, 1e-5, gen);
        final AbstractContinousDistribution uniform = new Uniform(-1, 1, gen);

        @Override
        public double nextDouble() {
            double x;
            if (gen.nextDouble() < 0.5) {
                x = uniform.nextDouble();
            } else {
                x = normal.nextDouble();
            }
            return x;
        }
    };

    for (int i = 0; i < repeats(); i++) {
        runTest(factory(400), mix, new double[]{0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 0.999}, "mixture", false);
    }
}
 
Example #6
Source File: ComparisonTest.java    From t-digest with Apache License 2.0
private void compareSQ(PrintWriter out, AbstractContinousDistribution gen, String tag) {
    double[] quantiles = {0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.99, 0.999};
    for (double compression : new double[]{10, 20, 50, 100, 200, 500, 1000, 2000}) {
        QuantileEstimator sq = new QuantileEstimator(1001);
        TDigest dist = new MergingDigest(compression);
        double[] data = new double[100000];
        for (int i = 0; i < 100000; i++) {
            double x = gen.nextDouble();
            dist.add(x);
            sq.add(x);
            data[i] = x;
        }
        dist.compress();
        Arrays.sort(data);

        // sq holds 1001 evenly spaced quantile markers; q*1000 + 0.5 picks the nearest one
        List<Double> qz = sq.getQuantiles();
        for (double q : quantiles) {
            double x1 = dist.quantile(q);
            double x2 = qz.get((int) (q * 1000 + 0.5));
            double e1 = Dist.cdf(x1, data) - q;
            double e2 = Dist.cdf(x2, data) - q;
            out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n",
                    tag, compression, q, e1, e2, dist.smallByteSize(), sq.serializedSize());
        }
    }
}
 
Example #7
Source File: TDigestTest.java    From t-digest with Apache License 2.0
@Test
public void testSequentialPoints() {
    for (int i = 0; i < repeats(); i++) {
        runTest(factory(), new AbstractContinousDistribution() {
                    double base = 0;

                    @Override
                    public double nextDouble() {
                        base += Math.PI * 1e-5;
                        return base;
                    }
                }, new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999},
                "sequential", true);
    }
}
 
Example #8
Source File: TDigestTest.java    From streaminer with Apache License 2.0
@Test
public void testRepeatedValues() {
    final Random gen = RandomUtils.getRandom();

    // 5% of samples will be 0 or 1.0.  10% for each of the values 0.1 through 0.9
    AbstractContinousDistribution mix = new AbstractContinousDistribution() {
        @Override
        public double nextDouble() {
            return Math.rint(gen.nextDouble() * 10) / 10.0;
        }
    };

    TDigest dist = new TDigest(1000, gen);
    long t0 = System.nanoTime();
    List<Double> data = Lists.newArrayList();
    for (int i1 = 0; i1 < 100000; i1++) {
        double x = mix.nextDouble();
        data.add(x);
        dist.add(x);
    }

    System.out.printf("# %fus per point\n", (System.nanoTime() - t0) * 1e-3 / 100000);
    System.out.printf("# %d centroids\n", dist.centroidCount());

    // I would be happier with 5x compression, but repeated values make things kind of weird
    assertTrue("Summary is too large", dist.centroidCount() < 10 * (double) 1000);

    // all quantiles should round to nearest actual value
    for (int i = 0; i < 10; i++) {
        double z = i / 10.0;
        // we skip over troublesome points that are nearly halfway between
        for (double delta : new double[]{0.01, 0.02, 0.03, 0.07, 0.08, 0.09}) {
            double q = z + delta;
            double cdf = dist.cdf(q);
            // we also relax the tolerances for repeated values
            assertEquals(String.format("z=%.1f, q = %.3f, cdf = %.3f", z, q, cdf), z + 0.05, cdf, 0.005);

            double estimate = dist.quantile(q);
            assertEquals(String.format("z=%.1f, q = %.3f, cdf = %.3f, estimate = %.3f", z, q, cdf, estimate), Math.rint(q * 10) / 10.0, estimate, 0.001);
        }
    }
}
 
Example #9
Source File: Util.java    From t-digest with Apache License 2.0
@Override
public AbstractContinousDistribution create(Random gen) {
    return new Uniform(0, 1, gen);
}
 
Example #10
Source File: Util.java    From t-digest with Apache License 2.0
@Override
public AbstractContinousDistribution create(Random gen) {
    return new Gamma(0.1, 0.1, gen);
}
 
Example #11
Source File: AccuracyTest.java    From t-digest with Apache License 2.0
@Test
public void testAccuracyVersusCompression() throws IOException, InterruptedException {
    String head = Git.getHash(true).substring(0, 10);
    String experiment = "digest";
    new File("tests").mkdirs();
    try (PrintWriter out = new PrintWriter(String.format("tests/accuracy-%s-%s.csv", experiment, head));
         PrintWriter cdf = new PrintWriter(String.format("tests/accuracy-cdf-%s-%s.csv", experiment, head));
         PrintWriter sizes = new PrintWriter(String.format("tests/accuracy-sizes-%s-%s.csv", experiment, head))) {
        out.printf("digest, dist, sort, q.digest, q.raw, error, compression, q, x, k, clusters\n");
        cdf.printf("digest, dist, sort, x.digest, x.raw, error, compression, q, k, clusters\n");
        sizes.printf("digest, dist, sort, q.0, q.1, dk, mean, compression, count, k, clusters\n");

        ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 4);
        Collection<Callable<Integer>> tasks = new ArrayList<>();
        AtomicInteger lines = new AtomicInteger();
        long t0 = System.nanoTime();
        for (int k = 0; k < 50; k++) {
            int finalK = k;
            tasks.add(() -> {
                try {
                    for (Util.Distribution dist : Util.Distribution.values()) {
                        AbstractContinousDistribution dx = dist.create(gen);
                        // jitter the sample count so repeated runs differ slightly
                        int size = (int) (N + new Random().nextGaussian() * 1000);
                        double[] raw = new double[size];
                        for (int i = 0; i < size; i++) {
                            raw[i] = dx.nextDouble();
                        }
                        double[] sorted = Arrays.copyOf(raw, raw.length);
                        Arrays.sort(sorted);

                        for (boolean useWeightLimit : new boolean[]{true, false}) {
                            for (ScaleFunction scale : ScaleFunction.values()) {
                                // skip scale functions that are not interesting for this comparison
                                if (scale.toString().contains("_NO_NORM") || scale.toString().equals("K_0")
                                        || scale.toString().contains("FAST") || scale.toString().contains("kSize")) {
                                    continue;
                                }
                                for (double compression : new double[]{50, 100, 200, 500, 1000}) {
                                    for (Util.Factory factory : Collections.singleton(Util.Factory.MERGE)) {
                                        TDigest digest = factory.create(compression);
                                        MergingDigest.useWeightLimit = useWeightLimit;
                                        digest.setScaleFunction(scale);
                                        for (double datum : raw) {
                                            digest.add(datum);
                                        }
                                        digest.compress();
                                        evaluate(finalK, out, sizes, cdf, dist, "unsorted", sorted, compression, digest);
                                    }
                                }
                            }
                        }
                    }
                } catch (Throwable e) {
                    e.printStackTrace();
                }
                int count = lines.incrementAndGet();
                long t = System.nanoTime();
                double duration = (t - t0) * 1e-9;
                System.out.printf("%d, %d, %.2f, %.3f\n", finalK, count, duration, count / duration);
                return finalK;
            });
        }
        pool.invokeAll(tasks);
    }
}
 
Example #12
Source File: AccuracyTest.java    From t-digest with Apache License 2.0
/**
 * Prints the actual samples that went into a few clusters near the tails and near the median.
 * <p>
 * This is important for testing how close to ideal a real-world t-digest might be. In particular,
 * it lets us visualize how clusters are shaped in sample space to look for smear or skew.
 * <p>
 * The accuracy.r script produces a visualization of the data produced by this test.
 *
 * @throws FileNotFoundException If the output file can't be opened.
 * @throws InterruptedException  If threads are interrupted (we don't ever expect that to happen).
 */
@Test
public void testBucketFill() throws FileNotFoundException, InterruptedException {
    ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 2);
    Collection<Callable<Integer>> tasks = new ArrayList<>();
    AtomicInteger lines = new AtomicInteger();
    long t0 = System.nanoTime();

    PrintWriter samples = new PrintWriter("accuracy-samples.csv");
    samples.printf("digest, dist, sort, compression, k, centroid, centroid.down, i, x, mean, q0, q1\n");
    for (int k = 0; k < 20; k++) {
        int finalK = k;
        tasks.add(() -> {
            for (double compression : new double[]{100}) {
                for (Util.Distribution dist : Util.Distribution.values()) {
                    AbstractContinousDistribution dx = dist.create(gen);
                    double[] raw = new double[N];
                    for (int i = 0; i < N; i++) {
                        raw[i] = dx.nextDouble();
                    }
                    double[] sorted = Arrays.copyOf(raw, raw.length);
                    Arrays.sort(sorted);
                    for (ScaleFunction scale : new ScaleFunction[]{ScaleFunction.K_2, ScaleFunction.K_3}) {
                        // record all samples so evaluate2 can inspect cluster contents
                        MergingDigest digest = new MergingDigest(compression);
                        digest.recordAllData();
                        digest.setScaleFunction(scale);

                        evaluate2(finalK, dist, samples, raw, compression, digest);
                    }
                }
            }
            int count = lines.incrementAndGet();
            long t = System.nanoTime();
            double duration = (t - t0) * 1e-9;
            System.out.printf("%d, %d, %.2f, %.3f\n", finalK, count, duration, count / duration);
            return finalK;
        });
    }
    pool.invokeAll(tasks);
    samples.close();
}
 
Example #13
Source File: Util.java    From t-digest with Apache License 2.0
public abstract AbstractContinousDistribution create(Random gen);
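
Examples #9 and #10 are overrides of this abstract factory method. The rest of Util.java is not shown on this page, but given the calls Util.Distribution.values() and dist.create(gen) in Examples #11 and #12, the overrides plausibly belong to constants of a Distribution enum. A sketch of that arrangement follows; everything beyond the two quoted create bodies is an assumption, not the confirmed layout of the t-digest source.

import java.util.Random;

import org.apache.mahout.math.jet.random.AbstractContinousDistribution;
import org.apache.mahout.math.jet.random.Gamma;
import org.apache.mahout.math.jet.random.Uniform;

public class Util {
    // assumed layout: one enum constant per test distribution
    public enum Distribution {
        UNIFORM {
            @Override
            public AbstractContinousDistribution create(Random gen) {
                return new Uniform(0, 1, gen);
            }
        },
        GAMMA {
            @Override
            public AbstractContinousDistribution create(Random gen) {
                return new Gamma(0.1, 0.1, gen);
            }
        };

        public abstract AbstractContinousDistribution create(Random gen);
    }
}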