com.tdunning.math.stats.MergingDigest Java Examples

The following examples show how to use com.tdunning.math.stats.MergingDigest. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MergeBench.java    From t-digest with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares the benchmark state: fills {@code data} with generated samples and builds a
 * {@code MergingDigest} that has already absorbed a warm-up batch of values.
 */
@Setup
public void setup() {
    data = new double[10000000];
    int filled = 0;
    while (filled < data.length) {
        data[filled++] = gen.nextDouble();
    }

    td = new MergingDigest(compression, (factor + 1) * compression, compression);
    td.setScaleFunction(ScaleFunction.valueOf(scaleFunction));

    // Adds into a nearly empty digest are cheap and not what the benchmark is after.
    // Per the original author's note, a summary is expected to hold roughly
    // 0.6 * compression centroids, so 5 * compression * (factor + 1) values are pushed
    // through first to reach the steady state where the summary is full.
    int warmedUp = 0;
    while (warmedUp < 5 * compression * (factor + 1)) {
        td.add(gen.nextDouble());
        warmedUp++;
    }
}
 
Example #2
Source File: TDigestQuantilesTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips {@code tDigest} through a {@code MergingDigestCoder} and reports whether the
 * decoded sketch matches the original.
 *
 * <p>The two sketches are compared centroid by centroid. According to the original author's
 * note, means are doubles but are encoded as floats, so decoding introduces a small
 * approximation; means are therefore compared after narrowing both sides to {@code float}.
 * Counts must match exactly, and both sketches must hold the same number of centroids.
 *
 * @param tDigest the sketch to encode and decode
 * @return {@code true} if every centroid pair matches and neither sketch has extra centroids
 * @throws IOException if cloning through the coder fails
 */
private <T> boolean encodeDecodeEquals(MergingDigest tDigest) throws IOException {
  MergingDigest decoded = CoderUtils.clone(new MergingDigestCoder(), tDigest);

  Iterator<Centroid> decodedCentroids = decoded.centroids().iterator();
  Iterator<Centroid> originalCentroids = tDigest.centroids().iterator();

  // Advance both iterators in lock-step so neither can be read past its end.
  while (decodedCentroids.hasNext() && originalCentroids.hasNext()) {
    Centroid c1 = decodedCentroids.next();
    Centroid c2 = originalCentroids.next();
    if ((float) c1.mean() != (float) c2.mean() || c1.count() != c2.count()) {
      return false;
    }
  }

  // A centroid left over on either side means the sketches differ in length.
  return !decodedCentroids.hasNext() && !originalCentroids.hasNext();
}
 
Example #3
Source File: ComparisonTest.java    From t-digest with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one CSV row per (compression, quantile) pair comparing the quantile error of a
 * {@code MergingDigest} with that of a {@code QDigest} fed the same samples.
 *
 * @param out   destination for the CSV rows
 * @param gen   source of the sample values
 * @param tag   label written in the first column of every row
 * @param scale factor used to map each sample to the long value offered to the QDigest,
 *              and to map QDigest quantiles back
 */
private void compareQD(PrintWriter out, AbstractContinousDistribution gen, String tag, long scale) {
    final double[] compressions = {10, 20, 50, 100, 200, 500, 1000, 2000};
    final double[] quantiles = {1e-5, 1e-4, 0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999, 0.9999, 0.99999};

    for (double compression : compressions) {
        QDigest qd = new QDigest(compression);
        TDigest dist = new MergingDigest(compression);

        // Feed the same sample to both sketches and keep a raw copy for the reference CDF.
        double[] data = new double[100000];
        for (int i = 0; i < data.length; i++) {
            double sample = gen.nextDouble();
            dist.add(sample);
            qd.offer((long) (sample * scale));
            data[i] = sample;
        }
        dist.compress();
        Arrays.sort(data);

        for (double q : quantiles) {
            double tdEstimate = dist.quantile(q);
            double qdEstimate = (double) qd.getQuantile(q) / scale;
            // Error is measured in quantile space: CDF of the estimate minus the target q.
            double tdError = Dist.cdf(tdEstimate, data) - q;
            double qdError = Dist.cdf(qdEstimate, data) - q;
            out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n",
                    tag, compression, q, tdError, qdError,
                    dist.smallByteSize(), QDigest.serialize(qd).length);
        }
    }
}
 
Example #4
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Serializes {@code value} into a byte array and writes it to {@code outStream} through
 * {@code BYTE_ARRAY_CODER}.
 *
 * @param value     the sketch to encode; must not be {@code null}
 * @param outStream the stream receiving the encoded bytes
 * @throws CoderException if {@code value} is {@code null}
 * @throws IOException    declared for failures while encoding
 */
@Override
public void encode(MergingDigest value, OutputStream outStream) throws IOException {
  if (value == null) {
    throw new CoderException("cannot encode a null T-Digest sketch");
  }
  // The buffer is sized from byteSize() and its backing array is what gets encoded.
  final ByteBuffer serialized = ByteBuffer.allocate(value.byteSize());
  value.asBytes(serialized);
  final byte[] payload = serialized.array();
  BYTE_ARRAY_CODER.encode(payload, outStream);
}
 
Example #5
Source File: TDigestNumericHistogram.java    From geowave with Apache License 2.0 5 votes vote down vote up
/**
 * Restores this histogram from {@code buffer}: the digest is read first, then every byte
 * still remaining in the buffer is decoded as the count.
 *
 * @param buffer the buffer holding the serialized digest followed by the encoded count
 */
@Override
public void fromBinary(final ByteBuffer buffer) {
  tdigest = MergingDigest.fromBytes(buffer);
  // Whatever is left after the digest has been read is handed to the count decoder.
  final int trailingLength = buffer.remaining();
  final byte[] encodedCount = new byte[trailingLength];
  buffer.get(encodedCount);
  count = ByteArrayUtils.variableLengthDecode(encodedCount);
}
 
Example #6
Source File: TDigestQuantilesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Fills three digests with 1000 pseudo-random samples each, merges them through
 * {@code TDigestQuantilesFn.mergeAccumulators}, and verifies that the merged digest accounts
 * for every sample that was added.
 */
@Test
public void testMergeAccum() {
  final int accumulatorCount = 3;
  final int samplesPerAccumulator = 1000;

  // Fixed seed keeps the generated samples reproducible between runs.
  Random rd = new Random(1234);
  List<MergingDigest> accums = new ArrayList<>();
  for (int i = 0; i < accumulatorCount; i++) {
    MergingDigest std = new MergingDigest(100);
    for (int j = 0; j < samplesPerAccumulator; j++) {
      std.add(rd.nextDouble());
    }
    accums.add(std);
  }
  TDigestQuantilesFn fn = TDigestQuantilesFn.create(100);
  MergingDigest res = fn.mergeAccumulators(accums);

  // Merging must not drop samples: the merged size is the sum of the input sizes.
  long expectedSize = (long) accumulatorCount * samplesPerAccumulator;
  if (res.size() != expectedSize) {
    throw new AssertionError(
        "merged digest holds " + res.size() + " samples, expected " + expectedSize);
  }
}
 
Example #7
Source File: TDigestQuantilesTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a small digest from ten values (2.4, 3.4, ..., 11.4) and checks that it survives an
 * encode/decode round trip.
 */
@Test
public void testCoder() throws Exception {
  MergingDigest tDigest = new MergingDigest(1000);
  int offset = 0;
  while (offset < 10) {
    tDigest.add(2.4 + offset);
    offset++;
  }

  boolean survivedRoundTrip = encodeDecodeEquals(tDigest);
  Assert.assertTrue("Encode and Decode", survivedRoundTrip);
}
 
Example #8
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reports the encoded size of {@code value} as the digest's own {@code byteSize()}.
 *
 * @param value the sketch to measure; must not be {@code null}
 * @return {@code value.byteSize()}
 * @throws CoderException if {@code value} is {@code null}
 * @throws IOException    declared by the overridden method
 */
@Override
protected long getEncodedElementByteSize(MergingDigest value) throws IOException {
  if (value != null) {
    return value.byteSize();
  }
  throw new CoderException("cannot encode a null T-Digest sketch");
}
 
Example #9
Source File: ComparisonTest.java    From t-digest with Apache License 2.0 5 votes vote down vote up
/**
 * Writes one CSV row per (compression, quantile) pair comparing the quantile error of a
 * {@code MergingDigest} with that of a {@code QuantileEstimator} fed the same samples.
 *
 * @param out destination for the CSV rows
 * @param gen source of the sample values
 * @param tag label written in the first column of every row
 */
private void compareSQ(PrintWriter out, AbstractContinousDistribution gen, String tag) {
    final double[] compressions = {10, 20, 50, 100, 200, 500, 1000, 2000};
    final double[] quantiles = {0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.99, 0.999};

    for (double compression : compressions) {
        QuantileEstimator sq = new QuantileEstimator(1001);
        TDigest dist = new MergingDigest(compression);

        // Feed the same sample to both sketches and keep a raw copy for the reference CDF.
        double[] data = new double[100000];
        for (int i = 0; i < data.length; i++) {
            double sample = gen.nextDouble();
            dist.add(sample);
            sq.add(sample);
            data[i] = sample;
        }
        dist.compress();
        Arrays.sort(data);

        List<Double> estimatorQuantiles = sq.getQuantiles();
        for (double q : quantiles) {
            // Round q * 1000 to the nearest index into the estimator's quantile list.
            int slot = (int) (q * 1000 + 0.5);
            double tdEstimate = dist.quantile(q);
            double sqEstimate = estimatorQuantiles.get(slot);
            // Error is measured in quantile space: CDF of the estimate minus the target q.
            double tdError = Dist.cdf(tdEstimate, data) - q;
            double sqError = Dist.cdf(sqEstimate, data) - q;
            out.printf("%s,%.0f,%.8f,%.10g,%.10g,%d,%d\n",
                    tag, compression, q, tdError, sqError,
                    dist.smallByteSize(), sq.serializedSize());
        }
    }
}
 
Example #10
Source File: BinFill.java    From t-digest with Apache License 2.0 5 votes vote down vote up
/**
 * Prints, as CSV on standard output, the centroids produced by three digests (one per scale
 * function K_1, K_2, K_3) for a fixed 20-value sample, at compression 5 and then 10.
 * A line starting with {@code >} precedes each digest's centroids and shows the digest's
 * 0.65 quantile next to the value computed by {@code Dist.quantile} on the raw data.
 */
@Test
public void sampleFill() {
    System.out.printf("scale,delta,centroid,mean,count\n");
    final ScaleFunction[] scales = {ScaleFunction.K_1, ScaleFunction.K_2, ScaleFunction.K_3};

    for (double delta : new double[]{5, 10}) {
        double[] data = {0, 0, 3, 4, 1, 6, 0, 5, 2, 0, 3, 3, 2, 3, 0, 2, 5, 0, 3, 1};

        // One digest per scale function, all sharing the same compression.
        MergingDigest[] digests = new MergingDigest[scales.length];
        for (int s = 0; s < scales.length; s++) {
            digests[s] = new MergingDigest(delta);
            digests[s].setScaleFunction(scales[s]);
        }
        for (double x : data) {
            for (MergingDigest digest : digests) {
                digest.add(x);
            }
        }

        for (int s = 0; s < digests.length; s++) {
            MergingDigest t = digests[s];
            // Digests are labelled 1..3 in the output.
            int label = s + 1;
            System.out.printf("> %d, %.0f, %.5f, %.5f\n", label, delta, t.quantile(0.65), Dist.quantile(0.65, data));
            int position = 0;
            for (Centroid centroid : t.centroids()) {
                System.out.printf("%d,%.0f,%d,%.5f,%d\n", label, delta, position, centroid.mean(), centroid.count());
                position++;
            }
        }
    }
}
 
Example #11
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Folds every accumulator into the first one and returns it.
 *
 * <p>The first accumulator is modified in place rather than copied. The iterable must yield
 * at least one element; otherwise the initial {@code next()} call fails.
 *
 * @param accumulators the digests to merge
 * @return the first accumulator, after all the others have been added to it
 */
@Override
public MergingDigest mergeAccumulators(Iterable<MergingDigest> accumulators) {
  final Iterator<MergingDigest> pending = accumulators.iterator();
  final MergingDigest target = pending.next();
  while (pending.hasNext()) {
    final MergingDigest other = pending.next();
    target.add(other);
  }
  return target;
}
 
Example #12
Source File: Util.java    From t-digest with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@code MergingDigest} with the given compression and buffer size, with both
 * alternating sort and two-level compression switched off.
 *
 * @param compression compression parameter passed to the digest
 * @param bufferSize  buffer size passed to the digest
 * @return the configured digest
 */
TDigest create(double compression, int bufferSize) {
    MergingDigest merging = new MergingDigest(compression, bufferSize);
    merging.useAlternatingSort = false;
    merging.useTwoLevelCompression = false;
    return merging;
}
 
Example #13
Source File: BinFill.java    From t-digest with Apache License 2.0 4 votes vote down vote up
/**
 * Writes {@code bin-fill.csv}: for every scale function whose name does not contain
 * {@code NO_NORM}, every digest factory and every distribution, ten digests are filled with
 * {@code N} samples and one CSV row is emitted per resulting centroid.
 *
 * @param args ignored
 * @throws FileNotFoundException if the output file cannot be opened
 */
public static void main(String[] args) throws FileNotFoundException {
    try (PrintWriter out = new PrintWriter("bin-fill.csv")) {
        out.printf("iteration,dist,algo,scale,q,x,k0,k1,dk,q0,q1,count,max0,max1\n");

        for (ScaleFunction scale : ScaleFunction.values()) {
            // The non-normalized scale functions are left out of the sweep.
            if (scale.toString().contains("NO_NORM")) {
                continue;
            }
            System.out.printf("%s\n", scale);

            // Every kind of digest, against every kind of value distribution.
            for (Util.Factory factory : Util.Factory.values()) {
                for (Util.Distribution distribution : Util.Distribution.values()) {
                    AbstractDistribution gen = distribution.create(new Random());
                    // Repeat the fill several times with the same generator.
                    for (int pass = 0; pass < 10; pass++) {
                        TDigest digest = factory.create();
                        if (digest instanceof MergingDigest) {
                            // Original author's note: the scale function can currently only be
                            // set on a merging digest; TreeDigest support was planned.
                            digest.setScaleFunction(scale);
                        }
                        int added = 0;
                        while (added < N) {
                            digest.add(gen.nextDouble());
                            added++;
                        }

                        // Walk the centroids, tracking the quantile and scale value at the
                        // lower edge of the current centroid.
                        double qLow = 0;
                        double kLow = 0;
                        for (Centroid centroid : digest.centroids()) {
                            double qHigh = qLow + (double) centroid.count() / N;
                            double kHigh = scale.k(qHigh, digest.compression(), digest.size());
                            double qMid = (qLow + qHigh) / 2;
                            double kStep = kHigh - kLow;
                            double maxLow = digest.size() * scale.max(qLow, digest.compression(), digest.size());
                            double maxHigh = digest.size() * scale.max(qHigh, digest.compression(), digest.size());
                            out.printf("%d,%s,%s,%s,%.7f,%.7f,%.7f,%.7f,%.7f,%.7f,%.7f,%d,%.1f,%.1f\n",
                                    pass, distribution, factory, scale, qMid, centroid.mean(),
                                    kLow, kHigh, kStep, qLow, qHigh, centroid.count(),
                                    maxLow, maxHigh
                            );
                            qLow = qHigh;
                            kLow = kHigh;
                        }
                    }
                }
            }
        }
    }
}
 
Example #14
Source File: Util.java    From t-digest with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@code MergingDigest} whose buffer size is ten times the compression (truncated
 * to an int), with both alternating sort and two-level compression switched on.
 *
 * @param compression compression parameter passed to the digest
 * @return the configured digest
 */
TDigest create(double compression) {
    final int bufferSize = (int) (10 * compression);
    MergingDigest merging = new MergingDigest(compression, bufferSize);
    merging.useAlternatingSort = true;
    merging.useTwoLevelCompression = true;
    return merging;
}
 
Example #15
Source File: Util.java    From t-digest with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@code MergingDigest} with the given compression and buffer size, with both
 * alternating sort and two-level compression switched on.
 *
 * @param compression compression parameter passed to the digest
 * @param bufferSize  buffer size passed to the digest
 * @return the configured digest
 */
TDigest create(double compression, int bufferSize) {
    MergingDigest merging = new MergingDigest(compression, bufferSize);
    merging.useAlternatingSort = true;
    merging.useTwoLevelCompression = true;
    return merging;
}
 
Example #16
Source File: Util.java    From t-digest with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@code MergingDigest} whose buffer size is ten times the compression (truncated
 * to an int), with both alternating sort and two-level compression switched off.
 *
 * @param compression compression parameter passed to the digest
 * @return the configured digest
 */
TDigest create(double compression) {
    final int bufferSize = (int) (10 * compression);
    MergingDigest merging = new MergingDigest(compression, bufferSize);
    merging.useAlternatingSort = false;
    merging.useTwoLevelCompression = false;
    return merging;
}
 
Example #17
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Combines the whole input collection into a single {@code MergingDigest} using a
 * {@code TDigestQuantilesFn} created from this transform's compression.
 *
 * @param input the values to summarize
 * @return a collection holding the resulting digest
 */
@Override
public PCollection<MergingDigest> expand(PCollection<Double> input) {
  TDigestQuantilesFn digestFn = TDigestQuantilesFn.create(this.compression());
  return input.apply("Compute T-Digest Structure", Combine.globally(digestFn));
}
 
Example #18
Source File: TDigestBench.java    From t-digest with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@code MergingDigest} whose buffer size is ten times the compression, truncated
 * to an int.
 *
 * @param compression compression parameter passed to the digest
 * @return the new digest
 */
@Override
TDigest create(double compression) {
    final int bufferSize = (int) (10 * compression);
    return new MergingDigest(compression, bufferSize);
}
 
Example #19
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Reports that registering a byte-size observer for a digest is cheap.
 *
 * @param value the digest that would be observed; not inspected by this method
 * @return always {@code true}
 */
@Override
public boolean isRegisterByteSizeObserverCheap(MergingDigest value) {
  return true;
}
 
Example #20
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Reads a byte array from {@code inStream} through {@code BYTE_ARRAY_CODER} and rebuilds the
 * digest from it.
 *
 * @param inStream the stream to read the encoded digest from
 * @return the digest reconstructed by {@code MergingDigest.fromBytes}
 * @throws IOException declared for failures while decoding
 */
@Override
public MergingDigest decode(InputStream inStream) throws IOException {
  return MergingDigest.fromBytes(ByteBuffer.wrap(BYTE_ARRAY_CODER.decode(inStream)));
}
 
Example #21
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Supplies the coder used for the output digest.
 *
 * @param registry   not consulted by this implementation
 * @param inputCoder not consulted by this implementation
 * @return a new {@code MergingDigestCoder}
 */
@Override
public Coder<MergingDigest> getDefaultOutputCoder(CoderRegistry registry, Coder inputCoder) {
  return new MergingDigestCoder();
}
 
Example #22
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Supplies the coder used for the digest accumulator.
 *
 * @param registry   not consulted by this implementation
 * @param inputCoder not consulted by this implementation
 * @return a new {@code MergingDigestCoder}
 */
@Override
public Coder<MergingDigest> getAccumulatorCoder(CoderRegistry registry, Coder inputCoder) {
  return new MergingDigestCoder();
}
 
Example #23
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Outputs the whole structure so it can be queried, reused or stored easily.
 *
 * @param accum the accumulated digest
 * @return {@code accum} itself, unchanged and not copied
 */
@Override
public MergingDigest extractOutput(MergingDigest accum) {
  return accum;
}
 
Example #24
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Adds one value to the accumulator.
 *
 * @param accum the digest to update; it is modified in place
 * @param value the value to add
 * @return the same {@code accum} instance that was passed in
 */
@Override
public MergingDigest addInput(MergingDigest accum, Double value) {
  accum.add(value);
  return accum;
}
 
Example #25
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a fresh accumulator.
 *
 * @return a new {@code MergingDigest} constructed with this object's {@code compression}
 */
@Override
public MergingDigest createAccumulator() {
  return new MergingDigest(compression);
}
 
Example #26
Source File: TDigestQuantiles.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Combines the values of each key into a {@code MergingDigest} using a
 * {@code TDigestQuantilesFn} created from this transform's compression.
 *
 * @param input the keyed values to summarize
 * @return a collection pairing each key with its digest
 */
@Override
public PCollection<KV<K, MergingDigest>> expand(PCollection<KV<K, Double>> input) {
  TDigestQuantilesFn digestFn = TDigestQuantilesFn.create(this.compression());
  return input.apply("Compute T-Digest Structure", Combine.perKey(digestFn));
}