com.clearspring.analytics.stream.cardinality.HyperLogLogPlus Java Examples

The following examples show how to use com.clearspring.analytics.stream.cardinality.HyperLogLogPlus. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FacetTableFunction.java    From datawave with Apache License 2.0 6 votes vote down vote up
/**
 * Turns one facet table entry into a document carrying a single cardinality attribute.
 *
 * <p>The key's column family and row are each split on the null character. The second token of
 * the column family is used as the document field name, and the second token of the row is set
 * as the content of the cardinality whose sketch is rebuilt from the entry's value bytes.
 *
 * @param input the key/value pair to convert
 * @return an immutable entry pairing the input key with the freshly built document
 * @throws RuntimeException wrapping any {@link IOException} raised inside the conversion
 */
@Override
public Entry<Key,Document> apply(Entry<Key,Value> input) {
    final Key entryKey = input.getKey();
    final Document document = new Document();
    try {
        final String[] familyTokens = StringUtils.split(entryKey.getColumnFamily().toString(), "\u0000");
        final String[] rowTokens = StringUtils.split(entryKey.getRow().toString(), "\u0000");
        
        // Rebuild the sketch from the serialized bytes held in the entry's value.
        final FieldValueCardinality valueCardinality = new FieldValueCardinality(HyperLogLogPlus.Builder.build(input.getValue().get()));
        valueCardinality.setContent(rowTokens[1]);
        
        document.put(familyTokens[1], new Cardinality(valueCardinality, entryKey, false));
        
        return Maps.immutableEntry(entryKey, document);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
 
Example #2
Source File: StringQuality.java    From DataVec with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a {@code StringQuality} whose cardinality sketch precision is derived from a target
 * relative standard deviation.
 *
 * <p>Delegates to the constructor that accepts a {@link HyperLogLogPlus}, passing a new sketch
 * built with {@code p = ceil(2 * log2(1.054 / relativeSD))} and a second argument of {@code 0}.
 * No validation of {@code relativeSD} is performed here.
 *
 * @param countValid forwarded unchanged to the delegate constructor
 * @param countInvalid forwarded unchanged to the delegate constructor
 * @param countMissing forwarded unchanged to the delegate constructor
 * @param countTotal forwarded unchanged to the delegate constructor
 * @param countEmptyString forwarded unchanged to the delegate constructor
 * @param countAlphabetic forwarded unchanged to the delegate constructor
 * @param countNumerical forwarded unchanged to the delegate constructor
 * @param countWordCharacter forwarded unchanged to the delegate constructor
 * @param countWhitespace forwarded unchanged to the delegate constructor
 * @param relativeSD desired relative standard deviation, used only to compute the precision p
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal, long countEmptyString,
                long countAlphabetic, long countNumerical, long countWordCharacter, long countWhitespace,
                double relativeSD) {
    /*
     * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
     * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
     * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
     *
     * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting
     * a nonzero `sp > p` in HyperLogLogPlus(p, sp) would trigger sparse
     * representation of registers, which may reduce the memory consumption
     * and increase accuracy when the cardinality is small.
     */
    // Solving relativeSD = 1.054 / sqrt(2^p) for p gives p = 2 * log2(1.054 / relativeSD); round up to an int.
    this(countValid, countInvalid, countMissing, countTotal, countEmptyString, countAlphabetic, countNumerical,
                    countWordCharacter, countWhitespace,
                    new HyperLogLogPlus((int) Math.ceil(2.0 * Math.log(1.054 / relativeSD) / Math.log(2)), 0));
}
 
Example #3
Source File: StatsHyperLogSummaryTest.java    From datawave with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@link HyperLogLogPlus} filled with random alphabetic strings.
 *
 * <p>A random number of distinct strings is chosen and stored in {@code this.uniqueCount}; each
 * distinct string is offered once, then a random number of already-offered strings is offered
 * again as duplicates.
 *
 * @return the populated sketch
 */
private HyperLogLogPlus createHyperLog() {
    final Set<String> distinct = new HashSet<>();
    final HyperLogLogPlus sketch = new HyperLogLogPlus(StatsJob.HYPERLOG_NORMAL_DEFAULT_VALUE, StatsJob.HYPERLOG_SPARSE_DEFAULT_VALUE);
    this.uniqueCount = rVal.nextInt(MAX_UNIQUE_VALUES - MIN_UNIQUE_VALUES) + MIN_UNIQUE_VALUES;
    
    // keep drawing until enough distinct strings were accepted; collisions are simply retried
    int accepted = 0;
    while (accepted < this.uniqueCount) {
        final String candidate = RandomStringUtils.randomAlphabetic(4 + rVal.nextInt(10));
        if (!distinct.add(candidate)) {
            continue;
        }
        sketch.offer(candidate);
        accepted++;
    }
    
    log.debug("unique strings added to hyper log(" + this.uniqueCount + ")");
    
    // re-offer randomly chosen known strings as duplicates
    final List<String> pool = new ArrayList<>(distinct);
    final int duplicateCount = rVal.nextInt(MAX_DUP_VALUES - MIN_DUP_VALUES) + MIN_DUP_VALUES;
    for (int remaining = duplicateCount; remaining > 0; remaining--) {
        sketch.offer(pool.get(rVal.nextInt(pool.size())));
    }
    
    return sketch;
}
 
Example #4
Source File: StatsHyperLogSummaryTest.java    From datawave with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips ten randomly populated summaries through their byte representation and checks that
 * the restored summary equals the original, including counts and sketch cardinality.
 */
@Test
public void testSerialize() throws IOException {
    for (int round = 0; round < 10; round++) {
        final StatsHyperLogSummary before = new StatsHyperLogSummary(round, createHyperLog(), this.uniqueCount);
        final StatsHyperLogSummary after = new StatsHyperLogSummary(new Value(before.toByteArray()));
        
        log.debug("before(" + before + ")");
        log.debug("after(" + after + ")");
        
        // the summaries themselves must be indistinguishable
        Assert.assertEquals(before, after);
        Assert.assertEquals(0, before.compareTo(after));
        Assert.assertEquals(before.getCount(), after.getCount());
        
        // and so must the sketches they carry
        final HyperLogLogPlus originalSketch = before.getHyperLogPlus();
        final HyperLogLogPlus restoredSketch = after.getHyperLogPlus();
        
        Assert.assertEquals(originalSketch.cardinality(), restoredSketch.cardinality());
        // may not be true for large sample set but for small sample it is correct
        Assert.assertEquals(this.uniqueCount, restoredSketch.cardinality());
        Assert.assertEquals(this.uniqueCount, after.getUniqueCount());
        Assert.assertEquals(this.uniqueCount, before.getUniqueCount());
    }
}
 
Example #5
Source File: CardinalityScanner.java    From datawave with Apache License 2.0 6 votes vote down vote up
/**
 * Records one field/value sketch against the running intersection totals.
 *
 * <p>Nothing happens when {@code fieldName} equals {@code tuple.getValue0()}. Otherwise the
 * summed cardinality and the merged sketch for the pair are updated: the first time a pair is
 * seen they are seeded from {@code baseHllp}, afterwards from the previously stored values.
 *
 * @param fieldName name of the field being paired
 * @param fieldValue value of the field being paired
 * @param hllp sketch for the field/value; it is copied via its bytes before merging
 * @throws Exception if the sketch cannot be copied or merged
 */
public void addPair(String fieldName, String fieldValue, HyperLogLogPlus hllp) throws Exception {
    if (fieldName.equals(tuple.getValue0())) {
        return;
    }
    final Pair<String,String> pairKey = new Pair<>(fieldName, fieldValue);
    final Long storedSum = intersectionSum.get(pairKey);
    final HyperLogLogPlus storedUnion = intersectionUnion.get(pairKey);
    final HyperLogLogPlus merged = HyperLogLogPlus.Builder.build(hllp.getBytes());
    
    // a missing sum means the pair is new, so the base sketch provides the starting point
    final boolean firstOccurrence = (storedSum == null);
    final long startingSum = firstOccurrence ? baseHllp.cardinality() : storedSum;
    intersectionSum.put(pairKey, startingSum + hllp.cardinality());
    
    merged.addAll(firstOccurrence ? baseHllp : storedUnion);
    intersectionUnion.put(pairKey, merged);
}
 
Example #6
Source File: StringQuality.java    From deeplearning4j with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a {@code StringQuality} whose cardinality sketch precision is derived from a target
 * relative standard deviation.
 *
 * <p>Delegates to the constructor that accepts a {@link HyperLogLogPlus}, passing a new sketch
 * built with {@code p = ceil(2 * log2(1.054 / relativeSD))} and a second argument of {@code 0}.
 * No validation of {@code relativeSD} is performed here.
 *
 * @param countValid forwarded unchanged to the delegate constructor
 * @param countInvalid forwarded unchanged to the delegate constructor
 * @param countMissing forwarded unchanged to the delegate constructor
 * @param countTotal forwarded unchanged to the delegate constructor
 * @param countEmptyString forwarded unchanged to the delegate constructor
 * @param countAlphabetic forwarded unchanged to the delegate constructor
 * @param countNumerical forwarded unchanged to the delegate constructor
 * @param countWordCharacter forwarded unchanged to the delegate constructor
 * @param countWhitespace forwarded unchanged to the delegate constructor
 * @param relativeSD desired relative standard deviation, used only to compute the precision p
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal, long countEmptyString,
                long countAlphabetic, long countNumerical, long countWordCharacter, long countWhitespace,
                double relativeSD) {
    /*
     * The algorithm used is based on streamlib's implementation of "HyperLogLog in Practice:
     * Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm", available
     * <a href="http://dx.doi.org/10.1145/2452376.2452456">here</a>.
     *
     * The relative accuracy is approximately `1.054 / sqrt(2^p)`. Setting
     * a nonzero `sp > p` in HyperLogLogPlus(p, sp) would trigger sparse
     * representation of registers, which may reduce the memory consumption
     * and increase accuracy when the cardinality is small.
     */
    // Solving relativeSD = 1.054 / sqrt(2^p) for p gives p = 2 * log2(1.054 / relativeSD); round up to an int.
    this(countValid, countInvalid, countMissing, countTotal, countEmptyString, countAlphabetic, countNumerical,
                    countWordCharacter, countWhitespace,
                    new HyperLogLogPlus((int) Math.ceil(2.0 * Math.log(1.054 / relativeSD) / Math.log(2)), 0));
}
 
Example #7
Source File: ApproximateDistinctTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a sketch holding the integers 0..9 survives a decode/encode round trip through
 * {@code ApproximateDistinct.HyperLogLogPlusCoder}.
 */
@Test
public void testCoder() throws Exception {
  final HyperLogLogPlus sketch = new HyperLogLogPlus(12, 18);
  int value = 0;
  while (value < 10) {
    sketch.offer(value);
    value++;
  }
  CoderProperties.coderDecodeEncodeEqual(ApproximateDistinct.HyperLogLogPlusCoder.of(), sketch);
}
 
Example #8
Source File: StringQualityAddFunction.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Folds one more {@link Writable} into a running {@link StringQuality} summary.
 *
 * <p>The total count always grows by one. The writable is classified as missing (it is a
 * {@code NullWritable}), valid or invalid according to {@code meta.isValid}, and its string form
 * is classified as empty, alphabetic, numerical, word-character or whitespace-only. The string
 * is also offered to the sketch obtained from {@code v1.getHll()}, and that same sketch object
 * is handed to the returned summary.
 *
 * @param v1 the summary accumulated so far
 * @param writable the value to add
 * @return a new summary containing the updated counters
 */
@Override
public StringQuality apply(StringQuality v1, Writable writable) {
    long valid = v1.getCountValid();
    long invalid = v1.getCountInvalid();
    long countMissing = v1.getCountMissing();
    long countTotal = v1.getCountTotal() + 1;
    long empty = v1.getCountEmptyString();
    long alphabetic = v1.getCountAlphabetic();
    long numerical = v1.getCountNumerical();
    long word = v1.getCountWordCharacter();
    long whitespaceOnly = v1.getCountWhitespace();
    HyperLogLogPlus hll = v1.getHll();

    String str = writable.toString();

    if (writable instanceof NullWritable) {
        countMissing++;
    } else if (meta.isValid(writable)) {
        valid++;
    } else {
        invalid++;
    }

    if (str == null || str.isEmpty()) {
        empty++;
    } else {
        // String.matches must match the WHOLE string, so the pattern needs a quantifier:
        // without the '+' only single-letter strings were ever counted as alphabetic,
        // unlike the numerical/word/whitespace checks below.
        if (str.matches("[a-zA-Z]+")) {
            alphabetic++;
        }
        if (str.matches("\\d+")) {
            numerical++;
        }
        if (str.matches("\\w+")) {
            word++;
        }
        if (str.matches("\\s+")) {
            whitespaceOnly++;
        }
    }

    hll.offer(str);
    return new StringQuality(valid, invalid, countMissing, countTotal, empty, alphabetic, numerical, word,
                    whitespaceOnly, hll);
}
 
Example #9
Source File: ApproximateDistinct.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Adds one record to the accumulator by offering its encoded byte form to the sketch.
 *
 * @param acc the accumulator to update
 * @param record the element to add; it is encoded with {@code inputCoder}
 * @return the same accumulator instance that was passed in
 * @throws IllegalStateException if the record cannot be encoded
 */
@Override
public HyperLogLogPlus addInput(HyperLogLogPlus acc, InputT record) {
  try {
    final byte[] encodedRecord = CoderUtils.encodeToByteArray(inputCoder, record);
    acc.offer(encodedRecord);
  } catch (CoderException e) {
    throw new IllegalStateException("The input value cannot be encoded: " + e.getMessage(), e);
  }
  return acc;
}
 
Example #10
Source File: ApproximateDistinct.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Combines several accumulators into a newly created one.
 *
 * @param accumulators the sketches to merge
 * @return a fresh accumulator into which every given sketch has been added
 * @throws IllegalStateException if a sketch cannot be merged
 */
@Override
public HyperLogLogPlus mergeAccumulators(Iterable<HyperLogLogPlus> accumulators) {
  final HyperLogLogPlus combined = createAccumulator();
  try {
    for (HyperLogLogPlus partial : accumulators) {
      combined.addAll(partial);
    }
  } catch (CardinalityMergeException e) {
    // Should never happen because only HyperLogLogPlus accumulators are instantiated.
    throw new IllegalStateException(
        "The accumulators cannot be merged: " + e.getMessage(), e);
  }
  return combined;
}
 
Example #11
Source File: StringQuality.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code StringQuality} from explicit counters and an existing sketch.
 *
 * <p>The first four counters are passed to the superclass constructor; the remaining counters
 * and the sketch reference are stored on this instance as given (the sketch is not copied).
 *
 * @param countValid passed to the superclass
 * @param countInvalid passed to the superclass
 * @param countMissing passed to the superclass
 * @param countTotal passed to the superclass
 * @param countEmptyString stored as the empty-string counter
 * @param countAlphabetic stored as the alphabetic counter
 * @param countNumerical stored as the numerical counter
 * @param countWordCharacter stored as the word-character counter
 * @param countWhitespace stored as the whitespace counter
 * @param hll the sketch to keep
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal, long countEmptyString,
                long countAlphabetic, long countNumerical, long countWordCharacter, long countWhitespace,
                HyperLogLogPlus hll) {
    super(countValid, countInvalid, countMissing, countTotal);
    // sketch first, then the per-category counters
    this.hll = hll;
    this.countWhitespace = countWhitespace;
    this.countWordCharacter = countWordCharacter;
    this.countNumerical = countNumerical;
    this.countAlphabetic = countAlphabetic;
    this.countEmptyString = countEmptyString;
}
 
Example #12
Source File: ApproximateDistinct.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a sketch to the stream as a byte array encoded by {@code BYTE_ARRAY_CODER}.
 *
 * @param value the sketch to encode; must not be {@code null}
 * @param outStream the destination stream
 * @throws CoderException if {@code value} is {@code null}
 * @throws IOException if serializing or writing fails
 */
@Override
public void encode(HyperLogLogPlus value, OutputStream outStream) throws IOException {
  if (null == value) {
    throw new CoderException("cannot encode a null HyperLogLogPlus sketch");
  }
  final byte[] serializedSketch = value.getBytes();
  BYTE_ARRAY_CODER.encode(serializedSketch, outStream);
}
 
Example #13
Source File: ApproximateDistinct.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Reports the encoded size of a sketch as the value of its {@code sizeof()} method.
 *
 * @param value the sketch to measure; must not be {@code null}
 * @return {@code value.sizeof()}
 * @throws CoderException if {@code value} is {@code null}
 */
@Override
protected long getEncodedElementByteSize(HyperLogLogPlus value) throws IOException {
  if (value != null) {
    return value.sizeof();
  }
  throw new CoderException("cannot encode a null HyperLogLogPlus sketch");
}
 
Example #14
Source File: ApproximateDistinct.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code DoFn} that replaces the sketch of each keyed element with its cardinality.
 *
 * @param <K> the key type, passed through unchanged
 * @return a function emitting {@code (key, sketch.cardinality())} for every input element
 */
private static <K> DoFn<KV<K, HyperLogLogPlus>, KV<K, Long>> perKey() {
  return new DoFn<KV<K, HyperLogLogPlus>, KV<K, Long>>() {
    @ProcessElement
    public void processElement(ProcessContext c) {
      final KV<K, HyperLogLogPlus> keyedSketch = c.element();
      final K key = keyedSketch.getKey();
      final long estimate = keyedSketch.getValue().cardinality();
      c.output(KV.of(key, estimate));
    }
  };
}
 
Example #15
Source File: ApproximateDistinct.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code DoFn} that maps each sketch to its cardinality.
 *
 * @return a function emitting {@code sketch.cardinality()} for every input sketch
 */
private static DoFn<HyperLogLogPlus, Long> globally() {
  return new DoFn<HyperLogLogPlus, Long>() {
    @ProcessElement
    public void apply(ProcessContext c) {
      final HyperLogLogPlus sketch = c.element();
      final long estimate = sketch.cardinality();
      c.output(estimate);
    }
  };
}
 
Example #16
Source File: FeatureHyperLogLogStatistics.java    From geowave with Apache License 2.0 5 votes vote down vote up
/**
 * Merges another statistic's sketch into this one.
 *
 * <p>Arguments that are not {@code FeatureHyperLogLogStatistics} are ignored. Otherwise this
 * instance's sketch is replaced by the result of merging it into the other instance's sketch.
 *
 * @param mergeable the statistic to merge in
 * @throws RuntimeException if the two sketches cannot be merged
 */
@Override
public void merge(final Mergeable mergeable) {
  if (!(mergeable instanceof FeatureHyperLogLogStatistics)) {
    return;
  }
  final FeatureHyperLogLogStatistics other = (FeatureHyperLogLogStatistics) mergeable;
  try {
    loglog = (HyperLogLogPlus) other.loglog.merge(loglog);
  } catch (final CardinalityMergeException e) {
    throw new RuntimeException("Unable to merge counters", e);
  }
}
 
Example #17
Source File: CoreOutputManager.java    From ffwd with Apache License 2.0 5 votes vote down vote up
/**
 * Resets the cardinality estimate by installing a fresh HLL++ sketch once the configured swap
 * period (in milliseconds) has elapsed since the last swap.
 *
 * <p>{@code hyperLogSwapLock} is used as a try-lock so that only one caller performs the swap;
 * callers that lose the race simply return. NOTE(review): this assumes
 * {@code hyperLogSwapLock} is an {@code AtomicBoolean} - confirm against the field declaration.
 */
private void swapHyperLogLogPlus() {
    final boolean periodElapsed =
        System.currentTimeMillis() - hyperLogSwapTS.get() > hyperLogLogPlusSwapPeriodMS;
    // compareAndSet returns true only when THIS caller flipped the flag. The previous
    // compareAndExchange call returns the witness (old) value, i.e. false on success, which
    // skipped the swap for the winner and left the flag stuck at true.
    if (periodElapsed && hyperLogSwapLock.compareAndSet(false, true)) {
        try {
            hyperLog.set(new HyperLogLogPlus(
                    HYPER_LOG_LOG_PLUS_PRECISION_NORMAL, HYPER_LOG_LOG_PLUS_PRECISION_SPARSE));
            hyperLogSwapTS.set(System.currentTimeMillis());
        } finally {
            // always release, even if building or publishing the new sketch fails
            hyperLogSwapLock.set(false);
        }
    }
}
 
Example #18
Source File: StringQualityAddFunction.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Folds one more {@link Writable} into a running {@link StringQuality} summary.
 *
 * <p>The total count always grows by one. The writable is classified as missing (it is a
 * {@code NullWritable}), valid or invalid according to {@code meta.isValid}, and its string form
 * is classified as empty, alphabetic, numerical, word-character or whitespace-only. The string
 * is also offered to the sketch obtained from {@code v1.getHll()}, and that same sketch object
 * is handed to the returned summary.
 *
 * @param v1 the summary accumulated so far
 * @param writable the value to add
 * @return a new summary containing the updated counters
 * @throws Exception declared by the implemented function interface
 */
@Override
public StringQuality call(StringQuality v1, Writable writable) throws Exception {
    long valid = v1.getCountValid();
    long invalid = v1.getCountInvalid();
    long countMissing = v1.getCountMissing();
    long countTotal = v1.getCountTotal() + 1;
    long empty = v1.getCountEmptyString();
    long alphabetic = v1.getCountAlphabetic();
    long numerical = v1.getCountNumerical();
    long word = v1.getCountWordCharacter();
    long whitespaceOnly = v1.getCountWhitespace();
    HyperLogLogPlus hll = v1.getHll();

    String str = writable.toString();

    if (writable instanceof NullWritable) {
        countMissing++;
    } else if (meta.isValid(writable)) {
        valid++;
    } else {
        invalid++;
    }

    if (str == null || str.isEmpty()) {
        empty++;
    } else {
        // String.matches must match the WHOLE string, so the pattern needs a quantifier:
        // without the '+' only single-letter strings were ever counted as alphabetic,
        // unlike the numerical/word/whitespace checks below.
        if (str.matches("[a-zA-Z]+")) {
            alphabetic++;
        }
        if (str.matches("\\d+")) {
            numerical++;
        }
        if (str.matches("\\w+")) {
            word++;
        }
        if (str.matches("\\s+")) {
            whitespaceOnly++;
        }
    }

    hll.offer(str);
    return new StringQuality(valid, invalid, countMissing, countTotal, empty, alphabetic, numerical, word,
                    whitespaceOnly, hll);
}
 
Example #19
Source File: TopKSampler.java    From stratio-cassandra with Apache License 2.0 5 votes vote down vote up
/**
 * Starts recording samples unless sampling is already enabled, in which case the call is a no-op.
 *
 * @param capacity
 *            Number of sample items kept in memory. Lower values give less
 *            accurate results; a value close to the cardinality works best,
 *            at the cost of more memory.
 */
public synchronized void beginSampling(int capacity)
{
    if (enabled)
        return;

    // fresh collectors for the new sampling session; the flag is flipped last
    summary = new StreamSummary<T>(capacity);
    hll = new HyperLogLogPlus(14);
    enabled = true;
}
 
Example #20
Source File: SSTableReader.java    From stratio-cassandra with Apache License 2.0 5 votes vote down vote up
/**
 * Merges the given cardinality estimators into a single one.
 *
 * @param cardinalities the estimators to merge
 * @return the merged estimator, or an empty {@code HyperLogLogPlus(13, 25)} when merging fails
 *         (the failure is logged as a warning)
 */
private static ICardinality mergeCardinalities(Collection<ICardinality> cardinalities)
{
    final ICardinality seed = new HyperLogLogPlus(13, 25); // see MetadataCollector.cardinality
    final ICardinality[] others = cardinalities.toArray(new ICardinality[cardinalities.size()]);
    try
    {
        return seed.merge(others);
    }
    catch (CardinalityMergeException e)
    {
        logger.warn("Could not merge cardinalities", e);
        return seed;
    }
}
 
Example #21
Source File: CompactionMetadata.java    From stratio-cassandra with Apache License 2.0 5 votes vote down vote up
/**
 * Reads compaction metadata from the input.
 *
 * <p>The layout consumed is: an int ancestor count, that many int ancestors, an int byte
 * length, and then that many bytes holding a serialized {@code HyperLogLogPlus}.
 *
 * @param version accepted for the interface; not consulted by this method
 * @param in the input to read from
 * @return the metadata built from the ancestors and the rebuilt cardinality estimator
 * @throws IOException if reading or rebuilding the estimator fails
 */
public CompactionMetadata deserialize(Descriptor.Version version, DataInput in) throws IOException
{
    final int ancestorCount = in.readInt();
    final Set<Integer> ancestors = new HashSet<>(ancestorCount);
    for (int remaining = ancestorCount; remaining > 0; remaining--)
        ancestors.add(in.readInt());

    final int sketchLength = in.readInt();
    final ICardinality cardinality = HyperLogLogPlus.Builder.build(ByteBufferUtil.readBytes(in, sketchLength));
    return new CompactionMetadata(ancestors, cardinality);
}
 
Example #22
Source File: FeatureHyperLogLogStatistics.java    From geowave with Apache License 2.0 5 votes vote down vote up
/**
 * Restores this statistic's sketch from its binary form.
 *
 * <p>After the superclass has prepared the buffer, a varint length is read, followed by that
 * many bytes from which the sketch is rebuilt. If rebuilding fails the error is logged and the
 * current sketch is left as it was.
 *
 * @param bytes the serialized statistic
 */
@Override
public void fromBinary(final byte[] bytes) {
  final ByteBuffer source = super.binaryBuffer(bytes);
  // length prefix first, then the sketch payload itself
  final byte[] sketchBytes = ByteArrayUtils.safeRead(source, VarintUtils.readUnsignedInt(source));
  try {
    loglog = HyperLogLogPlus.Builder.build(sketchBytes);
  } catch (final IOException e) {
    LOGGER.error("Exception while reading statistic", e);
  }
}
 
Example #23
Source File: DistinctCountHyperLogLogAggregateFunction.java    From phoenix with Apache License 2.0 5 votes vote down vote up
/**
 * Merges the serialized sketch referenced by {@code ptr} into this aggregator's sketch.
 *
 * @param tuple not consulted by this method
 * @param ptr pointer to the bytes of a serialized {@code HyperLogLogPlus}
 * @throws RuntimeException wrapping any failure while rebuilding or merging the sketch
 */
@Override
public void aggregate(Tuple tuple, ImmutableBytesWritable ptr) {
	try {
		final byte[] serialized = ByteUtil.copyKeyBytesIfNecessary(ptr);
		hll.addAll(HyperLogLogPlus.Builder.build(serialized));
	} catch (Exception e) {
		throw new RuntimeException(e);
	}
}
 
Example #24
Source File: FeatureHyperLogLogStatistics.java    From geowave with Apache License 2.0 5 votes vote down vote up
/**
 * Creates statistics for one field, backed by a new {@link HyperLogLogPlus}.
 *
 * @param adapterId the adapter ID
 * @param fieldName the field name
 * @param precision precision handed to the {@link HyperLogLogPlus} constructor and remembered on
 *        this instance. Earlier documentation gave the range as {@code 1 <= p <= 32}; no range
 *        check is performed in this constructor.
 */
public FeatureHyperLogLogStatistics(
    final Short adapterId,
    final String fieldName,
    final int precision) {
  super(adapterId, STATS_TYPE, fieldName);
  this.precision = precision;
  this.loglog = new HyperLogLogPlus(precision);
}
 
Example #25
Source File: HyperLogFieldSummary.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the summary serialized in {@code value} to this field summary: its count is added to the
 * running count and its sketch is merged into this summary's sketch.
 *
 * @param value a serialized {@code StatsHyperLogSummary}
 * @throws IOException if the value cannot be read, or wrapping a sketch merge failure
 */
@Override
public void add(Value value) throws IOException {
    final StatsHyperLogSummary incoming = new StatsHyperLogSummary(value);
    this.count += incoming.getCount();
    try {
        this.logPlus.addAll(incoming.getHyperLogPlus());
    } catch (CardinalityMergeException e) {
        // the merge failure is not part of this method's signature, so surface it as an IOException
        throw new IOException(e);
    }
}
 
Example #26
Source File: Cardinality.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Restores this attribute from a {@link DataInput}.
 *
 * <p>Metadata is read first, then the content is replaced by a new
 * {@code FieldValueCardinality} whose lower and upper strings and sketch (stored as a compressed
 * byte array) are read in that order.
 *
 * @param in the input to read from
 * @throws IOException if reading or rebuilding the sketch fails
 */
@Override
public void readFields(DataInput in) throws IOException {
    readMetadata(in);
    this.content = new FieldValueCardinality();
    this.content.lower = WritableUtils.readString(in);
    this.content.upper = WritableUtils.readString(in);
    this.content.estimate = HyperLogLogPlus.Builder.build(WritableUtils.readCompressedByteArray(in));
}
 
Example #27
Source File: Cardinality.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Restores this attribute from a Kryo {@code Input}.
 *
 * <p>Metadata is read first, then the content is replaced by a new
 * {@code FieldValueCardinality} whose lower and upper strings are read, followed by an int byte
 * length and that many bytes from which the sketch is rebuilt.
 *
 * @param kryo the Kryo instance, passed on to the metadata reader
 * @param input the input to read from
 * @throws RuntimeException wrapping the {@link IOException} raised when the sketch cannot be rebuilt
 */
@Override
public void read(Kryo kryo, Input input) {
    super.readMetadata(kryo, input);
    content = new FieldValueCardinality();
    this.content.lower = input.readString();
    this.content.upper = input.readString();
    final int size = input.readInt();
    // Input.read(byte[]) may fill fewer bytes than requested and its return value was ignored,
    // so a short read silently produced a partly-zero array. readBytes reads exactly
    // 'size' bytes or throws.
    final byte[] cardArray = input.readBytes(size);
    try {
        this.content.estimate = HyperLogLogPlus.Builder.build(cardArray);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
 
Example #28
Source File: CardinalityScanner.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Scans the configured table between the configured begin and end dates and collects
 * cardinality intersection records.
 *
 * <p>Entries whose column family is not in {@code fields} are skipped; a {@code null} or empty
 * list accepts every field. Any exception raised while connecting or scanning is logged and the
 * records gathered so far are returned. The scanner, if created, is always closed.
 *
 * @param fields the column families to include, or {@code null}/empty for all
 * @param dateAggregationType passed through to {@code addEntry}
 * @param datatypeAggregationType passed through to {@code addEntry}
 * @return the keys of the collected record map
 * @throws Exception declared by the signature; failures inside the scan are logged instead
 */
public Set<CardinalityIntersectionRecord> scanCardinalities(List<String> fields, DateAggregationType dateAggregationType,
                DatatypeAggregationType datatypeAggregationType) throws Exception {
    
    final Map<CardinalityIntersectionRecord,HyperLogLogPlus> collected = new TreeMap<>();
    Scanner scanner = null;
    try {
        final ZooKeeperInstance zkInstance = new ZooKeeperInstance(config.getInstanceName(), config.getZookeepers());
        final Connector conn = zkInstance.getConnector(config.getUsername(), new PasswordToken(config.getPassword()));
        final Collection<Authorizations> auths = Collections.singleton(new Authorizations(config.getAuths().split(",")));
        if (!conn.tableOperations().exists(config.getTableName())) {
            throw new IllegalArgumentException("Table " + config.getTableName() + " does not exist");
        }
        scanner = ScannerHelper.createScanner(conn, config.getTableName(), auths);
        scanner.setRange(new Range(config.getBeginDate(), config.getEndDate() + "\0"));
        
        for (Iterator<Map.Entry<Key,Value>> cursor = scanner.iterator(); cursor.hasNext();) {
            final Map.Entry<Key,Value> current = cursor.next();
            final String family = current.getKey().getColumnFamily().toString();
            // no filter given, or the field is explicitly listed
            final boolean wanted = fields == null || fields.isEmpty() || fields.contains(family);
            if (wanted) {
                addEntry(collected, current, dateAggregationType, datatypeAggregationType);
            }
        }
    } catch (Exception e) {
        log.error(e);
    } finally {
        if (scanner != null) {
            scanner.close();
        }
    }
    return collected.keySet();
}
 
Example #29
Source File: StringQuality.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code StringQuality} from explicit counters and an existing sketch.
 *
 * <p>The first four counters are passed to the superclass constructor; the remaining counters
 * and the sketch reference are stored on this instance as given (the sketch is not copied).
 *
 * @param countValid passed to the superclass
 * @param countInvalid passed to the superclass
 * @param countMissing passed to the superclass
 * @param countTotal passed to the superclass
 * @param countEmptyString stored as the empty-string counter
 * @param countAlphabetic stored as the alphabetic counter
 * @param countNumerical stored as the numerical counter
 * @param countWordCharacter stored as the word-character counter
 * @param countWhitespace stored as the whitespace counter
 * @param hll the sketch to keep
 */
public StringQuality(long countValid, long countInvalid, long countMissing, long countTotal, long countEmptyString,
                long countAlphabetic, long countNumerical, long countWordCharacter, long countWhitespace,
                HyperLogLogPlus hll) {
    super(countValid, countInvalid, countMissing, countTotal);
    // sketch first, then the per-category counters
    this.hll = hll;
    this.countWhitespace = countWhitespace;
    this.countWordCharacter = countWordCharacter;
    this.countNumerical = countNumerical;
    this.countAlphabetic = countAlphabetic;
    this.countEmptyString = countEmptyString;
}
 
Example #30
Source File: StatsHyperLogReducer.java    From datawave with Apache License 2.0 5 votes vote down vote up
/**
 * Reduces all values of one key into a single field summary and, unless running in counts-only
 * mode, writes it out when its count reaches the configured minimum.
 *
 * <p>Progress is logged every {@code valueInterval} values, or for every value in counts-only
 * mode (where each value's own statistics are logged too). The key's timestamp is set to
 * {@code this.timestamp} before writing. {@code context.progress()} is always called last.
 *
 * @param key the key being reduced
 * @param values the serialized summaries to combine
 * @param context the task context used for output and progress reporting
 * @throws IOException if a value cannot be added or the output cannot be written
 * @throws InterruptedException if writing the output is interrupted
 */
@Override
public void doReduce(BulkIngestKey key, Iterable<Value> values, TaskInputOutputContext<?,?,BulkIngestKey,Value> context) throws IOException,
                InterruptedException {
    log.info("reduce key(" + key.getKey() + ")");
    this.totalKeys++;
    final HyperLogFieldSummary summary = new HyperLogFieldSummary(new HyperLogLogPlus(this.normalPrecision, this.sparsePrecision));
    
    int processed = 0;
    for (Value current : values) {
        summary.add(current);
        processed++;
        final boolean report = 0 == (processed % this.valueInterval) || this.countsOnly;
        if (!report) {
            continue;
        }
        if (this.countsOnly) {
            log.info("add values(" + new StatsHyperLogSummary(current).statsString() + ")");
        }
        log.info("value count(" + processed + ")");
    }
    
    log.info("final stats data(" + summary.toString() + ")");
    final boolean writeOutput = !this.countsOnly;
    if (writeOutput && this.minCount <= summary.getCount()) {
        // write to bulk output, stamping the key with the configured timestamp first
        final StatsCounters counters = summary.toStatsCounters();
        key.getKey().setTimestamp(this.timestamp);
        writeBulkIngestKey(key, counters.getValue(), context);
    } else if (writeOutput) {
        log.debug("count is less than minimum: " + key.getKey().toString() + ") count(" + summary.getCount() + ")");
    }
    
    context.progress();
}