streaminer-master
- src
  - main
    - java
      - org
        streaminer
        stream
        learner
        Regressor.java
        AbstractRegressor.java
        LearnerUtils.java
        Learner.java
        avg
        IAverage.java
        TEWMA.java
        ExponentialMovingAverage.java
        MovingAverage.java
        SimpleEWMA.java
        VariableEWMA.java
        frequency
        SpaceSaving.java
        BaseFrequency.java
        topk
        ITopK.java
        StochasticTopper.java
        StreamSummary.java
        ConcurrentStreamSummary.java
        Frequent.java
        LossyCounting.java
        AMSSketch.java
        ISimpleFrequency.java
        TimeDecayCountMinSketch.java
        Majority.java
        IRichFrequency.java
        CountMinSketch.java
        util
        ISampleSet.java
        SampleSet.java
        CountEntryWithMaxError.java
        ScoredItem.java
        CountEntry.java
        Counter.java
        ITimeDecayFrequency.java
        MisraGries.java
        decay
        DecayFormula.java
        LinDecayFormula.java
        Quantity.java
        ExpDecayFormula.java
        LogDecayFormula.java
        CountSketch.java
        CCFCSketch.java
        IBaseFrequency.java
        StickySampling.java
        SimpleTopKCounting.java
        CountMinSketchAlt.java
        TimeDecayRealCounting.java
        CGT.java
        IFrequencyList.java
        RealCounting.java
        FrequencyException.java
        cardinality
        RegisterSet.java
        LogLog.java
        IBaseCardinality.java
        FlajoletMartin.java
        LinearCounting.java
        CountThenEstimate.java
        CardinalityMergeException.java
        HyperLogLogPlus.java
        KMinValues.java
        BJKST.java
        HyperLogLog.java
        IRichCardinality.java
        AdaptiveCounting.java
        membership
        BloomCalculations.java
        BloomFilter.java
        RetouchedBloomFilter.java
        StableBloomFilter.java
        TimingBloomFilter.java
        DynamicBloomFilter.java
        VarCountingBloomFilter.java
        BloomFilterAlt.java
        HashFunction.java
        IFilter.java
        CuckooFilter.java
        AbstractFilter.java
        CountingBloomFilter.java
        Key.java
        DLeftCountingBloomFilter.java
        ODTDBloomFilter.java
        RemoveScheme.java
        eval
        ConfusionMatrix.java
        TableOfConfusion.java
        change
        AbstractChange.java
        AbsoluteChange.java
        CusumNP.java
        RelativeChange.java
        AbstractCusum.java
        VariationalChange.java
        misc
        Rate.java
        Significance.java
        MovingWindowDelta.java
        Statistics.java
        clustering
        KMeans.java
        birch
        CFEntryPair.java
        CFTree.java
        CFNode.java
        CFEntry.java
        IClusterer.java
        histogram
        spdt
        Bin.java
        MixedInsertException.java
        ArrayBinReservoir.java
        SumOutOfRangeException.java
        MapCategoricalTarget.java
        BinReservoir.java
        CategoricalTarget.java
        Target.java
        Histogram.java
        Gap.java
        TreeBinReservoir.java
        SumResult.java
        GroupTarget.java
        SimpleTarget.java
        BinUpdateException.java
        NumericTarget.java
        ArrayCategoricalTarget.java
        data
        Data.java
        DataImpl.java
        DataUtils.java
        classifier
        AbstractClassifier.java
        RandomClassifier.java
        Perceptron.java
        PAClassifier.java
        BWinnowClassifier.java
        Classifier.java
        bayes
        NaiveBayes.java
        MultiLossyBayes.java
        StickySamplingBayes.java
        NaiveBayesWOP.java
        LossyBayes.java
        TopKBayes.java
        BoundedBayes.java
        MultiBayes.java
        MultiTopkBayes.java
        AbstractMultiClassifier.java
        MajorityClass.java
        MultiRandomClassifier.java
        WinnowClassifier.java
        tree
        HoeffdingTree.java
        TreeNode.java
        ChernoffStatistics.java
        LinearRegression.java
        RegressionTreeNode.java
        RegressionTreeModel.java
        Visitor.java
        RTreeStatistics.java
        HoeffdingTreeModel.java
        RTree.java
        BinaryTreeNode.java
        HoeffdingTreeNode.java
        ModelTree.java
        LeafNode.java
        RegressionTree.java
        Range.java
        InnerNode.java
        NodeInfo.java
        RegressionTreeStatistics.java
        SplitCriterion.java
        QualityCriterion.java
        ChernoffSplitCriterion.java
        BTreeNode.java
        AROWClassifier.java
        MultiClassPAClassifier.java
        SimpleClassifier.java
        sampler
        L0Sampler.java
        gamma
        Z.java
        GammaFunction.java
        R.java
        SpaceSavingSampler.java
        ChainSampler.java
        BernoulliSampler.java
        SystematicSampler.java
        WeightedRandomSampler.java
        FrequentSampler.java
        ISampler.java
        sre
        SSparseRecoveryEstimator.java
        OneSparseRecoveryEstimator.java
        WRSampler.java
        ReservoirSampler.java
        ISampleList.java
        model
        Model.java
        HyperplaneModel.java
        NumericalDistributionModel.java
        StreamDistribution.java
        NominalDistributionModel.java
        PredictionModel.java
        Distribution.java
        SlidingWindow.java
        DescriptionModel.java
        SelectiveDescriptionModel.java
        entropy
        EntropyExact.java
        EntropySketch.java
        IEntropy.java
        norm
        StableSketch.java
        mapper
        ExtractDouble.java
        Identity.java
        AttributeFilter.java
        IMapper.java
        AddMax.java
        SelectFeaturesMapper.java
        Normalization.java
        NumericalBinning.java
        AddMinMax.java
        KeyMapper.java
        wavelet
        HaarWaveletTransform.java
        IWavelet.java
        quantile
        QDigest.java
        EnsembleQuantiles.java
        GKQuantiles.java
        SimpleQuantiles.java
        QuantilesException.java
        SumQuantiles.java
        TDigest.java
        MPQuantiles.java
        Frugal2U.java
        IQuantiles.java
        WindowSketchQuantiles.java
        rss
        SubsetTree.java
        Subset.java
        Interval.java
        Bucket.java
        RSSQuantiles.java
        ExactQuantiles.java
        GroupTree.java
        CKMSQuantiles.java
        ExactQuantilesAll.java
        util
        ByteUtil.java
        AbstractIterator.java
        Preconditions.java
        UnsignedIntComparator.java
        QuotedStringTokenizer.java
        ExternalizableUtil.java
        distribution
        ZipfDistribution.java
        IBuilder.java
        Bits.java
        Lists.java
        Varint.java
        math
        NumberUtil.java
        MathUtil.java
        MathHelper.java
        Prime.java
        PowerOfTwo.java
        SizeOf.java
        ListNode2.java
        Pair.java
        ByteArrayTable.java
        hash
        Hash.java
        Lookup3Hash.java
        MurmurHash3.java
        JenkinsHash.java
        SpookyHash64.java
        SpookyHash.java
        function
        MurmurHashFunction.java
        SimpleHashFunction.java
        HashFunction.java
        JenkinsHashFunction.java
        TwoUniversalHashFunction.java
        SpookyHash32.java
        factory
        SimpleHashFactory.java
        HashFunctionFactory.java
        TwoUniversalHashFactory.java
        MurmurHash.java
        HashUtils.java
        distance
        LinearDistance.java
        Kernel.java
        Radius.java
        CosineDistance.java
        SquaredDistance.java
        ArrayUtils.java
        DoublyLinkedList.java
  - test
    - resources
      - multi-golf.csv.gz
      - log4j.properties
      - generated-stream.out.xml
      - NaiveBayes-experiment.xml
      - generated-stream.xml
      - RandomClassifier-experiment.xml
      - golf.csv
      - MultiRandomClassifier-experiment.xml
      - test-stream.xml
      - golf.csv.gz
      - NaiveBayesWOP-experiment.xml
      - BoundedNaiveBayes-experiment.xml
      - MultiBayes-experiment.xml
    - java
      - org
        streaminer
        stream
        avg
        TEWMATest.java
        frequency
        SpaceSavingTest.java
        topk
        TestConcurrentStreamSummary.java
        TestStreamSummary.java
        TestStochasticTopper.java
        CountMinSketchAltTest.java
        CGTTest.java
        util
        TestSampleSet.java
        SimpleTopKCountingTest.java
        TimeDecayCountMinSketchTest.java
        AMSSketchTest.java
        CCFCSketchTest.java
        StreamGenerator.java
        cardinality
        TestAdaptiveCounting.java
        TestICardinality.java
        TestCountThenEstimate.java
        BJKSTTest.java
        RegisterSetTest.java
        FlajoletMartinTest.java
        KMinValuesTest.java
        TestHyperLogLogPlus.java
        TestLogLog.java
        TestLinearCounting.java
        TestHyperLogLog.java
        membership
        VarCountingBloomFilterTest.java
        StableBloomFilterTest.java
        CountingBloomFilterTest.java
        BloomFilterTest.java
        CuckooFilterTest.java
        BloomFilterAltTest.java
        DLeftCountingBloomFilterTest.java
        ODTDBloomFilterTest.java
        TimingBloomFilterTest.java
        change
        ExactSolution.java
        AbsoluteChangeTest.java
        StreamGenerator.java
        classifier
        bayes
        NaiveBayesTest.java
        sampler
        ChainSamplerTest.java
        model
        NominalDistributionModelTest.java
        entropy
        EntropySketchTest.java
        norm
        StableSketchTest.java
        benchmark
        CountMinSketchBenchmark.java
        quantile
        GroupTreeTest.java
        MPQuantilesTest.java
        QDigestTest.java
        CKMSQuantilesTest.java
        TDigestTest.java
        Frugal2UTest.java
        util
        TestDoublyLinkedList.java
        hash
        MurmurHashTest.java
        Lookup3HashTest.java
        SpookyHash32Test.java
        SpookyHash64Test.java
- pom.xml
- LICENSE
- README.md
- .gitignore

Streaminer

A collection of algorithms for mining data streams, including frequent itemsets, quantiles, sampling, moving averages, set membership and cardinality.

Releases

Maven:

<dependency>
    <groupId>com.github.mayconbordin</groupId>
    <artifactId>streaminer</artifactId>
    <version>1.1.1</version>
</dependency>

API Documentation

Frequent Itemsets

Algorithms

CountSketch [1]
CountMinSketch [2]
LossyCounting [3]
Majority [4]
MisraGries [5]
SpaceSaving [6]
StickySampling [3]
RealCounting
SimpleTopKCounting
TimeDecayCountMinSketch
TimeDecayRealCounting
AMSSketch
CCFCSketch
CGT

Usage

Except for the CountMinSketchAlt class, all algorithms implement the IRichFrequency interface. Here's an example using the SpaceSaving algorithm:

Random r = new Random();
int counters = 20;
double support = 0.01;
double maxError = 0.1;

IRichFrequency<Integer> counter = new SpaceSaving<Integer>(counters, support, maxError);
for (int i=0 i<1000; i++) {
    counter.add(r.nextInt(100), 1);
}

// get the top 10 items
List<CountEntry<Integer>> topk = counter.peek(10);

// print the items
for (CountEntry<Integer> item : topk) {
    System.out.println(item);
}

// get the frequency of a single item
int item = 25;
long freq = counter.estimateCount(item);
System.out.println(item + ": " + freq);

Time Decaying Algorithms

TimeDecayRealCounting and TimeDecayCountMinSketch are algorithms that use a decay function to update the current values of their counts in order to give more importance to newer values, while older values will slowly fade away.

The decay function implements the DecayFormula interface. Currently there are three implementations: the exponential (ExpDecayFormula), the linear (LinDecayFormula), and the logarithmic (LogDecayFormula).

Those counting algorithms implement a different interface called ITimeDecayFrequency, as both methods for adding and estimating the frequency need an additional argument, the timestamp.

Top-K

Algorithms

StreamSummary [6]
ConcurrentStreamSummary
Frequent
StochasticTopper

Usage

The basic usage of a Top-K algorithm is basically the same as the frequent itemset, except that these algorithms do not support the estimateCount method.

ITopK<String> counter = new StreamSummary<String>(3);

String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "C", "A", "A"};
for (String i : stream) {
    counter.add(i);
}

List<CountEntry<String>> topk = counter.peek(3);
for (CountEntry<String> item : topk) {
    System.out.println(item);
}

Quantiles

Algorithms

CKMSQuantiles [7]
Frugal2U [8]
GKQuantiles [9]
MPQuantiles [10]
QDigest [11]
WindowSketchQuantiles [12]
RSSQuantiles [13]
EnsembleQuantiles
ExactQuantiles
ExactQuantilesAll
SimpleQuantiles
SumQuantiles
TDigest

Usage

double[] quantiles = new double[]{0.05, 0.25, 0.5, 0.75, 0.95};
IQuantiles<Integer> instance = new Frugal2U(quantiles, 0);

RandomEngine r = new MersenneTwister64(0);
Normal dist = new Normal(100, 50, r);
int numSamples = 1000;

for(int i = 0; i < numSamples; ++i) {
    int num = (int) Math.max(0, dist.nextDouble());
    instance.offer(num);
}

for (double q : quantiles) {
    System.out.println(q + ": " + instance.getQuantile(q));
}

Cardinality

Algorithms

AdaptiveCounting [14]
LogLog [15]
HyperLogLog [16]
HyperLogLogPlus [17]
LinearCounting [18]
CountThenEstimate
BJKST [26]
FlajoletMartin [27]
KMinCount

Usage

ICardinality card = new LogLog(8);

for (int i=0; i<100; i++) {
    card.offer(Math.random()*100.0);
}

System.out.println("Cardinality: " + card.cardinality());

Average

Algorithms

MovingAverage
ExponentialMovingAverage
SimpleEWMA
VariableEWMA
TEWMA [25]

Usage

// create a EWMA with 15 seconds of age for the metrics in the period
IAverage avg = new VariableEWMA(15.0);

for (int i=0; i<100; i++) {
    avg.add(Math.random()*100.0);
    if (i%10 == 0)
        System.out.println("Average: " + avg.getAverage());
}

Membership

Algorithms

BloomFilter [22]
BloomFilterAlt (alternative implementation)
CountingBloomFilter [19]
VarCountingBloomFilter (with variable bucketsPerWord)
DynamicBloomFilter [20]
RetouchedBloomFilter [21]
StableBloomFilter [23]
TimingBloomFilter [24]
ODTDBloomFilter [28]

Usage

IFilter bloom = new BloomFilter(1000, 32, Hash.MURMUR_HASH);

for (int i = 0; i < 100; i++) {
    String val = UUID.randomUUID().toString();
    Key k = new Key(val.getBytes());
    bloom.add(k);
    System.out.println(val + " exists? " + bloom.membershipTest(k));
}

Sampling

Algorithms

BernoulliSampler
ChainSampler [29]
ReservoirSampler
SystematicSampler
WRSampler (With Replacement)
WeightedRandomSampler
L0Sampler [30]
SpaceSavingSampler
FrequentSampler

Usage

// Create a sampler with 30% probability
ISampler sampler = new BernoulliSampler(0.3);

Random rand = new Random();

// Create a dummy stream of ints
List<Integer> stream = new ArrayList<Integer>(1000);
for (int i=0; i<1000; i++)
    stream.add(rand.nextInt(100));

for (Integer tuple : stream) {
    if (sampler.next()) {
        // tuple was sampled, do something
    } else {
        // tuple was ignored, move on
    }
}

Classifiers

Algorithms

Perceptron
NaiveBayes
NaiveBayesWOP
BoundedBayes
LossyBayes
MultiBayes
MultiLossyBayes
MultiTopKBayes
SticySamplingBayes
TopKBayes
MajorityClass
RandomClassifier
MultiRandomClassifier
AROWClassifier (Adaptive Regularization of Weight Vectors) [32]
BWinnowClassifier (Balanced Winnow Classifier) [33]
PAClassifier, MultiClassPAClassifier [34]
WinnowClassifier

Usage

NaiveBayes nb = new NaiveBayes();
nb.setLabelAttribute("play");

ICsvListReader listReader = new CsvListReader(
        new FileReader("src/test/resources/golf.csv"), 
        CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE);

listReader.getHeader(true);

List<String> list;
while( (list = listReader.read()) != null ) {
    Data data = new DataImpl();
    data.put("outlook", list.get(0));
    data.put("temperature", Integer.parseInt(list.get(1)));
    data.put("humidity", Integer.parseInt(list.get(2)));
    data.put("wind", Boolean.parseBoolean(list.get(3)));
    data.put("play", list.get(4));

    nb.learn(data);
}

Data test = new DataImpl();
test.put("outlook", "sunny");
test.put("temperature", "cool");
test.put("humidity", "high");
test.put("windy", "TRUE");

String prediction = nb.predict(test);
System.out.println("Item is: " + test);
System.out.println("Prediction is: " + prediction);

Clustering

Algorithms

K-Means
BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies) [31]

References

[1] Charikar, Moses, Kevin Chen, and Martin Farach-Colton. "Finding frequent items in data streams." Automata, Languages and Programming. Springer Berlin Heidelberg, 2002. 693-703.

[2] Cormode, Graham, and S. Muthukrishnan. "An improved data stream summary: the count-min sketch and its applications." Journal of Algorithms 55.1 (2005): 58-75.

[3] Manku, Gurmeet Singh, and Rajeev Motwani. "Approximate frequency counts over data streams." Proceedings of the 28th international conference on Very Large Data Bases. VLDB Endowment, 2002.

[4] M. J. Fischer and S. L. Salzberg. "Finding a Majority Among N Votes: Solution to Problem 81-5(Journal of Algorithms, June 1981)", Journal of Algorithms, 3:4, December 1982, pp. 362-380.

[5] Misra, Jayadev, and David Gries. "Finding repeated elements." Science of computer programming 2.2 (1982): 143-152.

[6] Metwally, Ahmed, Divyakant Agrawal, and Amr El Abbadi. "Efficient computation of frequent and top-k elements in data streams." Database Theory-ICDT 2005. Springer Berlin Heidelberg, 2005. 398-412.

[7] Cormode, Graham, et al. "Effective computation of biased quantiles over data streams." Data Engineering, 2005. ICDE 2005. Proceedings. 21st International Conference on. IEEE, 2005.

[8] Ma, Qiang, S. Muthukrishnan, and Mark Sandler. "Frugal Streaming for Estimating Quantiles." Space-Efficient Data Structures, Streams, and Algorithms. Springer Berlin Heidelberg, 2013. 77-96.

[9] Greenwald, Michael, and Sanjeev Khanna. "Space-efficient online computation of quantile summaries." ACM SIGMOD Record. Vol. 30. No. 2. ACM, 2001.

[10] Munro, J. Ian, and Mike S. Paterson. "Selection and sorting with limited storage." Theoretical computer science 12.3 (1980): 315-323.

[11] Shrivastava, Nisheeth, et al. "Medians and beyond: new aggregation techniques for sensor networks." Proceedings of the 2nd international conference on Embedded networked sensor systems. ACM, 2004.

[12] Arasu, Arvind, and Gurmeet Singh Manku. "Approximate counts and quantiles over sliding windows." Proceedings of the twenty-third ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems. ACM, 2004.

[13] Gilbert, Anna C., et al. "How to summarize the universe: Dynamic maintenance of quantiles." Proceedings of the 28th international conference on Very Large Data Bases. VLDB Endowment, 2002.

[14] Cai, Min, et al. "Fast and accurate traffic matrix measurement using adaptive cardinality counting." Proceedings of the 2005 ACM SIGCOMM workshop on Mining network data. ACM, 2005.

[15] Durand, Marianne, and Philippe Flajolet. "Loglog counting of large cardinalities." Algorithms-ESA 2003. Springer Berlin Heidelberg, 2003. 605-617.

[16] Flajolet, Philippe, et al. "HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm." DMTCS Proceedings 1 (2008).

[17] Heule, Stefan, Marc Nunkesser, and Alexander Hall. "HyperLogLog in practice: algorithmic engineering of a state of the art cardinality estimation algorithm." Proceedings of the 16th International Conference on Extending Database Technology. ACM, 2013.

[18] Whang, Kyu-Young, Brad T. Vander-Zanden, and Howard M. Taylor. "A linear-time probabilistic counting algorithm for database applications." ACM Transactions on Database Systems (TODS) 15.2 (1990): 208-229.

[19] Fan, L., Cao, P., Almeida, J., & Broder, A. Z. (2000). Summary cache: a scalable wide-area web cache sharing protocol. IEEE/ACM Transactions on Networking (TON), 8(3), 281-293.

[20] Guo, Deke, Jie Wu, Honghui Chen, and Xueshan Luo. "Theory and Network Applications of Dynamic Bloom Filters." In INFOCOM, pp. 1-12. 2006.

[21] Donnet, Benoit, Bruno Baynat, and Timur Friedman. "Retouched bloom filters: allowing networked applications to trade off selected false positives against false negatives." In Proceedings of the 2006 ACM CoNEXT conference, p. 13. ACM, 2006.

[22] Bloom, Burton H. "Space/time trade-offs in hash coding with allowable errors." Communications of the ACM 13, no. 7 (1970): 422-426.

[23] Deng, Fan, and Davood Rafiei. "Approximately detecting duplicates for streaming data using stable bloom filters." Proceedings of the 2006 ACM SIGMOD international conference on Management of data. ACM, 2006.

[24] Dautrich Jr, Jonathan L., and Chinya V. Ravishankar. "Inferential time-decaying Bloom filters." Proceedings of the 16th International Conference on Extending Database Technology. ACM, 2013.

[25] Martin, Ruediger, and Michael Menth. "Improving the Timeliness of Rate Measurements." In MMB, pp. 145-154. 2004.

[26] Bar-Yossef, Ziv, et al. "Counting distinct elements in a data stream." Randomization and Approximation Techniques in Computer Science. Springer Berlin Heidelberg, 2002. 1-10.

[27] Flajolet, Philippe, and G. Nigel Martin. "Probabilistic counting algorithms for data base applications." Journal of computer and system sciences 31.2 (1985): 182-209.

[28] Bianchi, Giuseppe, Nico d'Heureuse, and Saverio Niccolini. "On-demand time-decaying bloom filters for telemarketer detection." ACM SIGCOMM Computer Communication Review 41.5 (2011): 5-12.

[29] Babcock, Brian, Mayur Datar, and Rajeev Motwani. "Sampling from a moving window over streaming data." Proceedings of the thirteenth annual ACM-SIAM symposium on Discrete algorithms. Society for Industrial and Applied Mathematics, 2002.

[30] Cormode, Graham, Donatella Firmani, Graham Cormode, and Donatella Firmani. "On Unifying the Space of ℓ0-Sampling Algorithms." In ALENEX, pp. 163-172. 2013.

[31] Zhang, Tian, Raghu Ramakrishnan, and Miron Livny. "BIRCH: an efficient data clustering method for very large databases." ACM SIGMOD Record. Vol. 25. No. 2. ACM, 1996.

[32] Crammer, Koby, Alex Kulesza, and Mark Dredze. "Adaptive regularization of weight vectors." Advances in Neural Information Processing Systems. 2009.

[33] Carvalho, Vitor R., and William W. Cohen. "Single-pass online learning: Performance, voting schemes and online feature selection." Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining. ACM, 2006.

[34] Crammer, Koby, Ofer Dekel, Joseph Keshet, Shai Shalev-Shwartz, and Yoram Singer. "Online passive-aggressive algorithms." The Journal of Machine Learning Research 7 (2006): 551-585.

Similar Libraries

Java
Scala
- algebird
- fleet
C/C++
JavaScript
- node-streamcount
- node-datastream