/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.prasanthj.hll;

import static org.junit.Assert.assertEquals;

import com.github.prasanthj.hll.HyperLogLog.EncodingType;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

import org.junit.After;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

/**
 * Round-trips {@link HyperLogLog} instances through
 * {@link HyperLogLogUtils#serializeHLL} / {@link HyperLogLogUtils#deserializeHLL}
 * using a scratch file, for every combination of encoding (sparse/dense) and
 * bit-packing (default/disabled) exercised below.
 *
 * <p>The class is parameterized on the number of values added to the sketch.
 * Each test asserts that the deserialized instance is equal to the original
 * (equals, hashCode, toString, toStringExtended, count). The "HalfDistinct"
 * variants additionally compare the estimate with the exact distinct count,
 * within a percentage tolerance.
 */
@RunWith(value = Parameterized.class)
public class TestHLLSerialization {

  /** Directory in which the scratch files are created. */
  private static final String pathPrefix = ".";

  /** Fixed seed so that every run feeds the same values to the sketch. */
  private static final int SEED = 100;

  /** Input sizes above this use the long range tolerance. */
  private static final int LONG_RANGE_CUTOFF = 40000;

  // 5% tolerance for long range bias and 2.5% for short range bias
  private static final float LONG_RANGE_TOLERANCE = 5.0f;
  private static final float SHORT_RANGE_TOLERANCE = 2.5f;

  /** Number of values added to the sketch in each test (the test parameter). */
  private final int size;

  /**
   * Scratch file of the currently running test. Created lazily by
   * {@link #scratchFile()} and removed by {@link #close()}.
   */
  private File testFile;

  @Rule
  public TestName testCaseName = new TestName();

  /**
   * @param n number of values each test adds to the sketch
   */
  public TestHLLSerialization(int n) {
    // The scratch file is intentionally NOT created here: the TestName rule has
    // not been applied yet while the constructor runs, so getMethodName() would
    // still return null and every test would end up sharing one file name.
    this.size = n;
  }

  /**
   * @return the input sizes the tests are run with, one per parameter set
   */
  @Parameters
  public static Collection<Object[]> data() {
    Object[][] data = new Object[][] {
        { 2 }, { 10 }, { 100 }, { 1000 }, { 2000 }, { 3000 }, { 5000 }, { 6000 },
        { 10000 }, { 100000 }, { 1000000 } };
    return Arrays.asList(data);
  }

  /** Removes the scratch file of the test that just ran, if one was created. */
  @After
  public void close() {
    if (testFile != null && testFile.exists()) {
      // Best-effort cleanup; a failed delete must not fail the test itself.
      testFile.delete();
    }
  }

  @Test
  public void testHLLSparseSerialization() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
    checkRoundTrip(hll);
  }

  @Test
  public void testHLLSparseSerializationHalfDistinct() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build();
    checkRoundTripHalfDistinct(hll);
  }

  @Test
  public void testHLLSparseNoBitPacking() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE)
        .enableBitPacking(false).build();
    checkRoundTrip(hll);
  }

  @Test
  public void testHLLSparseNoBitPackingHalfDistinct() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE)
        .enableBitPacking(false).build();
    checkRoundTripHalfDistinct(hll);
  }

  @Test
  public void testHLLDenseSerialization() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
    checkRoundTrip(hll);
  }

  @Test
  public void testHLLDenseSerializationHalfDistinct() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build();
    checkRoundTripHalfDistinct(hll);
  }

  @Test
  public void testHLLDenseNoBitPacking() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false)
        .build();
    checkRoundTrip(hll);
  }

  @Test
  public void testHLLDenseNoBitPackingHalfDistinct() throws IOException {
    HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false)
        .build();
    checkRoundTripHalfDistinct(hll);
  }

  /**
   * Adds {@code size} seeded random longs to {@code hll}, round-trips it through
   * the scratch file and asserts that the copy matches the original.
   */
  private void checkRoundTrip(HyperLogLog hll) throws IOException {
    Random rand = new Random(SEED);
    for (int i = 0; i < size; i++) {
      hll.addLong(rand.nextLong());
    }

    HyperLogLog deserializedHLL = roundTrip(hll);
    assertSameSketch(hll, deserializedHLL);
  }

  /**
   * Adds {@code size} seeded random values drawn from {@code [0, size / 2)} to
   * {@code hll} while tracking the exact distinct count, round-trips the sketch
   * and asserts that (a) the copy matches the original and (b) both estimates
   * are within the configured tolerance of the exact count.
   */
  private void checkRoundTripHalfDistinct(HyperLogLog hll) throws IOException {
    Random rand = new Random(SEED);
    Set<Integer> hashset = new HashSet<Integer>();
    for (int i = 0; i < size; i++) {
      int val = rand.nextInt(size / 2);
      hll.addLong(val);
      hashset.add(val);
    }

    HyperLogLog deserializedHLL = roundTrip(hll);

    // Tolerance is a percentage of the exact distinct count.
    double threshold = size > LONG_RANGE_CUTOFF ? LONG_RANGE_TOLERANCE : SHORT_RANGE_TOLERANCE;
    double delta = threshold * hashset.size() / 100;

    assertSameSketch(hll, deserializedHLL);
    assertEquals(hashset.size(), hll.count(), delta);
    assertEquals(hashset.size(), deserializedHLL.count(), delta);
  }

  /**
   * Serializes {@code hll} to the scratch file and reads it back.
   *
   * <p>Both streams are closed before returning so that no file handle is
   * leaked and {@link #close()} can delete the file afterwards.
   *
   * @return the instance produced by deserializing the scratch file
   * @throws IOException if writing or reading the scratch file fails
   */
  private HyperLogLog roundTrip(HyperLogLog hll) throws IOException {
    File file = scratchFile();

    DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
    try {
      HyperLogLogUtils.serializeHLL(out, hll);
    } finally {
      out.close();
    }

    DataInputStream in = new DataInputStream(new FileInputStream(file));
    try {
      return HyperLogLogUtils.deserializeHLL(in);
    } finally {
      in.close();
    }
  }

  /** Asserts that {@code actual} is indistinguishable from {@code expected}. */
  private static void assertSameSketch(HyperLogLog expected, HyperLogLog actual) {
    assertEquals(expected, actual);
    assertEquals(expected.toString(), actual.toString());
    assertEquals(expected.toStringExtended(), actual.toStringExtended());
    assertEquals(expected.hashCode(), actual.hashCode());
    assertEquals(expected.count(), actual.count());
  }

  /**
   * Returns the scratch file for the running test, creating the {@link File}
   * handle on first use. The name is derived from the test method name and the
   * size parameter; characters outside {@code [A-Za-z0-9_.-]} in the method
   * name are replaced with {@code '_'} to keep the file name portable.
   */
  private File scratchFile() {
    if (testFile == null) {
      String method = testCaseName.getMethodName().replaceAll("[^A-Za-z0-9_.-]", "_");
      // Two-argument constructor so that pathPrefix is treated as a directory
      // rather than being glued onto the front of the file name.
      testFile = new File(pathPrefix, method + "_" + size + ".hll");
    }
    return testFile;
  }
}