Java Code Examples for org.apache.hadoop.util.bloom.DynamicBloomFilter

The following examples show how to use org.apache.hadoop.util.bloom.DynamicBloomFilter. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: hadoop   Source File: BloomMapFile.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Loads the bloom filter stored under {@code dirName} into {@code bloomFilter}.
 *
 * <p>If the filter file cannot be opened or read, a warning is logged and
 * {@code bloomFilter} is reset to {@code null}.
 *
 * @param dirName directory containing the {@code BLOOM_FILE_NAME} file
 * @param conf configuration used to resolve the file system of {@code dirName}
 */
private void initBloomFilter(Path dirName,
                             Configuration conf) {
  DataInputStream bloomIn = null;
  try {
    final FileSystem fileSystem = dirName.getFileSystem(conf);
    final Path bloomPath = new Path(dirName, BLOOM_FILE_NAME);
    bloomIn = fileSystem.open(bloomPath);

    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(bloomIn);

    // Close explicitly so a failing close() is reported through the catch
    // below; clearing the reference marks the stream as already handled.
    bloomIn.close();
    bloomIn = null;
  } catch (IOException ioe) {
    final String message =
        "Can't open BloomFilter: " + ioe + " - fallback to MapFile.";
    LOG.warn(message);
    bloomFilter = null;
  } finally {
    // Only has work to do when the try block exited before clearing bloomIn.
    IOUtils.closeStream(bloomIn);
  }
}
 
Example 2
Source Project: big-c   Source File: BloomMapFile.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads the serialized bloom filter found in {@code dirName} and stores it in
 * {@code bloomFilter}.
 *
 * <p>Any {@link IOException} is logged as a warning and leaves
 * {@code bloomFilter} set to {@code null}.
 *
 * @param dirName directory holding the {@code BLOOM_FILE_NAME} file
 * @param conf configuration used to obtain the file system for {@code dirName}
 */
private void initBloomFilter(Path dirName,
                             Configuration conf) {
  DataInputStream stream = null;
  try {
    final FileSystem fs = dirName.getFileSystem(conf);
    final Path filterFile = new Path(dirName, BLOOM_FILE_NAME);
    stream = fs.open(filterFile);

    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(stream);

    // Explicit close keeps close() failures visible to the catch clause;
    // nulling the reference records that the stream was closed here.
    stream.close();
    stream = null;
  } catch (IOException ioe) {
    final String warning =
        "Can't open BloomFilter: " + ioe + " - fallback to MapFile.";
    LOG.warn(warning);
    bloomFilter = null;
  } finally {
    // Handles the stream only if the try block did not finish closing it.
    IOUtils.closeStream(stream);
  }
}
 
Example 3
Source Project: incubator-hivemall   Source File: BloomContainsUDF.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Returns the filter for {@code bloomStr}, reusing the previously deserialized
 * filter when the given text equals the previously seen one.
 *
 * @param bloomStr serialized bloom filter
 * @return the cached filter, or a newly deserialized one (which then becomes the cache)
 * @throws HiveException wrapping any {@link IOException} raised during deserialization
 */
@Nonnull
private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException {
    // Cache hit: same serialized text as the last call.
    if (prevBf != null && prevBfStr.equals(bloomStr)) {
        return prevBf;
    }

    final Filter fresh;
    try {
        fresh = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter());
    } catch (IOException e) {
        throw new HiveException(e);
    }

    // Remember a copy of the text so later calls can be compared against it.
    this.prevBfStr = new Text(bloomStr);
    this.prevBf = fresh;
    return fresh;
}
 
Example 4
Source Project: incubator-hivemall   Source File: BloomContainsAnyUDF.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Resolves the filter for the given serialized text, returning the cached
 * instance when {@code bloomStr} equals the text seen on the previous call.
 *
 * @param bloomStr serialized bloom filter
 * @return the cached filter or a freshly deserialized one, which replaces the cache
 * @throws HiveException if deserialization fails with an {@link IOException}
 */
@Nonnull
private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException {
    final boolean cached = prevBf != null && prevBfStr.equals(bloomStr);
    if (cached) {
        return prevBf;
    }

    final Filter decoded;
    try {
        decoded = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter());
    } catch (IOException e) {
        throw new HiveException(e);
    }

    // Update the one-entry cache only after deserialization succeeded.
    this.prevBfStr = new Text(bloomStr);
    this.prevBf = decoded;
    return decoded;
}
 
Example 5
Source Project: incubator-hivemall   Source File: BloomContainsUDFTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that {@code BloomContainsUDF} reports {@code TRUE} for each key
 * regenerated from the seed that was used to build the filter.
 */
@Test
public void testUDF() throws IOException, HiveException {
    final long seed = 43L;
    final int size = 100;
    final BloomContainsUDF udf = new BloomContainsUDF();

    final DynamicBloomFilter filter = createBloomFilter(seed, size);
    final Text serialized = BloomFilterUtils.serialize(filter, new Text());

    // Replay the pseudo-random sequence for this seed; createBloomFilter is
    // expected to have inserted these same hex strings.
    final Random rng = new Random(seed);
    final Text key = new Text();
    for (int remaining = size; remaining > 0; remaining--) {
        key.set(Double.toHexString(rng.nextGaussian()));
        Assert.assertEquals("Look up failed for key: " + key, Boolean.TRUE,
            udf.evaluate(serialized, key));
    }
}
 
Example 6
Source Project: incubator-hivemall   Source File: BloomContainsUDFTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a dynamic bloom filter containing {@code size} pseudo-random keys.
 *
 * <p>Each key is the bytes of a {@link Text} holding the hex string of a
 * Gaussian sample drawn from {@code new Random(seed)}.
 *
 * @param seed seed for the key generator
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    final Random rng = new Random(seed);
    final Key reusableKey = new Key();
    final DynamicBloomFilter filter = BloomFilterUtils.newDynamicBloomFilter(30);

    for (int remaining = size; remaining > 0; remaining--) {
        final String hex = Double.toHexString(rng.nextGaussian());
        reusableKey.set(new Text(hex).copyBytes(), 1.0);
        filter.add(reusableKey);
    }
    return filter;
}
 
Example 7
Source Project: incubator-hivemall   Source File: BloomNotUDFTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a dynamic bloom filter containing {@code size} pseudo-random keys.
 *
 * <p>Each key is the hex string of a Gaussian sample drawn from
 * {@code new Random(seed)}, encoded with the platform default charset.
 *
 * @param seed seed for the key generator
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    final Random rng = new Random(seed);
    final Key reusableKey = new Key();
    final DynamicBloomFilter filter = BloomFilterUtils.newDynamicBloomFilter(3000);

    for (int remaining = size; remaining > 0; remaining--) {
        final String hex = Double.toHexString(rng.nextGaussian());
        reusableKey.set(hex.getBytes(), 1.0);
        filter.add(reusableKey);
    }
    return filter;
}
 
Example 8
Source Project: incubator-hivemall   Source File: BloomOrUDFTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that {@code BloomOrUDF} applied to two serialized filters yields the
 * same serialized form as calling {@code or} on the filters directly.
 */
@Test
public void test() throws IOException, HiveException {
    final int size = 10000;
    final BloomOrUDF udf = new BloomOrUDF();

    final DynamicBloomFilter left = createBloomFilter(1L, size);
    final DynamicBloomFilter right = createBloomFilter(2L, size);

    // Serialize both inputs before 'left' is mutated by or() below.
    final Text leftStr = BloomFilterUtils.serialize(left, new Text());
    final Text rightStr = BloomFilterUtils.serialize(right, new Text());

    left.or(right);
    final Text expected = BloomFilterUtils.serialize(left, new Text());

    final Text actual = udf.evaluate(leftStr, rightStr);
    Assert.assertEquals(expected, actual);

    final DynamicBloomFilter deserialized =
            BloomFilterUtils.deserialize(actual, new DynamicBloomFilter());
    // Compare against the merged filter for the key sets of both seeds.
    for (long seed = 1L; seed <= 2L; seed++) {
        assertEquals(left, deserialized, seed, size);
    }
}
 
Example 9
Source Project: incubator-hivemall   Source File: BloomOrUDFTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a dynamic bloom filter and fills it with {@code size} generated keys.
 *
 * <p>Keys are the default-charset bytes of the hex representation of
 * successive Gaussian samples from {@code new Random(seed)}.
 *
 * @param seed seed for the key generator
 * @param size number of keys to insert
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    final DynamicBloomFilter filter = BloomFilterUtils.newDynamicBloomFilter(3000);
    final Random generator = new Random(seed);
    final Key scratch = new Key();

    int inserted = 0;
    while (inserted < size) {
        final String hex = Double.toHexString(generator.nextGaussian());
        scratch.set(hex.getBytes(), 1.0);
        filter.add(scratch);
        inserted++;
    }
    return filter;
}
 
Example 10
Source Project: incubator-hivemall   Source File: BloomAndUDFTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that {@code BloomAndUDF} applied to two serialized filters yields the
 * same serialized form as calling {@code and} on the filters directly.
 */
@Test
public void test() throws IOException, HiveException {
    final int size = 10000;
    final BloomAndUDF udf = new BloomAndUDF();

    final DynamicBloomFilter left = createBloomFilter(1L, size);
    final DynamicBloomFilter right = createBloomFilter(2L, size);

    // Serialize both inputs before 'left' is mutated by and() below.
    final Text leftStr = BloomFilterUtils.serialize(left, new Text());
    final Text rightStr = BloomFilterUtils.serialize(right, new Text());

    left.and(right);
    final Text expected = BloomFilterUtils.serialize(left, new Text());

    final Text actual = udf.evaluate(leftStr, rightStr);
    Assert.assertEquals(expected, actual);

    final DynamicBloomFilter deserialized =
            BloomFilterUtils.deserialize(actual, new DynamicBloomFilter());
    // Run the helper check for the key sets of both seeds.
    for (long seed = 1L; seed <= 2L; seed++) {
        assertNotContains(left, deserialized, seed, size);
    }
}
 
Example 11
Source Project: incubator-hivemall   Source File: BloomAndUDFTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Produces a dynamic bloom filter populated with {@code size} generated keys.
 *
 * <p>Every key is the hex string of a Gaussian sample from
 * {@code new Random(seed)}, converted to bytes with the platform default charset.
 *
 * @param seed seed for the key generator
 * @param size number of keys to insert
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    final DynamicBloomFilter bloom = BloomFilterUtils.newDynamicBloomFilter(3000);
    final Key sharedKey = new Key();
    final Random random = new Random(seed);

    for (int n = 0; n != size && n < size; n++) {
        final byte[] keyBytes = Double.toHexString(random.nextGaussian()).getBytes();
        sharedKey.set(keyBytes, 1.0);
        bloom.add(sharedKey);
    }
    return bloom;
}
 
Example 12
Source Project: hadoop   Source File: BloomMapFile.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@link DynamicBloomFilter} sized from the configuration and
 * stores it in {@code bloomFilter}.
 *
 * <p>Reads {@code io.mapfile.bloom.size} (default {@code 1024 * 1024}) into
 * {@code numKeys} and {@code io.mapfile.bloom.error.rate} (default
 * {@code 0.005}) as the desired maximum error rate, then derives
 * {@code vectorSize} from them.
 *
 * @param conf configuration supplying the key count, error rate and hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: HASH_COUNT * numKeys as an int
  // product silently wraps around for large configured key counts.
  vectorSize = (int)Math.ceil((-(double)HASH_COUNT * numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example 13
Source Project: big-c   Source File: BloomMapFile.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@link DynamicBloomFilter} sized from the configuration and
 * stores it in {@code bloomFilter}.
 *
 * <p>Reads {@code io.mapfile.bloom.size} (default {@code 1024 * 1024}) into
 * {@code numKeys} and {@code io.mapfile.bloom.error.rate} (default
 * {@code 0.005}) as the desired maximum error rate, then derives
 * {@code vectorSize} from them.
 *
 * @param conf configuration supplying the key count, error rate and hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: HASH_COUNT * numKeys as an int
  // product silently wraps around for large configured key counts.
  vectorSize = (int)Math.ceil((-(double)HASH_COUNT * numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example 14
Source Project: incubator-hivemall   Source File: BloomFilterUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dynamic bloom filter, deriving the number of hash functions from
 * the error rate and delegating to the three-argument overload.
 *
 * @param expectedNumberOfElements expected number of elements to be stored
 * @param errorRate desired false-positive probability
 * @return the filter built by the three-argument overload
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
        @Nonnegative final int expectedNumberOfElements, @Nonnegative final float errorRate) {
    // k = ceil(-log_2(false prob.)), but never fewer than 2 hash functions
    final double negLog2OfError = -(Math.log(errorRate) / LOG2);
    final int nbHash = Math.max(2, (int) Math.ceil(negLog2OfError));
    return newDynamicBloomFilter(expectedNumberOfElements, errorRate, nbHash);
}
 
Example 15
Source Project: incubator-hivemall   Source File: BloomFilterUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@link DynamicBloomFilter} using the murmur hash, with a vector
 * size computed as {@code ceil(-k * n / ln(1 - c^(1/k)))}, where {@code k} is
 * {@code nbHash}, {@code n} is {@code expectedNumberOfElements} and {@code c}
 * is {@code errorRate}.
 *
 * @param expectedNumberOfElements expected number of elements to be stored
 * @param errorRate desired false-positive probability
 * @param nbHash number of hash functions
 * @return a new, empty dynamic bloom filter
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
        @Nonnegative final int expectedNumberOfElements, @Nonnegative final float errorRate,
        @Nonnegative final int nbHash) {
    // Widen to double BEFORE multiplying: nbHash * expectedNumberOfElements as
    // an int product silently wraps around for large element counts.
    final double numerator = -(double) nbHash * expectedNumberOfElements;
    int vectorSize = (int) Math.ceil(
        numerator / Math.log(1.d - Math.pow(errorRate, 1.d / nbHash)));
    return new DynamicBloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH,
        expectedNumberOfElements);
}
 
Example 16
Source Project: incubator-hivemall   Source File: BloomFilterUDAF.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Merges a serialized partial filter into this aggregation state.
 *
 * <p>The first partial becomes the held filter; later partials are combined
 * into it with {@code or}.
 *
 * @param partial serialized bloom filter to merge
 * @return always {@code true}
 * @throws HiveException wrapping any {@link IOException} raised during deserialization
 */
public boolean merge(@Nonnull Text partial) throws HiveException {
    final DynamicBloomFilter incoming;
    try {
        incoming = BloomFilterUtils.deserialize(partial, new DynamicBloomFilter());
    } catch (IOException e) {
        throw new HiveException(e);
    }

    if (filter != null) {
        filter.or(incoming);
    } else {
        this.filter = incoming;
    }
    return true;
}
 
Example 17
Source Project: incubator-hivemall   Source File: BloomNotUDFTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code BloomNotUDF} applied to a serialized filter produces a
 * filter whose string form matches the original after calling {@code not()}.
 */
@Test
public void test() throws IOException, HiveException {
    final BloomNotUDF udf = new BloomNotUDF();
    final DynamicBloomFilter source = createBloomFilter(1L, 10000);

    // Serialize before 'source' is mutated by not() below.
    final Text result = udf.evaluate(BloomFilterUtils.serialize(source, new Text()));
    final DynamicBloomFilter negated =
            BloomFilterUtils.deserialize(result, new DynamicBloomFilter());

    source.not();

    final String expected = source.toString();
    final String observed = negated.toString();
    Assert.assertEquals(expected, observed);
}
 
Example 18
Source Project: compiler   Source File: DistinctAggregator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Also replaces {@code filter} with a new murmur-hash
 * {@link DynamicBloomFilter} for the key being started.
 */
@Override
public void start(final EmitKey key) {
	super.start(key);

	// Read in the same order as the constructor arguments are evaluated.
	final int bits = this.vectorSize;
	final int argAsInt = (int) this.getArg();
	this.filter = new DynamicBloomFilter(bits, HASH_COUNT, Hash.MURMUR_HASH, argAsInt);
}
 
Example 19
Source Project: RDFS   Source File: BloomMapFile.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@link DynamicBloomFilter} sized from the configuration and
 * stores it in {@code bloomFilter}.
 *
 * <p>Reads {@code io.mapfile.bloom.size} (default {@code 1024 * 1024}) into
 * {@code numKeys} and {@code io.mapfile.bloom.error.rate} (default
 * {@code 0.005}) as the desired maximum error rate, then derives
 * {@code vectorSize} from them.
 *
 * @param conf configuration supplying the key count, error rate and hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: HASH_COUNT * numKeys as an int
  // product silently wraps around for large configured key counts.
  vectorSize = (int)Math.ceil((-(double)HASH_COUNT * numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example 20
Source Project: RDFS   Source File: BloomMapFile.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads the bloom filter stored under {@code dirName} into {@code bloomFilter}.
 *
 * <p>If the filter file cannot be opened or read, a warning is logged and
 * {@code bloomFilter} is reset to {@code null}. The input stream is closed on
 * every path, including when reading fails part-way.
 *
 * @param fs file system used to open the filter file
 * @param dirName directory containing the {@code BLOOM_FILE_NAME} file
 * @param conf configuration (not used by this method)
 */
private void initBloomFilter(FileSystem fs, String dirName,
    Configuration conf) {
  DataInputStream in = null;
  try {
    in = fs.open(new Path(dirName, BLOOM_FILE_NAME));
    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(in);
    // Close explicitly so a failing close() is still reported via the catch
    // below; clearing the reference tells the finally block it is done.
    in.close();
    in = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Reached with a live stream only when the try block failed before the
    // stream was closed; previously that stream was leaked.
    if (in != null) {
      try {
        in.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: the original failure was already logged.
      }
    }
  }
}
 
Example 21
Source Project: hadoop-gpu   Source File: BloomMapFile.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@link DynamicBloomFilter} sized from the configuration and
 * stores it in {@code bloomFilter}.
 *
 * <p>Reads {@code io.mapfile.bloom.size} (default {@code 1024 * 1024}) into
 * {@code numKeys} and {@code io.mapfile.bloom.error.rate} (default
 * {@code 0.005}) as the desired maximum error rate, then derives
 * {@code vectorSize} from them.
 *
 * @param conf configuration supplying the key count, error rate and hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: HASH_COUNT * numKeys as an int
  // product silently wraps around for large configured key counts.
  vectorSize = (int)Math.ceil((-(double)HASH_COUNT * numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example 22
Source Project: hadoop-gpu   Source File: BloomMapFile.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the serialized bloom filter found in {@code dirName} and stores it in
 * {@code bloomFilter}.
 *
 * <p>Any {@link IOException} is logged as a warning and leaves
 * {@code bloomFilter} set to {@code null}. The input stream is closed on every
 * path, including when reading fails part-way.
 *
 * @param fs file system used to open the filter file
 * @param dirName directory holding the {@code BLOOM_FILE_NAME} file
 * @param conf configuration (not used by this method)
 */
private void initBloomFilter(FileSystem fs, String dirName,
    Configuration conf) {
  DataInputStream in = null;
  try {
    in = fs.open(new Path(dirName, BLOOM_FILE_NAME));
    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(in);
    // Close explicitly so a failing close() is still reported via the catch
    // below; clearing the reference tells the finally block it is done.
    in.close();
    in = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Reached with a live stream only when the try block failed before the
    // stream was closed; previously that stream was leaked.
    if (in != null) {
      try {
        in.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: the original failure was already logged.
      }
    }
  }
}
 
Example 23
Source Project: incubator-hivemall   Source File: BloomFilterUtils.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a dynamic bloom filter from {@code DEFAULT_BLOOM_FILTER_SIZE},
 * {@code DEFAULT_ERROR_RATE} and {@code NUM_HASHES}.
 *
 * @return the filter built by the three-argument overload
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter() {
    final DynamicBloomFilter defaultFilter = newDynamicBloomFilter(
        DEFAULT_BLOOM_FILTER_SIZE, DEFAULT_ERROR_RATE, NUM_HASHES);
    return defaultFilter;
}
 
Example 24
Source Project: incubator-hivemall   Source File: BloomFilterUtils.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a dynamic bloom filter for the given expected element count using
 * {@code DEFAULT_ERROR_RATE}.
 *
 * @param expectedNumberOfElements expected number of elements to be stored
 * @return the filter built by the two-argument overload
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
        @Nonnegative final int expectedNumberOfElements) {
    final DynamicBloomFilter sized =
            newDynamicBloomFilter(expectedNumberOfElements, DEFAULT_ERROR_RATE);
    return sized;
}