org.apache.hadoop.util.bloom.DynamicBloomFilter Java Examples

The following examples show how to use org.apache.hadoop.util.bloom.DynamicBloomFilter. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BloomMapFile.java    From hadoop with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the Bloom filter file located under {@code dirName} into
 * {@code bloomFilter}. When the file cannot be opened or read, a warning is
 * logged and {@code bloomFilter} is set to {@code null}.
 *
 * @param dirName directory that holds the {@code BLOOM_FILE_NAME} file
 * @param conf configuration used to obtain the file system for {@code dirName}
 */
private void initBloomFilter(Path dirName,
                             Configuration conf) {
  DataInputStream stream = null;
  try {
    final FileSystem fileSystem = dirName.getFileSystem(conf);
    final Path bloomPath = new Path(dirName, BLOOM_FILE_NAME);
    stream = fileSystem.open(bloomPath);

    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(stream);

    // Close explicitly so a failing close() is reported by the catch below;
    // clearing the reference keeps the finally block from closing it twice.
    stream.close();
    stream = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Only does real work when an exception left the stream open.
    IOUtils.closeStream(stream);
  }
}
 
Example #2
Source File: BloomAndUDFTest.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a dynamic Bloom filter (created via
 * {@code BloomFilterUtils.newDynamicBloomFilter(3000)}) and adds {@code size}
 * keys to it. Each key is the hex-string form of a Gaussian sample drawn from
 * a {@link Random} seeded with {@code seed}, so the same arguments always
 * produce the same sequence of keys.
 *
 * @param seed seed of the pseudo-random key sequence
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    DynamicBloomFilter dbf = BloomFilterUtils.newDynamicBloomFilter(3000);
    final Key key = new Key();

    final Random rnd1 = new Random(seed);
    for (int i = 0; i < size; i++) {
        double d = rnd1.nextGaussian();
        String s = Double.toHexString(d);

        // Encode with an explicit charset: the no-arg getBytes() depends on the
        // platform default, which would make the key bytes JVM-dependent.
        key.set(s.getBytes(java.nio.charset.StandardCharsets.UTF_8), 1.0);
        dbf.add(key);
    }

    return dbf;
}
 
Example #3
Source File: BloomAndUDFTest.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that {@code BloomAndUDF.evaluate} on two serialized filters produces
 * the same serialized value as calling {@code and} on the filters directly.
 */
@Test
public void test() throws IOException, HiveException {
    final int size = 10000;
    final BloomAndUDF andUdf = new BloomAndUDF();

    final DynamicBloomFilter left = createBloomFilter(1L, size);
    final DynamicBloomFilter right = createBloomFilter(2L, size);

    // Capture the serialized inputs before 'left' is modified below.
    final Text leftText = BloomFilterUtils.serialize(left, new Text());
    final Text rightText = BloomFilterUtils.serialize(right, new Text());

    left.and(right);
    final Text expected = BloomFilterUtils.serialize(left, new Text());
    final Text actual = andUdf.evaluate(leftText, rightText);
    Assert.assertEquals(expected, actual);

    final DynamicBloomFilter roundTripped =
            BloomFilterUtils.deserialize(actual, new DynamicBloomFilter());
    assertNotContains(left, roundTripped, 1L, size);
    assertNotContains(left, roundTripped, 2L, size);
}
 
Example #4
Source File: BloomOrUDFTest.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a dynamic Bloom filter (created via
 * {@code BloomFilterUtils.newDynamicBloomFilter(3000)}) and adds {@code size}
 * keys to it. Each key is the hex-string form of a Gaussian sample drawn from
 * a {@link Random} seeded with {@code seed}, so the same arguments always
 * produce the same sequence of keys.
 *
 * @param seed seed of the pseudo-random key sequence
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    DynamicBloomFilter dbf = BloomFilterUtils.newDynamicBloomFilter(3000);
    final Key key = new Key();

    final Random rnd1 = new Random(seed);
    for (int i = 0; i < size; i++) {
        double d = rnd1.nextGaussian();
        String s = Double.toHexString(d);

        // Encode with an explicit charset: the no-arg getBytes() depends on the
        // platform default, which would make the key bytes JVM-dependent.
        key.set(s.getBytes(java.nio.charset.StandardCharsets.UTF_8), 1.0);
        dbf.add(key);
    }

    return dbf;
}
 
Example #5
Source File: BloomOrUDFTest.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that {@code BloomOrUDF.evaluate} on two serialized filters produces
 * the same serialized value as calling {@code or} on the filters directly.
 */
@Test
public void test() throws IOException, HiveException {
    final int size = 10000;
    final BloomOrUDF orUdf = new BloomOrUDF();

    final DynamicBloomFilter left = createBloomFilter(1L, size);
    final DynamicBloomFilter right = createBloomFilter(2L, size);

    // Capture the serialized inputs before 'left' is modified below.
    final Text leftText = BloomFilterUtils.serialize(left, new Text());
    final Text rightText = BloomFilterUtils.serialize(right, new Text());

    left.or(right);
    final Text expected = BloomFilterUtils.serialize(left, new Text());
    final Text actual = orUdf.evaluate(leftText, rightText);
    Assert.assertEquals(expected, actual);

    final DynamicBloomFilter roundTripped =
            BloomFilterUtils.deserialize(actual, new DynamicBloomFilter());
    assertEquals(left, roundTripped, 1L, size);
    assertEquals(left, roundTripped, 2L, size);
}
 
Example #6
Source File: BloomNotUDFTest.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a dynamic Bloom filter (created via
 * {@code BloomFilterUtils.newDynamicBloomFilter(3000)}) and adds {@code size}
 * keys to it. Each key is the hex-string form of a Gaussian sample drawn from
 * a {@link Random} seeded with {@code seed}, so the same arguments always
 * produce the same sequence of keys.
 *
 * @param seed seed of the pseudo-random key sequence
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    DynamicBloomFilter dbf = BloomFilterUtils.newDynamicBloomFilter(3000);
    final Key key = new Key();

    final Random rnd1 = new Random(seed);
    for (int i = 0; i < size; i++) {
        double d = rnd1.nextGaussian();
        String s = Double.toHexString(d);

        // Encode with an explicit charset: the no-arg getBytes() depends on the
        // platform default, which would make the key bytes JVM-dependent.
        key.set(s.getBytes(java.nio.charset.StandardCharsets.UTF_8), 1.0);
        dbf.add(key);
    }

    return dbf;
}
 
Example #7
Source File: BloomContainsUDFTest.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a dynamic Bloom filter via
 * {@code BloomFilterUtils.newDynamicBloomFilter(30)} and fills it with
 * {@code size} keys. Every key holds the bytes of a {@code Text} wrapping the
 * hex string of a Gaussian sample from a {@link Random} seeded with
 * {@code seed}.
 *
 * @param seed seed of the pseudo-random key sequence
 * @param size number of keys to add
 * @return the populated filter
 */
@Nonnull
private static DynamicBloomFilter createBloomFilter(long seed, int size) {
    final DynamicBloomFilter filter = BloomFilterUtils.newDynamicBloomFilter(30);
    final Key reusableKey = new Key();
    final Random gaussianSource = new Random(seed);

    for (int added = 0; added < size; added++) {
        final String hex = Double.toHexString(gaussianSource.nextGaussian());
        // The same Key instance is re-populated for every element.
        reusableKey.set(new Text(hex).copyBytes(), 1.0);
        filter.add(reusableKey);
    }
    return filter;
}
 
Example #8
Source File: BloomContainsUDFTest.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Populates a filter from a seeded key sequence, then replays the same
 * sequence and asserts that {@code BloomContainsUDF.evaluate} returns
 * {@code Boolean.TRUE} for each key.
 */
@Test
public void testUDF() throws IOException, HiveException {
    final long seed = 43L;
    final int size = 100;
    final BloomContainsUDF containsUdf = new BloomContainsUDF();

    final Text serializedFilter =
            BloomFilterUtils.serialize(createBloomFilter(seed, size), new Text());

    // A second Random with the same seed regenerates the inserted keys.
    final Random replay = new Random(seed);
    final Text key = new Text();
    for (int n = 0; n < size; n++) {
        key.set(Double.toHexString(replay.nextGaussian()));
        final Boolean found = containsUdf.evaluate(serializedFilter, key);
        Assert.assertEquals("Look up failed for key: " + key, Boolean.TRUE, found);
    }
}
 
Example #9
Source File: BloomContainsAnyUDF.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the filter encoded by {@code bloomStr}. The most recently
 * deserialized filter is kept in {@code prevBf} and returned again while the
 * incoming text equals {@code prevBfStr}; otherwise the text is deserialized
 * and both cache fields are replaced.
 *
 * @param bloomStr serialized Bloom filter
 * @return the cached or freshly deserialized filter
 * @throws HiveException wrapping an {@link IOException} raised by deserialization
 */
@Nonnull
private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException {
    // Cache hit: same serialized form as on the previous call.
    if (prevBf != null && prevBfStr.equals(bloomStr)) {
        return prevBf;
    }

    final Filter decoded;
    try {
        decoded = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter());
    } catch (IOException e) {
        throw new HiveException(e);
    }

    // Remember a copy of the text, not the caller's instance.
    this.prevBfStr = new Text(bloomStr);
    this.prevBf = decoded;
    return decoded;
}
 
Example #10
Source File: BloomContainsUDF.java    From incubator-hivemall with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the filter encoded by {@code bloomStr}. The most recently
 * deserialized filter is kept in {@code prevBf} and returned again while the
 * incoming text equals {@code prevBfStr}; otherwise the text is deserialized
 * and both cache fields are replaced.
 *
 * @param bloomStr serialized Bloom filter
 * @return the cached or freshly deserialized filter
 * @throws HiveException wrapping an {@link IOException} raised by deserialization
 */
@Nonnull
private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException {
    // Cache hit: same serialized form as on the previous call.
    if (prevBf != null && prevBfStr.equals(bloomStr)) {
        return prevBf;
    }

    final Filter decoded;
    try {
        decoded = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter());
    } catch (IOException e) {
        throw new HiveException(e);
    }

    // Remember a copy of the text, not the caller's instance.
    this.prevBfStr = new Text(bloomStr);
    this.prevBf = decoded;
    return decoded;
}
 
Example #11
Source File: BloomMapFile.java    From big-c with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the Bloom filter file located under {@code dirName} into
 * {@code bloomFilter}. When the file cannot be opened or read, a warning is
 * logged and {@code bloomFilter} is set to {@code null}.
 *
 * @param dirName directory that holds the {@code BLOOM_FILE_NAME} file
 * @param conf configuration used to obtain the file system for {@code dirName}
 */
private void initBloomFilter(Path dirName,
                             Configuration conf) {
  DataInputStream stream = null;
  try {
    final FileSystem fileSystem = dirName.getFileSystem(conf);
    final Path bloomPath = new Path(dirName, BLOOM_FILE_NAME);
    stream = fileSystem.open(bloomPath);

    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(stream);

    // Close explicitly so a failing close() is reported by the catch below;
    // clearing the reference keeps the finally block from closing it twice.
    stream.close();
    stream = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Only does real work when an exception left the stream open.
    IOUtils.closeStream(stream);
  }
}
 
Example #12
Source File: BloomFilterUDAF.java    From incubator-hivemall with Apache License 2.0 5 votes vote down vote up
/**
 * Folds a serialized partial filter into this aggregation state. The first
 * partial becomes {@code filter} itself; later ones are combined with
 * {@code filter.or(...)}.
 *
 * @param partial serialized Bloom filter to merge
 * @return always {@code true}
 * @throws HiveException wrapping an {@link IOException} raised by deserialization
 */
public boolean merge(@Nonnull Text partial) throws HiveException {
    final DynamicBloomFilter incoming;
    try {
        incoming = BloomFilterUtils.deserialize(partial, new DynamicBloomFilter());
    } catch (IOException e) {
        throw new HiveException(e);
    }

    if (filter != null) {
        filter.or(incoming);
    } else {
        // Nothing accumulated yet: adopt the incoming filter as the state.
        this.filter = incoming;
    }
    return true;
}
 
Example #13
Source File: BloomFilterUtils.java    From incubator-hivemall with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code DynamicBloomFilter} using {@code Hash.MURMUR_HASH}, with a
 * vector size of {@code ceil(-k*n / ln(1 - c^(1/k)))} where {@code k} is
 * {@code nbHash}, {@code n} is {@code expectedNumberOfElements} and {@code c}
 * is {@code errorRate}.
 *
 * @param expectedNumberOfElements value used as {@code n} in the sizing formula
 *        and passed as the last constructor argument
 * @param errorRate value used as {@code c} in the sizing formula
 * @param nbHash number of hash functions
 * @return a new filter
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
        @Nonnegative final int expectedNumberOfElements, @Nonnegative final float errorRate,
        @Nonnegative final int nbHash) {
    // Widen to double BEFORE multiplying: as an int product,
    // nbHash * expectedNumberOfElements silently wraps around for large inputs
    // and yields a meaningless vector size.
    final double numerator = -((double) nbHash) * expectedNumberOfElements;
    final double denominator = Math.log(1.d - Math.pow(errorRate, 1.d / nbHash));
    final int vectorSize = (int) Math.ceil(numerator / denominator);
    return new DynamicBloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH,
        expectedNumberOfElements);
}
 
Example #14
Source File: BloomFilterUtils.java    From incubator-hivemall with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code DynamicBloomFilter} for the given element count and error
 * rate, deriving the number of hash functions from the error rate (never
 * fewer than 2) and delegating to the three-argument overload.
 *
 * @param expectedNumberOfElements forwarded unchanged to the overload
 * @param errorRate forwarded to the overload and used to derive the hash count
 * @return a new filter
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
        @Nonnegative final int expectedNumberOfElements, @Nonnegative final float errorRate) {
    // k = ceil(-log_2(false prob.)), with a floor of 2
    final double negLog2OfErrorRate = -(Math.log(errorRate) / LOG2);
    final int computedHashes = (int) Math.ceil(negLog2OfErrorRate);
    final int nbHash = Math.max(2, computedHashes);
    return newDynamicBloomFilter(expectedNumberOfElements, errorRate, nbHash);
}
 
Example #15
Source File: BloomNotUDFTest.java    From incubator-hivemall with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code BloomNotUDF.evaluate} on a serialized filter yields a
 * filter whose string form equals that of the original after {@code not()}.
 */
@Test
public void test() throws IOException, HiveException {
    final BloomNotUDF notUdf = new BloomNotUDF();

    final DynamicBloomFilter source = createBloomFilter(1L, 10000);
    // Serialized before 'source' is modified below.
    final Text sourceText = BloomFilterUtils.serialize(source, new Text());

    final Text udfOutput = notUdf.evaluate(sourceText);
    final DynamicBloomFilter actual =
            BloomFilterUtils.deserialize(udfOutput, new DynamicBloomFilter());

    source.not();

    final String expectedString = source.toString();
    final String actualString = actual.toString();
    Assert.assertEquals(expectedString, actualString);
}
 
Example #16
Source File: BloomMapFile.java    From big-c with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@code DynamicBloomFilter} sized from configuration and
 * stores it in {@code bloomFilter}; {@code numKeys} and {@code vectorSize}
 * are updated as a side effect.
 *
 * @param conf supplies {@code io.mapfile.bloom.size} (default 1024 * 1024),
 *        {@code io.mapfile.bloom.error.rate} (default 0.005) and the hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: casting the finished product, as in
  // (double)(-HASH_COUNT * numKeys), still lets the integer multiplication
  // wrap around for a large configured key count.
  vectorSize = (int)Math.ceil((-HASH_COUNT * (double)numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example #17
Source File: BloomMapFile.java    From hadoop with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@code DynamicBloomFilter} sized from configuration and
 * stores it in {@code bloomFilter}; {@code numKeys} and {@code vectorSize}
 * are updated as a side effect.
 *
 * @param conf supplies {@code io.mapfile.bloom.size} (default 1024 * 1024),
 *        {@code io.mapfile.bloom.error.rate} (default 0.005) and the hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: casting the finished product, as in
  // (double)(-HASH_COUNT * numKeys), still lets the integer multiplication
  // wrap around for a large configured key count.
  vectorSize = (int)Math.ceil((-HASH_COUNT * (double)numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example #18
Source File: DistinctAggregator.java    From compiler with Apache License 2.0 5 votes vote down vote up
/** {@inheritDoc} */
@Override
public void start(final EmitKey key) {
	super.start(key);

	// Replace the filter on every start so each key gets a fresh instance.
	final int aggregatorArg = (int) this.getArg();
	this.filter = new DynamicBloomFilter(
			this.vectorSize, HASH_COUNT, Hash.MURMUR_HASH, aggregatorArg);
}
 
Example #19
Source File: BloomMapFile.java    From RDFS with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@code DynamicBloomFilter} sized from configuration and
 * stores it in {@code bloomFilter}; {@code numKeys} and {@code vectorSize}
 * are updated as a side effect.
 *
 * @param conf supplies {@code io.mapfile.bloom.size} (default 1024 * 1024),
 *        {@code io.mapfile.bloom.error.rate} (default 0.005) and the hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: casting the finished product, as in
  // (double)(-HASH_COUNT * numKeys), still lets the integer multiplication
  // wrap around for a large configured key count.
  vectorSize = (int)Math.ceil((-HASH_COUNT * (double)numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example #20
Source File: BloomMapFile.java    From RDFS with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Bloom filter file located under {@code dirName} into
 * {@code bloomFilter}. When the file cannot be opened or read, a warning is
 * logged and {@code bloomFilter} is set to {@code null}.
 *
 * @param fs file system used to open the file
 * @param dirName directory that holds the {@code BLOOM_FILE_NAME} file
 * @param conf not used by this method
 */
private void initBloomFilter(FileSystem fs, String dirName,
    Configuration conf) {
  DataInputStream in = null;
  try {
    in = fs.open(new Path(dirName, BLOOM_FILE_NAME));
    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(in);
    // Close on the success path inside the try so a failing close() is still
    // reported through the catch below, as before.
    in.close();
    in = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Reached with a non-null stream only when readFields() or close() threw;
    // without this the open stream would leak.
    if (in != null) {
      try {
        in.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: the original failure was already logged.
      }
    }
  }
}
 
Example #21
Source File: BloomMapFile.java    From hadoop-gpu with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@code DynamicBloomFilter} sized from configuration and
 * stores it in {@code bloomFilter}; {@code numKeys} and {@code vectorSize}
 * are updated as a side effect.
 *
 * @param conf supplies {@code io.mapfile.bloom.size} (default 1024 * 1024),
 *        {@code io.mapfile.bloom.error.rate} (default 0.005) and the hash type
 */
private synchronized void initBloomFilter(Configuration conf) {
  numKeys = conf.getInt("io.mapfile.bloom.size", 1024 * 1024);
  // vector size should be <code>-kn / (ln(1 - c^(1/k)))</code> bits for
  // single key, where <code>k</code> is the number of hash functions,
  // <code>n</code> is the number of keys and <code>c</code> is the desired
  // max. error rate.
  // Our desired error rate is by default 0.005, i.e. 0.5%
  float errorRate = conf.getFloat("io.mapfile.bloom.error.rate", 0.005f);
  // Widen to double BEFORE multiplying: casting the finished product, as in
  // (double)(-HASH_COUNT * numKeys), still lets the integer multiplication
  // wrap around for a large configured key count.
  vectorSize = (int)Math.ceil((-HASH_COUNT * (double)numKeys) /
      Math.log(1.0 - Math.pow(errorRate, 1.0/HASH_COUNT)));
  bloomFilter = new DynamicBloomFilter(vectorSize, HASH_COUNT,
      Hash.getHashType(conf), numKeys);
}
 
Example #22
Source File: BloomMapFile.java    From hadoop-gpu with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Bloom filter file located under {@code dirName} into
 * {@code bloomFilter}. When the file cannot be opened or read, a warning is
 * logged and {@code bloomFilter} is set to {@code null}.
 *
 * @param fs file system used to open the file
 * @param dirName directory that holds the {@code BLOOM_FILE_NAME} file
 * @param conf not used by this method
 */
private void initBloomFilter(FileSystem fs, String dirName,
    Configuration conf) {
  DataInputStream in = null;
  try {
    in = fs.open(new Path(dirName, BLOOM_FILE_NAME));
    bloomFilter = new DynamicBloomFilter();
    bloomFilter.readFields(in);
    // Close on the success path inside the try so a failing close() is still
    // reported through the catch below, as before.
    in.close();
    in = null;
  } catch (IOException ioe) {
    LOG.warn("Can't open BloomFilter: " + ioe + " - fallback to MapFile.");
    bloomFilter = null;
  } finally {
    // Reached with a non-null stream only when readFields() or close() threw;
    // without this the open stream would leak.
    if (in != null) {
      try {
        in.close();
      } catch (IOException ignored) {
        // Best-effort cleanup: the original failure was already logged.
      }
    }
  }
}
 
Example #23
Source File: BloomFilterUtils.java    From incubator-hivemall with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a {@code DynamicBloomFilter} for the given element count using
 * {@code DEFAULT_ERROR_RATE}.
 *
 * @param expectedNumberOfElements forwarded unchanged to the two-argument overload
 * @return a new filter
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter(
        @Nonnegative final int expectedNumberOfElements) {
    final DynamicBloomFilter created =
            newDynamicBloomFilter(expectedNumberOfElements, DEFAULT_ERROR_RATE);
    return created;
}
 
Example #24
Source File: BloomFilterUtils.java    From incubator-hivemall with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a {@code DynamicBloomFilter} from the class defaults
 * {@code DEFAULT_BLOOM_FILTER_SIZE}, {@code DEFAULT_ERROR_RATE} and
 * {@code NUM_HASHES}.
 *
 * @return a new filter
 */
@Nonnull
public static DynamicBloomFilter newDynamicBloomFilter() {
    final DynamicBloomFilter created = newDynamicBloomFilter(
        DEFAULT_BLOOM_FILTER_SIZE, DEFAULT_ERROR_RATE, NUM_HASHES);
    return created;
}