/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.joshua.decoder.ff.lm.bloomfilter_lm;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.math.BigInteger;
import java.util.BitSet;
import java.util.Random;

/**
 * A Bloom filter: a lossy data structure for set representation. A Bloom filter consists of a bit
 * set and a set of hash functions. A Bloom filter has two operations: add and query. We can add an
 * object to a Bloom filter to indicate that it should be considered part of the set that the Bloom
 * filter represents. We can query the Bloom filter to see if a given object is considered part of
 * its set.
 * <p>
 * An object is added by sending it through a number of hash functions, each of which returns an
 * index into the bit set. The bit at each of the indices is flipped on. We can query for an object
 * by sending it through the same hash functions. Then we look at the bit at each index that was
 * returned by a hash function. If any of the bits is unset, we know that the object is not in the
 * Bloom filter (for otherwise all the bits should have already been set). If all the bits are set,
 * we assume that the object is present in the Bloom filter.
 * <p>
 * We cannot know for sure that an object is in the Bloom filter just because all its bits were set.
 * There may be many collisions in the hash space, and all the bits for some object might be set by
 * chance, rather than by adding that particular object.
 * <p>
 * The advantage of a Bloom filter is that its set representation can be stored in a significantly
 * smaller space than information-theoretic lossless lower bounds. The price we pay for this is a
 * certain amount of error in the query function. One nice feature of the Bloom filter is that its
 * error is one-sided. This means that while the query function may return false positives (saying
 * an object is present when it really isn't), it can never return false negatives (saying that an
 * object is not present when it was already added).
 * <p>
 * The hash functions are not stored in the filter: callers obtain them from
 * {@link #initializeHashFunctions()} and pass the same array to every add and query call.
 */
public class BloomFilter implements Externalizable {
  /**
   * The main bit set of the Bloom filter.
   */
  private BitSet bitSet;

  /**
   * The number of objects expected to be stored in the Bloom filter. The optimal number of hash
   * functions depends on this number.
   */
  int expectedNumberOfObjects;

  /**
   * A prime number that should be bigger than the size of the bit set.
   */
  long bigPrime;

  /**
   * The size of the bit set, in bits.
   */
  int filterSize;

  /**
   * A random number generator for building hash functions. It is transient and is recreated by
   * the field initializer whenever an instance is constructed.
   */
  private final transient Random random = new Random();

  /**
   * Builds an empty Bloom filter, ready to build hash functions and store objects.
   *
   * @param filterSize the size of Bloom filter to make, in bits; must be positive
   * @param expectedNumberOfObjects the number of objects expected to be stored in the Bloom filter
   * @throws IllegalArgumentException if {@code filterSize} is not positive
   */
  public BloomFilter(int filterSize, int expectedNumberOfObjects) {
    // A non-positive size cannot back a bit set and would make the final "% filterSize" in
    // hash() fail, so reject it here with a descriptive error.
    if (filterSize <= 0) {
      throw new IllegalArgumentException("filterSize must be positive: " + filterSize);
    }
    this.bitSet = new BitSet(filterSize);
    this.filterSize = filterSize;
    this.expectedNumberOfObjects = expectedNumberOfObjects;
    this.bigPrime = getPrimeLargerThan(filterSize);
  }

  /**
   * Creates an uninitialized filter. Only used for reconstruction via {@link Externalizable};
   * the fields are populated by {@link #readExternal(ObjectInput)}.
   */
  public BloomFilter() {}

  /**
   * Adds an item (represented by an integer) to the Bloom filter.
   *
   * @param objectToAdd the object to add
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used on
   *        the object
   */
  public void add(int objectToAdd, long[][] hashFunctions) {
    add((long) objectToAdd, hashFunctions);
  }

  /**
   * Adds an item (represented by a long) to the Bloom filter by setting one bit per hash function.
   *
   * @param objectToAdd the object to add
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used on
   *        the object
   */
  public void add(long objectToAdd, long[][] hashFunctions) {
    for (long[] h : hashFunctions) {
      bitSet.set(hash(h, objectToAdd));
    }
  }

  /**
   * Determines whether an item (represented by an integer) is present in the Bloom filter.
   *
   * @param objectToQuery the object we want to query for membership
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used
   *
   * @return true if the object is assumed to be present in the Bloom filter, false if it is
   *         definitely not present
   */
  public boolean query(int objectToQuery, long[][] hashFunctions) {
    return query((long) objectToQuery, hashFunctions);
  }

  /**
   * Determines whether an item (represented by a long) is present in the Bloom filter.
   *
   * @param objectToQuery the object we want to query for membership
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used
   *
   * @return true if the object is assumed to be present in the Bloom filter, false if it is
   *         definitely not present
   */
  public boolean query(long objectToQuery, long[][] hashFunctions) {
    for (long[] h : hashFunctions) {
      if (!bitSet.get(hash(h, objectToQuery))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Builds an array of pairs of long that can be used as hash functions for this Bloom filter.
   * The number of pairs is {@code floor(ln(2) * filterSize / expectedNumberOfObjects)}, but never
   * fewer than one; a non-positive {@code expectedNumberOfObjects} yields exactly one pair. Each
   * coefficient is drawn uniformly from {@code [1, bigPrime]}.
   *
   * @return an array of pairs of long suitable for use as hash functions
   * @throws IllegalArgumentException if {@code bigPrime} is not positive, as on an instance
   *         created with the no-argument constructor and not yet populated
   */
  public long[][] initializeHashFunctions() {
    int numberOfHashFunctions = 1;
    if (expectedNumberOfObjects > 0) {
      // Use the configured capacity, not bitSet.length(): length() is the index of the highest
      // set bit plus one, which is 0 for a filter that has not had anything added yet.
      int optimal = (int) Math.floor(Math.log(2) * filterSize / expectedNumberOfObjects);
      numberOfHashFunctions = Math.max(1, optimal);
    }

    long[][] hashFunctions = new long[numberOfHashFunctions][2];
    for (long[] h : hashFunctions) {
      h[0] = nextLongBelow(bigPrime) + 1;
      h[1] = nextLongBelow(bigPrime) + 1;
    }
    return hashFunctions;
  }

  /**
   * Draws a uniformly distributed value from {@code [0, bound)}.
   *
   * @param bound the exclusive upper bound; must be positive
   * @return a random value that is at least 0 and less than {@code bound}
   * @throws IllegalArgumentException if {@code bound} is not positive
   */
  private long nextLongBelow(long bound) {
    if (bound <= 0) {
      throw new IllegalArgumentException("bound must be positive: " + bound);
    }
    if (bound <= Integer.MAX_VALUE) {
      return random.nextInt((int) bound);
    }
    // bigPrime needs 32 bits once the filter has 2^30 or more bits, which no longer fits the
    // int-bounded nextInt. Draw 63 random bits and reject the draws that would bias the result.
    long bits;
    long candidate;
    do {
      bits = random.nextLong() >>> 1;
      candidate = bits % bound;
    } while (bits - candidate + (bound - 1) < 0);
    return candidate;
  }

  /**
   * Determines which bit of the bit set should be either set, for add operations, or checked, for
   * query operations. The index is {@code ((obj * h[0]) mod bigPrime + h[1]) mod bigPrime mod
   * filterSize}.
   *
   * @param h a length-2 array of long used as a hash function
   * @param objectToHash the object of interest
   *
   * @return an index into the bit set of the Bloom filter, in {@code [0, filterSize)}
   */
  private int hash(long[] h, long objectToHash) {
    // The three range adjustments below are kept exactly as they were so that every object that
    // already mapped to a valid index keeps that index (filters can be persisted and reloaded).
    long obj = (objectToHash < Integer.MAX_VALUE) ? objectToHash : objectToHash - bigPrime;
    long h0 = h[0];
    long h1 = (h[1] < (Long.MAX_VALUE / 2)) ? h[1] : h[1] - bigPrime;
    long ret = (obj * h0) % bigPrime;
    ret = (ret < (Long.MAX_VALUE / 2)) ? ret : ret - bigPrime;

    long reduced = (ret + h1) % bigPrime;
    // Java's % keeps the sign of the dividend, so a negative object (or an overflowed product)
    // can leave a negative remainder here; shift it into [0, bigPrime) so the index is valid.
    if (reduced < 0) {
      reduced += bigPrime;
    }
    return (int) (reduced % (long) filterSize);
  }

  /**
   * Finds a prime number that is larger than the given number. This is used to find bigPrime, a
   * prime that has to be larger than the size of the Bloom filter. The candidate has one more bit
   * than {@code n}, which makes it strictly greater than {@code n}.
   *
   * @param n an integer
   *
   * @return a (probable) prime number larger than n
   */
  private long getPrimeLargerThan(int n) {
    BigInteger maxLong = BigInteger.valueOf(Long.MAX_VALUE);
    int numBits = BigInteger.valueOf(n).bitLength() + 1;
    BigInteger candidate;
    do {
      candidate = BigInteger.probablePrime(numBits, random);
      // compareTo returns -1, 0 or 1, so "greater than" must be tested with "> 0".
    } while (candidate.compareTo(maxLong) > 0);
    return candidate.longValue();
  }

  /*
   * functions for interface externalizable
   */

  /**
   * Restores the filter state. Reads, in order, the expected number of objects, the filter size,
   * the prime modulus and the bit set, mirroring {@link #writeExternal(ObjectOutput)}.
   *
   * @param in the stream to read the state from
   * @throws IOException if reading from the stream fails
   * @throws ClassNotFoundException if the class of the stored bit set cannot be found
   */
  @Override
  public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    expectedNumberOfObjects = in.readInt();
    filterSize = in.readInt();
    bigPrime = in.readLong();
    bitSet = (BitSet) in.readObject();
  }

  /**
   * Saves the filter state. Writes, in order, the expected number of objects, the filter size,
   * the prime modulus and the bit set. Hash functions are not part of the filter and are not
   * written.
   *
   * @param out the stream to write the state to
   * @throws IOException if writing to the stream fails
   */
  @Override
  public void writeExternal(ObjectOutput out) throws IOException {
    out.writeInt(expectedNumberOfObjects);
    out.writeInt(filterSize);
    out.writeLong(bigPrime);
    out.writeObject(bitSet);
  }
}