/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.joshua.decoder.ff.lm.bloomfilter_lm;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.math.BigInteger;
import java.util.BitSet;
import java.util.Random;

/**
 * A Bloom filter: a lossy data structure for set representation. A Bloom filter consists of a bit
 * set and a set of hash functions. A Bloom filter has two operations: add and query. We can add an
 * object to a Bloom filter to indicate that it should be considered part of the set that the Bloom
 * filter represents. We can query the Bloom filter to see if a given object is considered part of
 * its set.
 * <p>
 * An object is added by sending it through a number of hash functions, each of which returns an
 * index into the bit set. The bit at each of the indices is flipped on. We can query for an object
 * by sending it through the same hash functions. Then we look at the bit at each index that was
 * returned by a hash function. If any of the bits is unset, we know that the object is not in the
 * Bloom filter (for otherwise all the bits should have already been set). If all the bits are set,
 * we assume that the object is present in the Bloom filter.
 * <p>
 * We cannot know for sure that an object is in the bloom filter just because all its bits were set.
 * There may be many collisions in the hash space, and all the bits for some object might be set by
 * chance, rather than by adding that particular object.
 * <p>
 * The advantage of a Bloom filter is that its set representation can be stored in a significantly
 * smaller space than information-theoretic lossless lower bounds. The price we pay for this is a
 * certain amount of error in the query function. One nice feature of the Bloom filter is that its
 * error is one-sided. This means that while the query function may return false positives (saying
 * an object is present when it really isn't), it can never return false negatives (saying that an
 * object is not present when it was already added).
 */
public class BloomFilter implements Externalizable {
  /**
   * The main bit set of the Bloom filter.
   */
  private BitSet bitSet;

  /**
   * The number of objects expected to be stored in the Bloom filter. The optimal number of hash
   * functions depends on this number.
   */
  int expectedNumberOfObjects;

  /**
   * A prime number that should be bigger than the size of the bit set.
   */
  long bigPrime;

  /**
   * The size of the bit set, in bits.
   */
  int filterSize;

  /**
   * A random number generator for building hash functions and for picking the prime. It is not
   * part of the externalized state.
   */
  private final transient Random random = new Random();

  /**
   * Builds an empty Bloom filter, ready to build hash functions and store objects.
   * 
   * @param filterSize the size of Bloom filter to make, in bits; must be positive
   * @param expectedNumberOfObjects the number of objects expected to be stored in the Bloom filter
   * @throws IllegalArgumentException if {@code filterSize} is not positive
   */
  public BloomFilter(int filterSize, int expectedNumberOfObjects) {
    if (filterSize <= 0) {
      // A non-positive size cannot back a bit set and would make the modulus in hash() invalid.
      throw new IllegalArgumentException("filterSize must be positive, got " + filterSize);
    }
    bitSet = new BitSet(filterSize);
    this.filterSize = filterSize;
    this.expectedNumberOfObjects = expectedNumberOfObjects;
    bigPrime = getPrimeLargerThan(filterSize);
  }

  /**
   * Adds an item (represented by an integer) to the bloom filter.
   * 
   * @param objectToAdd the object to add
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used on
   *        the object
   */
  public void add(int objectToAdd, long[][] hashFunctions) {
    insert((long) objectToAdd, hashFunctions);
  }

  /**
   * Adds an item (represented by a long) to the bloom filter.
   * 
   * @param objectToAdd the object to add
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used on
   *        the object
   */
  public void add(long objectToAdd, long[][] hashFunctions) {
    insert(objectToAdd, hashFunctions);
  }

  /**
   * Determines whether an item (represented by an integer) is present in the bloom filter.
   * 
   * @param objectToQuery the object we want to query for membership
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used
   * 
   * @return true if the objects is assumed to be present in the Bloom filter, false if it is
   *         definitely not present
   */
  public boolean query(int objectToQuery, long[][] hashFunctions) {
    return contains((long) objectToQuery, hashFunctions);
  }

  /**
   * Determines whether an item (represented by a long) is present in the bloom filter.
   * 
   * @param objectToQuery the object we want to query for membership
   * @param hashFunctions an array of pairs of long, representing the hash functions to be used
   * 
   * @return true if the objects is assumed to be present in the Bloom filter, false if it is
   *         definitely not present
   */
  public boolean query(long objectToQuery, long[][] hashFunctions) {
    return contains(objectToQuery, hashFunctions);
  }

  /**
   * Builds an array of pairs of long that can be used as hash functions for this Bloom filter.
   * The number of functions is {@code floor(ln(2) * filterSize / expectedNumberOfObjects)}, with a
   * minimum of one.
   * 
   * @return an array of pairs of long suitable for use as hash functions
   */
  public long[][] initializeHashFunctions() {
    int numberOfHashFunctions = 1;
    if (expectedNumberOfObjects > 0) {
      // Use the configured capacity rather than bitSet.length(): length() is the index of the
      // highest set bit plus one, which is 0 for a filter that has nothing added yet.
      int optimal = (int) Math.floor(Math.log(2) * filterSize / expectedNumberOfObjects);
      numberOfHashFunctions = Math.max(1, optimal);
    }
    long[][] hashFunctions = new long[numberOfHashFunctions][2];
    for (long[] h : hashFunctions) {
      // The multiplier stays strictly below bigPrime: a multiplier equal to bigPrime would send
      // every object to the same residue.
      h[0] = randomLongUpTo(bigPrime - 1);
      h[1] = randomLongUpTo(bigPrime);
    }
    return hashFunctions;
  }

  /**
   * Sets the bit selected by every hash function for the given object.
   */
  private void insert(long object, long[][] hashFunctions) {
    for (long[] h : hashFunctions) {
      bitSet.set(hash(h, object));
    }
  }

  /**
   * Reports whether the bit selected by every hash function for the given object is set.
   */
  private boolean contains(long object, long[][] hashFunctions) {
    for (long[] h : hashFunctions) {
      if (!bitSet.get(hash(h, object))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Draws a random long in the range {@code [1, bound]}. Drawing a long (instead of narrowing the
   * bound to int) keeps this valid when the bound does not fit in an int.
   */
  private long randomLongUpTo(long bound) {
    // Dropping the sign bit gives a non-negative value, so the remainder is in [0, bound).
    return (random.nextLong() >>> 1) % bound + 1;
  }

  /**
   * Determines which bit of the bit set should be either set, for add operations, or checked, for
   * query operations.
   * 
   * @param h a length-2 array of long used as a hash function
   * @param objectToHash the object of interest
   * 
   * @return an index into the bit set of the Bloom filter, in the range {@code [0, filterSize)}
   */
  private int hash(long[] h, long objectToHash) {
    // The arithmetic below is kept exactly as it was so that every index that used to be valid is
    // still produced for the same input; filters that were already written out stay readable.
    long obj = (objectToHash < Integer.MAX_VALUE) ? objectToHash : objectToHash - bigPrime;
    long h0 = h[0];
    long h1 = (h[1] < (Long.MAX_VALUE / 2)) ? h[1] : h[1] - bigPrime;
    long ret = (obj * h0) % bigPrime;
    ret = (ret < (Long.MAX_VALUE / 2)) ? ret : ret - bigPrime;
    long index = ((ret + h1) % bigPrime) % (long) filterSize;
    if (index < 0) {
      // Java's % keeps the sign of the dividend, so negative objects can give a negative
      // remainder. Wrap it into range instead of handing BitSet a negative index.
      index += filterSize;
    }
    return (int) index;
  }

  /**
   * Finds a prime number that is larger than the given number. This is used to find bigPrime, a
   * prime that has to be larger than the size of the Bloom filter. The prime is requested with one
   * more bit than {@code n} occupies, which makes it larger than {@code n}.
   * 
   * @param n an integer
   * 
   * @return a prime number larger than n
   */
  private long getPrimeLargerThan(int n) {
    BigInteger ret;
    BigInteger maxLong = BigInteger.valueOf(Long.MAX_VALUE);
    int numBits = BigInteger.valueOf(n).bitLength() + 1;
    do {
      ret = BigInteger.probablePrime(numBits, random);
      // compareTo yields -1, 0 or 1, so "greater than maxLong" has to be tested with > 0.
    } while (ret.compareTo(maxLong) > 0);
    return ret.longValue();
  }

  /*
   * Methods implementing the Externalizable interface. Both sides use the same field order:
   * expectedNumberOfObjects, filterSize, bigPrime, bitSet.
   */

  /**
   * Restores the state of this Bloom filter from the given input.
   * 
   * @param in the input to read the filter from
   * @throws IOException if reading from the input fails
   * @throws ClassNotFoundException if the class of the stored bit set cannot be found
   */
  @Override
  public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    expectedNumberOfObjects = in.readInt();
    filterSize = in.readInt();
    bigPrime = in.readLong();
    bitSet = (BitSet) in.readObject();
  }

  /**
   * Writes the state of this Bloom filter to the given output.
   * 
   * @param out the output to write the filter to
   * @throws IOException if writing to the output fails
   */
  @Override
  public void writeExternal(ObjectOutput out) throws IOException {
    out.writeInt(expectedNumberOfObjects);
    out.writeInt(filterSize);
    out.writeLong(bigPrime);
    out.writeObject(bitSet);
  }

  /**
   * Only used for reconstruction via Externalizable; the fields are filled in by
   * {@link #readExternal(ObjectInput)}.
   */
  public BloomFilter() {}
}