package pitt.search.semanticvectors.vectors;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.FixedBitSet;

/**
 * Binary implementation of Vector.
 *
 * Uses an "elemental" representation which is a single bit string (Lucene FixedBitSet).
 *
 * Superposes on this a "semantic" representation which contains the weights with which different
 * vectors have been added (superposed) onto this one. Calling {@link #superpose} causes the
 * voting record to be updated, but for performance the votes are not tallied back into the
 * elemental bit set representation until {@link #normalize}, {@link #normalizeBSC} or
 * {@link #tallyVotes} is called.
 *
 * @author Trevor Cohen
 */
public class BinaryVector implements Vector {

  /**
   * Enumeration of normalization options.
   */
  public enum BinaryNormalizationMethod {
    /**
     * Set to one if more ones than zeros recorded in the voting record for
     * this dimension (zeros are recorded implicitly, by counting total votes),
     * and split (alternating between tied dimensions) if the same number of ones and zeros.
     * Otherwise zero.
     */
    SPATTERCODE,

    /**
     * The probability of a one is equal to the probability of the proportion of 1 to 0
     * votes for this dimension, assuming a Gaussian distribution.
     */
    PROBABILISTIC
  }

  public static final Logger logger = Logger.getLogger(BinaryVector.class.getCanonicalName());

  /** Normalization method used by {@link #normalize()} for all binary vectors. */
  public static BinaryNormalizationMethod NORMALIZE_METHOD = BinaryNormalizationMethod.SPATTERCODE;

  /** Globally sets the normalization method used by {@link #normalize()}. */
  public static void setNormalizationMethod(BinaryNormalizationMethod normalizationMethod) {
    logger.info("Globally setting binary vector NORMALIZATION_METHOD to: '" + normalizationMethod + "'");
    NORMALIZE_METHOD = normalizationMethod;
  }

  /** Returns {@link VectorType#BINARY} */
  public VectorType getVectorType() { return VectorType.BINARY; }

  // TODO: Determine proper interface for default constants.
  /**
   * Number of decimal places to consider in weighted superpositions of binary vectors.
   * Higher precision requires additional memory during training.
   */
  public static final int BINARY_VECTOR_DECIMAL_PLACES = 2;
  public static final boolean BINARY_BINDING_WITH_PERMUTE = false;

  private static int DEBUG_PRINT_LENGTH = 64;
  private Random random;
  private final int dimension;

  /**
   * Elemental representation for binary vectors.
   */
  protected FixedBitSet bitSet;
  private boolean isSparse;
  private AtomicBoolean unTallied = new AtomicBoolean(true);

  /**
   * Representation of voting record for superposition. Each FixedBitSet object contains one bit
   * of the count for the vote in each dimension. The count for any given dimension is derived from
   * all of the bits in that dimension across the FixedBitSets in the voting record.
   *
   * Weights are scaled by 10^{@link #BINARY_VECTOR_DECIMAL_PLACES} and rounded to the nearest
   * integer before being added to the record.
   */
  private List<FixedBitSet> votingRecord;

  /** Sum of the (scaled) weights with which vectors have been added into the voting record. */
  private AtomicLong totalNumberOfVotes = new AtomicLong(0);

  /**
   * Count that has been subtracted from every dimension of the voting record by
   * {@link #decrement(int)} to keep the record small; it is added back when votes are
   * interpreted.
   */
  private long minimum = 0;

  // Used only for temporary internal storage.
  private FixedBitSet tempSet;

  /**
   * Creates a zero-valued elemental (sparse) vector.
   *
   * @param dimension number of bits; must be a multiple of 64
   * @throws IllegalArgumentException if dimension is not a multiple of 64
   */
  public BinaryVector(int dimension) {
    // Check "multiple-of-64" constraint, to facilitate permutation of 64-bit chunks
    if (dimension % 64 != 0) {
      throw new IllegalArgumentException("Dimension should be a multiple of 64: "
          + dimension + " will lead to trouble!");
    }
    this.dimension = dimension;
    this.bitSet = new FixedBitSet(dimension);
    this.isSparse = true;
    this.random = new Random();
  }

  /**
   * Returns a new, fully independent copy of this vector.
   *
   * The rows of the voting record are cloned individually: superposition mutates rows
   * in place, so sharing them between the copy and the original would let changes to
   * one leak into the other.
   */
  public BinaryVector copy() {
    BinaryVector copy = new BinaryVector(dimension);
    copy.bitSet = bitSet.clone();
    if (!isSparse && votingRecord != null) {
      List<FixedBitSet> rows = new ArrayList<FixedBitSet>(votingRecord.size());
      for (FixedBitSet row : votingRecord) {
        rows.add(row.clone());
      }
      copy.votingRecord = rows;
    }
    if (tempSet != null) {
      copy.tempSet = tempSet.clone();
    }
    copy.minimum = minimum;
    copy.totalNumberOfVotes = new AtomicLong(totalNumberOfVotes.longValue());
    copy.unTallied = new AtomicBoolean(unTallied.get());
    copy.isSparse = isSparse;
    return copy;
  }

  /**
   * Returns a debugging description of the first {@code DEBUG_PRINT_LENGTH} dimensions
   * (capped at the vector's dimension).
   *
   * For dense vectors the "NORMALIZED" section is computed on a copy, so calling this
   * method does not alter the vector or its voting record.
   */
  @Override
  public String toString() {
    StringBuilder debugString = new StringBuilder("");
    // Never index past the end of the bit sets, whatever the debug length is set to.
    int printLength = Math.min(DEBUG_PRINT_LENGTH, dimension);
    if (isSparse) {
      debugString.append(" Sparse. First " + printLength + " values are:\n");
      for (int x = 0; x < printLength; x++) {
        debugString.append(bitSet.get(x) ? "1 " : "0 ");
      }
      debugString.append("\nCardinality " + bitSet.cardinality() + "\n");
    } else {
      debugString.append(" Dense. First " + printLength + " values are:\n");
      for (int x = 0; x < printLength; x++) {
        debugString.append(bitSet.get(x) ? "1" : "0");
      }
      // Output voting record for first printLength dimensions.
      debugString.append("\nVOTING RECORD: \n");
      for (int y = 0; y < votingRecord.size(); y++) {
        for (int x = 0; x < printLength; x++) {
          debugString.append(votingRecord.get(y).get(x) ? "1 " : "0 ");
        }
        debugString.append("\n");
      }
      // Calculate actual counts for the first printLength dimensions.
      double[] actualvals = new double[printLength];
      debugString.append("COUNTS : ");
      for (int x = 0; x < votingRecord.size(); x++) {
        for (int y = 0; y < printLength; y++) {
          if (votingRecord.get(x).get(y)) actualvals[y] += Math.pow(2, x);
        }
      }
      for (int x = 0; x < printLength; x++) {
        debugString.append((long) ((minimum + actualvals[x])
            / Math.pow(10, BINARY_VECTOR_DECIMAL_PLACES)) + " ");
      }

      debugString.append("\nNORMALIZED: ");
      // Normalize a copy: normalizing this vector would discard its voting record.
      BinaryVector normalized = copy();
      normalized.normalize();
      for (int x = 0; x < printLength; x++) {
        debugString.append(normalized.bitSet.get(x) + " ");
      }
      debugString.append("\n");
      debugString.append("\nCardinality " + normalized.bitSet.cardinality() + "\n");
      debugString.append("Votes " + totalNumberOfVotes.get() + "\n");
      debugString.append("Minimum " + minimum + "\n");
      debugString.append("\n");
    }
    return debugString.toString();
  }

  @Override
  public int getDimension() {
    return dimension;
  }

  /**
   * Creates a new zero vector of the given dimension.
   *
   * @throws IllegalArgumentException if dimension is not a multiple of 64
   */
  public BinaryVector createZeroVector(int dimension) {
    // Check "multiple-of-64" constraint, to facilitate permutation of 64-bit chunks
    if (dimension % 64 != 0) {
      logger.severe("Dimension should be a multiple of 64: "
          + dimension + " will lead to trouble!");
    }
    return new BinaryVector(dimension);
  }

  /**
   * Returns true if no bits are set (sparse vectors) or the voting record is absent,
   * empty, or consists of a single all-zero row (dense vectors).
   */
  @Override
  public boolean isZeroVector() {
    if (isSparse) {
      return bitSet.cardinality() == 0;
    } else {
      return (votingRecord == null) || (votingRecord.size() == 0)
          || (votingRecord.size() == 1 && votingRecord.get(0).cardinality() == 0);
    }
  }

  /**
   * Generates a basic elemental vector with a given number of 1's and otherwise 0's.
   * For binary vectors, the number of 1's and 0's must be the same, half the dimension;
   * any other value of numEntries is logged and replaced by dimension / 2.
   *
   * @param dimension number of bits; must be a multiple of 64
   * @param numEntries number of bits to set
   * @param random source of randomness used to choose which bits are set
   * @return representation of basic binary vector.
   * @throws IllegalArgumentException if dimension is not a multiple of 64
   */
  @Override
  public BinaryVector generateRandomVector(int dimension, int numEntries, Random random) {
    // Check "multiple-of-64" constraint, to facilitate permutation of 64-bit chunks
    if (dimension % 64 != 0) {
      throw new IllegalArgumentException("Dimension should be a multiple of 64: "
          + dimension + " will lead to trouble!");
    }
    // Check for balance between 1's and 0's
    if (numEntries != dimension / 2) {
      logger.severe("Attempting to create binary vector with unequal number of zeros and ones."
          + " Unlikely to produce meaningful results. Therefore, seedlength has been set to "
          + " dimension/2, as recommended for binary vectors");
      numEntries = dimension / 2;
    }

    BinaryVector randomVector = new BinaryVector(dimension);
    List<Integer> dimensions = new ArrayList<Integer>(dimension);
    for (int q = 0; q < dimension; q++) {
      dimensions.add(q);
    }
    Collections.shuffle(dimensions, random);
    for (int r = 0; r < numEntries; r++) {
      randomVector.bitSet.set(dimensions.get(r));
    }
    return randomVector;
  }

  /**
   * Measures overlap of two vectors using 1 - normalized Hamming distance, rescaled so
   * that identical bit sets score 1 and complementary bit sets score -1.
   *
   * Compares the current elemental bit sets directly; untallied votes are not counted.
   * Returns 0 if either vector is a zero vector.
   */
  @Override
  public double measureOverlap(Vector other) {
    IncompatibleVectorsException.checkVectorsCompatible(this, other);
    if (isZeroVector()) return 0;
    BinaryVector binaryOther = (BinaryVector) other;
    if (binaryOther.isZeroVector()) return 0;

    // Calculate hamming distance in place. Have not checked if this is fastest performance.
    double hammingDistance = BinaryVectorUtils.xorCount(this.bitSet, binaryOther.bitSet);
    return 2 * (0.5 - (hammingDistance / (double) dimension));
  }

  /**
   * Adds the other vector to this one. If this vector was an elemental vector, the
   * "semantic vector" components (i.e. the voting record and temporary bitset) will be
   * initialized.
   *
   * A negative weight subtracts the other vector, implemented by adding its complement
   * with the absolute weight. The other vector itself is never modified.
   *
   * @param other vector to add; must be a compatible BinaryVector
   * @param weight weight of the addition; rounded to {@link #BINARY_VECTOR_DECIMAL_PLACES}
   *        decimal places
   * @param permutation optional permutation of 64-bit blocks applied to the other vector
   *        before adding, or null for none
   * @throws IllegalArgumentException if permutation is non-null and its length is not
   *         dimension / 64
   */
  @Override
  public synchronized void superpose(Vector other, double weight, int[] permutation) {
    IncompatibleVectorsException.checkVectorsCompatible(this, other);
    if (weight == 0d) return;
    if (other.isZeroVector()) return;
    BinaryVector binaryOther = (BinaryVector) other;

    // Validate before touching any state, so a bad argument leaves both vectors unchanged.
    if (permutation != null && permutation.length != dimension / 64) {
      throw new IllegalArgumentException("Binary vector of dimension " + dimension
          + " must have permutation of length " + dimension / 64
          + " not " + permutation.length);
    }

    FixedBitSet incoming = binaryOther.bitSet;
    if (weight < 0) {
      // Subtraction: add the complement. Flip a private clone rather than the other
      // vector's own bits, which may be shared with other threads or callers.
      weight = Math.abs(weight);
      incoming = incoming.clone();
      incoming.flip(0, dimension);
    }

    if (isSparse) {
      elementalToSemantic();
    }

    if (permutation != null) {
      // Rather than permuting individual dimensions, we permute 64 bit groups at a time.
      // This should be considerably quicker, and dimension/64 should allow for sufficient
      // permutations
      incoming = permutedCopy(incoming, permutation, dimension);
    }
    superposeBitSet(incoming, weight);

    unTallied.set(true); // there are votes that haven't been tallied yet
  }

  /**
   * This method is the first of two required to facilitate superposition. The underlying representation
   * (i.e. the voting record) is a list of FixedBitSet, each with dimension "dimension", which can
   * be thought of as an expanding 2D array of bits. Each column keeps count (in binary) for the respective
   * dimension, and columns are incremented in parallel by sweeping a bitset across the rows. In any dimension
   * in which the BitSet to be added contains a "1", the effect will be that 1's are changed to 0's until a
   * new 1 is added (e.g. the column '110' would become '001' and so forth).
   *
   * The first method deals with floating point issues, and accelerates superposition by decomposing
   * the task into segments.
   *
   * @param incomingBitSet the bitset to be added
   * @param weight the weight with which to add it
   */
  protected synchronized void superposeBitSet(FixedBitSet incomingBitSet, double weight) {
    // Encode all weights as integers (10^BINARY_VECTOR_DECIMAL_PLACES x double value).
    weight = (int) Math.round(weight * Math.pow(10, BINARY_VECTOR_DECIMAL_PLACES));
    if (weight == 0) return;

    // Keep track of number (or cumulative weight) of votes.
    totalNumberOfVotes.addAndGet((int) weight);

    // Decompose superposition task such that addition of some power of 2 (e.g. 64) is accomplished
    // by beginning the process at the relevant row (e.g. 7) instead of starting multiple (e.g. 64)
    // superposition processes at the first row.
    int logFloorOfWeight = (int) (Math.floor(Math.log(weight) / Math.log(2)));

    if (logFloorOfWeight < votingRecord.size() - 1) {
      while (logFloorOfWeight > 0) {
        superposeBitSetFromRowFloor(incomingBitSet, logFloorOfWeight);
        weight = weight - (int) Math.pow(2, logFloorOfWeight);
        logFloorOfWeight = (int) (Math.floor(Math.log(weight) / Math.log(2)));
      }
    }

    // Add remaining component of weight incrementally.
    for (int x = 0; x < weight; x++) {
      superposeBitSetFromRowFloor(incomingBitSet, 0);
    }
  }

  /**
   * Performs superposition from a particular row by sweeping a bitset across the voting record
   * such that for any column in which the incoming bitset contains a '1', 1's are changed
   * to 0's until a new 1 can be added, facilitating incrementation of the
   * binary number represented in this column.
   *
   * @param incomingBitSet the bitset to be added
   * @param rowfloor the index of the place in the voting record to start the sweep at
   */
  protected synchronized void superposeBitSetFromRowFloor(FixedBitSet incomingBitSet, int rowfloor) {
    // Attempt to save space when minimum value across all columns > 0
    // by decrementing across the board and raising the minimum where possible.
    int max = getMaximumSharedWeight();

    if (max > 0) {
      decrement(max);
    }

    // Handle overflow: if any column that will be incremented
    // contains all 1's, add a new row to the voting record.
    tempSet.xor(tempSet);
    tempSet.xor(incomingBitSet);

    for (int x = rowfloor; x < votingRecord.size() && tempSet.cardinality() > 0; x++) {
      tempSet.and(votingRecord.get(x));
    }

    if (tempSet.cardinality() > 0) {
      votingRecord.add(new FixedBitSet(dimension));
    }

    // Sweep copy of bitset to be added across rows of voting record.
    // If a new '1' is added, this position in the copy is changed to zero
    // and will not affect future rows.
    // The xor step will transform 1's to 0's or vice versa for
    // dimension in which the temporary bitset contains a '1'.
    votingRecord.get(rowfloor).xor(incomingBitSet);

    tempSet.xor(tempSet);
    tempSet.xor(incomingBitSet);

    for (int x = rowfloor + 1; x < votingRecord.size(); x++) {
      tempSet.andNot(votingRecord.get(x - 1)); // if 1 already added, eliminate dimension from tempSet
      votingRecord.get(x).xor(tempSet);
    }
  }

  /**
   * Reverses a string. Returns the argument unchanged if it is null or shorter than
   * two characters.
   */
  public static String reverse(String str) {
    if ((null == str) || (str.length() <= 1)) {
      return str;
    }
    return new StringBuilder(str).reverse().toString();
  }

  /**
   * Sets {@link #tempSet} to be a bitset with a "1" in the position of every dimension
   * in the {@link #votingRecord} that exactly matches the target number.
   *
   * If the target needs more binary digits than the voting record has rows, no
   * dimension can match and {@link #tempSet} is left empty.
   */
  private synchronized void setTempSetToExactMatches(long target) {
    if (target == 0) {
      // A dimension matches zero when no row has its bit set.
      tempSet.set(0, dimension);
      tempSet.xor(votingRecord.get(0));
      for (int x = 1; x < votingRecord.size(); x++) {
        tempSet.andNot(votingRecord.get(x));
      }
      return;
    }

    tempSet.xor(tempSet); // Reset tempSet to zeros.

    // Number of rows needed to represent target (index of highest set bit, plus one).
    int bitsNeeded = Long.SIZE - Long.numberOfLeadingZeros(target);
    if (bitsNeeded > votingRecord.size()) {
      return;
    }

    // Seed with the row of the lowest set bit, then require every row to agree with
    // the corresponding binary digit of target.
    tempSet.xor(votingRecord.get(Long.numberOfTrailingZeros(target)));
    for (int q = 0; q < votingRecord.size(); q++) {
      boolean digitIsOne = q < Long.SIZE && ((target >>> q) & 1L) == 1L;
      if (digitIsOne) {
        tempSet.and(votingRecord.get(q));
      } else {
        tempSet.andNot(votingRecord.get(q));
      }
    }
  }

  /**
   * This method is used determine which dimension will receive 1 and which 0 when the voting
   * process is concluded. It produces an FixedBitSet in which
   * "1" is assigned to all dimension with a count > 50% of the total number of votes (i.e. more 1's than 0's added)
   * "0" is assigned to all dimension with a count < 50% of the total number of votes (i.e. more 0's than 1's added)
   * "0" or "1" are assigned to all dimension with a count = 50% of the total number of votes (i.e. equal 1's and 0's added)
   *
   * @return an FixedBitSet representing the superposition of all vectors added up to this point
   */
  protected synchronized FixedBitSet concludeVote() {
    if (votingRecord.size() == 0
        || votingRecord.size() == 1 && votingRecord.get(0).cardinality() == 0) {
      return new FixedBitSet(dimension);
    } else {
      return concludeVote(totalNumberOfVotes.get());
    }
  }

  /**
   * Concludes the vote given the total number of votes cast.
   *
   * @param target total number of (scaled) votes
   * @return a new bit set with a 1 for each dimension that wins the vote; for an even
   *         total, every other tied dimension (in index order) is also given a 1
   */
  protected synchronized FixedBitSet concludeVote(long target) {
    long target2 = (long) Math.ceil((double) target / (double) 2);
    target2 = target2 - minimum;

    // Unlikely other than in testing: minimum more than half the votes
    if (target2 < 0) {
      FixedBitSet ans = new FixedBitSet(dimension);
      ans.set(0, dimension);
      return ans;
    }

    boolean even = (target % 2 == 0);
    FixedBitSet result = concludeVote(target2, votingRecord.size() - 1);

    if (even) {
      setTempSetToExactMatches(target2);
      boolean switcher = true;
      // Split vote: alternate, so that half of the tied dimensions are removed below.
      int q = tempSet.nextSetBit(0);
      while (q != DocIdSetIterator.NO_MORE_DOCS) {
        switcher = !switcher;
        if (switcher) tempSet.clear(q);
        if (q + 1 >= tempSet.length()) {
          q = DocIdSetIterator.NO_MORE_DOCS;
        } else {
          q = tempSet.nextSetBit(q + 1);
        }
      }
      result.andNot(tempSet);
    }
    return result;
  }

  /**
   * Returns a bit set with a 1 for every dimension whose count, restricted to rows
   * 0..row_ceiling of the voting record, is at least target.
   *
   * The returned bit set is always a fresh object, never a row of the voting record,
   * so callers may modify it freely.
   */
  protected synchronized FixedBitSet concludeVote(long target, int row_ceiling) {
    if (target == 0) {
      FixedBitSet atLeastZero = new FixedBitSet(dimension);
      atLeastZero.set(0, dimension);
      return atLeastZero;
    }

    double rowfloor = Math.log(target) / Math.log(2);
    int row_floor = (int) Math.floor(rowfloor); // for 0 index
    long remainder = target - (long) Math.pow(2, row_floor);

    // In this instance, the number we are checking for is higher than the capacity
    // of the voting record.
    if (row_floor >= votingRecord.size()) {
      return new FixedBitSet(dimension);
    }

    if (row_ceiling == 0 && target == 1) {
      // Clone: callers (e.g. the tie-breaking andNot) mutate the result in place.
      return votingRecord.get(0).clone();
    }

    if (remainder == 0) {
      // Simple case - the number we're looking for is 2^n, so anything with a "1" in row n or above is true.
      FixedBitSet definitePositives = new FixedBitSet(dimension);
      for (int q = row_floor; q <= row_ceiling; q++) {
        definitePositives.or(votingRecord.get(q));
      }
      return definitePositives;
    } else {
      // Simple part of complex case: first get anything with a "1" in a row above n (all true).
      FixedBitSet definitePositives = new FixedBitSet(dimension);
      for (int q = row_floor + 1; q <= row_ceiling; q++) {
        definitePositives.or(votingRecord.get(q));
      }

      // Complex part of complex case: get those that have a "1" in the row of n.
      FixedBitSet possiblePositives = votingRecord.get(row_floor).clone();
      FixedBitSet definitePositives2 = concludeVote(remainder, row_floor - 1);

      possiblePositives.and(definitePositives2);
      definitePositives.or(possiblePositives);
      return definitePositives;
    }
  }

  /**
   * Decrement every dimension. Assumes at least one count in each dimension
   * i.e: no underflow check currently - will wreak havoc with zero counts
   */
  public synchronized void decrement() {
    tempSet.set(0, dimension);
    for (int q = 0; q < votingRecord.size(); q++) {
      votingRecord.get(q).xor(tempSet);
      tempSet.and(votingRecord.get(q));
    }
  }

  /**
   * Decrement every dimension by the number passed as a parameter, and raise
   * {@link #minimum} by the same amount. Again assumes at least that count in each dimension
   * i.e: no underflow check currently - will wreak havoc with zero counts
   */
  public synchronized void decrement(int weight) {
    if (weight == 0) return;
    minimum += weight;

    int logfloor = (int) (Math.floor(Math.log(weight) / Math.log(2)));

    if (logfloor < votingRecord.size() - 1) {
      while (logfloor > 0) {
        selectedDecrement(logfloor);
        weight = weight - (int) Math.pow(2, logfloor);
        logfloor = (int) (Math.floor(Math.log(weight) / Math.log(2)));
      }
    }

    for (int x = 0; x < weight; x++) {
      decrement();
    }
  }

  /**
   * Decrements every dimension by 2^floor, by starting the borrow sweep at row "floor".
   */
  public synchronized void selectedDecrement(int floor) {
    tempSet.set(0, dimension);
    for (int q = floor; q < votingRecord.size(); q++) {
      votingRecord.get(q).xor(tempSet);
      tempSet.and(votingRecord.get(q));
    }
  }

  /**
   * Returns the highest value shared by all dimensions.
   */
  protected synchronized int getMaximumSharedWeight() {
    int thismaximum = 0;
    tempSet.xor(tempSet); // Reset tempset to zeros.
    for (int x = votingRecord.size() - 1; x >= 0; x--) {
      tempSet.or(votingRecord.get(x));
      if (tempSet.cardinality() == dimension) {
        thismaximum += (int) Math.pow(2, x);
        tempSet.xor(tempSet);
      }
    }
    return thismaximum;
  }

  /**
   * Implements binding using permutations and XOR.
   *
   * @param other vector to bind with; it is not modified
   * @param direction positive to bind (permute then XOR); otherwise applies the right
   *        inverse (XOR then inverse permute)
   */
  public void bind(Vector other, int direction) {
    IncompatibleVectorsException.checkVectorsCompatible(this, other);
    // Capture the other vector's bits up front: permute() replaces this.bitSet with a new
    // object, so this reference still sees the original bits even if other == this.
    FixedBitSet otherBits = ((BinaryVector) other).bitSet;
    if (direction > 0) {
      // as per Kanerva 2009: bind(A,B) = perm+(A) XOR B = C
      // this also functions as the left inverse: left inverse (A,C) = perm+(A) XOR C = B
      this.permute(PermutationUtils.getShiftPermutation(VectorType.BINARY, dimension, 1)); // perm+(A)
      this.bitSet.xor(otherBits); // perm+(A) XOR B
    } else {
      // as per Kanerva 2009: right inverse(C,B) = perm-(C XOR B) = perm-(perm+(A)) = A
      this.bitSet.xor(otherBits); // C XOR B
      this.permute(PermutationUtils.getShiftPermutation(VectorType.BINARY, dimension, -1)); // perm-(C XOR B) = A
    }
  }

  /**
   * Implements inverse of binding using permutations and XOR.
   */
  public void release(Vector other, int direction) {
    if (!BINARY_BINDING_WITH_PERMUTE) {
      bind(other);
    } else {
      bind(other, direction);
    }
  }

  /**
   * Implements binding using exclusive OR.
   *
   * The voting record is replaced by the bound bit set, so tallying the votes later
   * won't walk back the bind.
   */
  @Override
  public void bind(Vector other) {
    IncompatibleVectorsException.checkVectorsCompatible(this, other);
    if (!BINARY_BINDING_WITH_PERMUTE) {
      BinaryVector binaryOther = (BinaryVector) other;
      this.bitSet.xor(binaryOther.bitSet);
    } else {
      bind(other, 1);
    }
    resetVotingRecordToBitSet();
  }

  /**
   * Implements inverse binding using exclusive OR.
   */
  @Override
  public void release(Vector other) {
    if (!BINARY_BINDING_WITH_PERMUTE) {
      bind(other);
    } else {
      bind(other, -1);
    }
  }

  /**
   * Normalizes the vector according to {@link #NORMALIZE_METHOD}, replacing the voting
   * record with the resulting bit set.
   *
   * With {@link BinaryNormalizationMethod#SPATTERCODE} the standard "majority rule" of the
   * Binary Spatter Code is used. With {@link BinaryNormalizationMethod#PROBABILISTIC} the
   * probability of assigning a one in a particular dimension is a function of the probability
   * of encountering the number of votes in the voting record in this dimension.
   *
   * The probabilistic method is slower, but discards less information with positive effects
   * on accuracy in preliminary experiments.
   *
   * As a simple example to illustrate why this would be the case, consider the superposition of vectors for the terms "jazz","jazz" and "rock"
   * With the BSC normalization, the vector produced is identical to "jazz" (as jazz wins the vote in each case). With probabilistic normalization,
   * the vector produced is somewhat similar to both "jazz" and "rock", with a similarity that is proportional to the weights assigned to the
   * superposition, e.g. 0.624000:jazz; 0.246000:rock
   *
   * Does nothing if there is no voting record. If the voting record has a single row,
   * that row becomes the bit set and the record is left as it is.
   */
  @Override
  public synchronized void normalize() {
    if (votingRecord == null) return;
    if (votingRecord.size() == 1) {
      // Clone so that later in-place changes to the voting record row do not silently
      // change the elemental bit set (and vice versa).
      this.bitSet = votingRecord.get(0).clone();
      return;
    }

    if (NORMALIZE_METHOD.equals(BinaryNormalizationMethod.SPATTERCODE)) {
      // faster majority rule normalization
      this.bitSet = concludeVote();
    } else {
      // slower probabilistic normalization
      normalizeProbabilistically();
    }

    resetVotingRecordToBitSet();
  }

  /**
   * Fills {@link #bitSet} by sampling each dimension with a probability derived from its
   * vote count, assuming an approximately normal distribution of counts.
   */
  private void normalizeProbabilistically() {
    // clear bitset
    this.bitSet.xor(this.bitSet);

    // Ensure that the same set of superposed vectors will always produce the same result
    long theSuperpositionSeed = 0;
    for (int q = 0; q < votingRecord.size(); q++) {
      theSuperpositionSeed += votingRecord.get(q).getBits()[0];
    }
    random.setSeed(theSuperpositionSeed);

    long max = totalNumberOfVotes.get();

    // Determine the maximum possible votes on the voting record
    int maxpossiblevotesonrecord = 0;
    for (int q = 0; q < votingRecord.size(); q++) {
      maxpossiblevotesonrecord += Math.pow(2, q);
    }

    // For each possible value on the record, get a BitSet with a "1" in the
    // position of the dimensions that match this value
    for (int x = 1; x <= maxpossiblevotesonrecord; x++) {
      this.setTempSetToExactMatches(x);

      // no exact matches
      if (this.tempSet.cardinality() == 0) continue;

      // For each y==1 on said BitSet (indicating votes in dimension[y] == x)
      int y = tempSet.nextSetBit(0);

      // determine total number of votes
      double votes = minimum + x;

      // calculate standard deviations above/below the mean of max/2
      // (floating-point division, so an odd total does not truncate the mean)
      double z = (votes - (max / 2.0)) / (Math.sqrt(max) / 2);

      // find proportion of data points anticipated within z standard deviations of the mean
      // (assuming approximately normal distribution)
      double proportion = erf(z / Math.sqrt(2));

      // convert into a value between 0 and 1 (i.e. centered on 0.5 rather than centered on 0)
      proportion = (1 + proportion) / 2;

      while (y != DocIdSetIterator.NO_MORE_DOCS) {
        // probabilistic normalization
        if ((random.nextDouble()) <= proportion) this.bitSet.set(y);
        y++;
        if (y == this.dimension) break;
        y = tempSet.nextSetBit(y);
      }
    }
  }

  /**
   * Replaces the voting record with a single row holding a copy of the current bit set,
   * counted as one vote, and resets the temporary bit set and the minimum.
   */
  private void resetVotingRecordToBitSet() {
    votingRecord = new ArrayList<FixedBitSet>();
    votingRecord.add(bitSet.clone());
    totalNumberOfVotes.set(1);
    tempSet = new FixedBitSet(dimension);
    minimum = 0;
  }

  /**
   * approximation of error function, equation 7.1.27 from
   * Abramowitz, M. and Stegun, I. A. (Eds.). "Repeated Integrals of the Error Function." S 7.2
   * in Handbook of Mathematical Functions with Formulas, Graphs, and Mathematical Tables,
   * 9th printing. New York: Dover, pp. 299-300, 1972.
   * error of approximation <= 5*10^-4
   */
  public double erf(double z) {
    // erf(-x) == -erf(x)
    double sign = Math.signum(z);
    z = Math.abs(z);

    double a1 = 0.278393, a2 = 0.230389, a3 = 0.000972, a4 = 0.078108;
    double sumterm = 1 + a1 * z + a2 * Math.pow(z, 2) + a3 * Math.pow(z, 3) + a4 * Math.pow(z, 4);
    return sign * (1 - 1 / (Math.pow(sumterm, 4)));
  }

  /**
   * Faster normalization according to the Binary Spatter Code's "majority" rule.
   * The voting record is replaced by the resulting bit set.
   */
  public synchronized void normalizeBSC() {
    if (!isSparse) {
      this.bitSet = concludeVote();
    }
    resetVotingRecordToBitSet();
  }

  /**
   * Counts votes without normalizing vector (i.e. voting record is not altered). Used in SemanticVectorCollider.
   *
   * Only recounts if votes have been added since the last tally. A failure while
   * counting is logged and leaves the bit set unchanged.
   */
  public synchronized void tallyVotes() {
    if (isSparse) elementalToSemantic();
    if (unTallied.get()) { // only count if there are votes since the last tally
      try {
        this.bitSet = concludeVote();
        unTallied.set(false);
      } catch (RuntimeException e) {
        logger.log(Level.SEVERE, "Couldn't tally votes for binary vector.", e);
      }
    }
  }

  /**
   * Writes the vector's bit set out to a Lucene output stream as a sequence of longs.
   * Converts to dense format if necessary. Untallied votes are not tallied first.
   *
   * An I/O failure is logged and writing stops.
   */
  @Override
  public void writeToLuceneStream(IndexOutput outputStream) {
    if (isSparse) {
      elementalToSemantic();
    }
    writeWords(outputStream, bitSet.getBits().length);
  }

  /**
   * Writes the vector's bit set out to a Lucene output stream as a sequence of longs.
   * Converts to dense format if necessary. Truncates to length k bits (k / 64 longs);
   * if k exceeds the length of the vector, the whole vector is written.
   */
  public void writeToLuceneStream(IndexOutput outputStream, int k) {
    if (isSparse) {
      elementalToSemantic();
    }
    writeWords(outputStream, Math.min(k / 64, bitSet.getBits().length));
  }

  /** Writes the first wordCount longs of the bit set, logging any I/O failure. */
  private void writeWords(IndexOutput outputStream, int wordCount) {
    long[] bitArray = bitSet.getBits();
    try {
      for (int i = 0; i < wordCount; i++) {
        outputStream.writeLong(bitArray[i]);
      }
    } catch (IOException e) {
      logger.log(Level.SEVERE, "Couldn't write binary vector to lucene output stream.", e);
    }
  }

  /**
   * Reads a (dense) version of a vector from a Lucene input stream, replacing the bit set
   * and marking this vector as elemental.
   *
   * An I/O failure is logged; words not read remain zero.
   */
  @Override
  public void readFromLuceneStream(IndexInput inputStream) {
    long[] bitArray = new long[(dimension / 64)];
    try {
      for (int i = 0; i < bitArray.length; ++i) {
        bitArray[i] = inputStream.readLong();
      }
    } catch (IOException e) {
      logger.log(Level.SEVERE, "Couldn't read binary vector from lucene input stream.", e);
    }
    this.bitSet = new FixedBitSet(bitArray, dimension);
    this.isSparse = true;
  }

  /**
   * Writes vector to a string of the form 010 etc. (no delimiters).
   *
   * No terminating newline or delimiter.
   */
  @Override
  public String writeToString() {
    StringBuilder builder = new StringBuilder(dimension);
    for (int i = 0; i < dimension; ++i) {
      builder.append(this.bitSet.get(i) ? "1" : "0");
    }
    return builder.toString();
  }

  /**
   * Writes the longs underlying the bit set to a string, each followed by "|".
   */
  public String writeLongToString() {
    StringBuilder builder = new StringBuilder();
    long[] words = bitSet.getBits();
    for (int i = 0; i < words.length; ++i) {
      builder.append(Long.toString(words[i])).append("|");
    }
    return builder.toString();
  }

  /**
   * Reads vector from a string of the form 01001 etc. A '1' sets the corresponding bit;
   * any other character clears it, so bits set before the call do not survive.
   *
   * @throws IllegalArgumentException if the input length differs from the dimension
   */
  @Override
  public void readFromString(String input) {
    if (input.length() != dimension) {
      throw new IllegalArgumentException("Found " + (input.length()) + " possible coordinates: "
          + "expected " + dimension);
    }

    for (int i = 0; i < dimension; ++i) {
      if (input.charAt(i) == '1') {
        bitSet.set(i);
      } else {
        bitSet.clear(i);
      }
    }
  }

  /**
   * Automatically translate elemental vector (no storage capacity) into
   * semantic vector (storage capacity initialized, this will occupy RAM)
   */
  protected void elementalToSemantic() {
    if (!isSparse) {
      logger.warning("Tried to transform an elemental vector which is not in fact elemental."
          + "This may be a programming error.");
      return;
    }
    votingRecord = new ArrayList<FixedBitSet>();
    tempSet = new FixedBitSet(dimension);
    // The elemental bits become the first vote, with weight 1.
    if (bitSet.cardinality() != 0) {
      this.superposeBitSet(bitSet.clone(), 1);
    }
    isSparse = false;
  }

  /**
   * Permute the long[] array underlying the FixedBitSet binary representation.
   * Word i of the result is word permutation[i] of the current bit set.
   *
   * @throws IllegalArgumentException if the permutation length is not dimension / 64
   */
  public void permute(int[] permutation) {
    if (permutation.length != getDimension() / 64) {
      throw new IllegalArgumentException("Binary vector of dimension " + getDimension()
          + " must have permutation of length " + getDimension() / 64
          + " not " + permutation.length);
    }
    //TODO permute in place without creating additional long[] (if proves problematic at scale)
    bitSet = permutedCopy(bitSet, permutation, getDimension());
  }

  /**
   * Returns a new bit set whose i-th 64-bit word is word permutation[i] of the given
   * bit set. The argument is not modified.
   */
  private static FixedBitSet permutedCopy(FixedBitSet bits, int[] permutation, int dimension) {
    long[] coordinates = bits.getBits();
    long[] newCoordinates = new long[coordinates.length];
    for (int i = 0; i < coordinates.length; ++i) {
      newCoordinates[i] = coordinates[permutation[i]];
    }
    return new FixedBitSet(newCoordinates, dimension);
  }

  /**
   * Available for testing and copying. Wraps the given bit set (without copying it)
   * as an elemental vector.
   */
  protected BinaryVector(FixedBitSet inSet) {
    this.dimension = (int) inSet.length();
    this.bitSet = inSet;
    // There is no voting record yet, so this is an elemental vector; without this flag
    // the first superpose/tally/copy would dereference a null voting record.
    this.isSparse = true;
    this.random = new Random();
  }

  // Available for testing
  protected int bitLength() {
    return bitSet.getBits().length;
  }

  // Monitor growth of voting record.
  protected int numRows() {
    if (isSparse) return 0;
    return votingRecord.size();
  }

  // access bitset directly
  protected FixedBitSet getCoordinates() {
    return this.bitSet;
  }

  // access bitset directly
  protected void setCoordinates(FixedBitSet incomingBitSet) {
    this.bitSet = incomingBitSet;
  }

  // set DEBUG_PRINT_LENGTH
  public static void setDebugPrintLength(int length) {
    DEBUG_PRINT_LENGTH = length;
  }
}