package com.github.kilianB.datastructures.tree.binaryTree;

import java.io.Serializable;
import java.math.BigInteger;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;

import com.github.kilianB.datastructures.tree.AbstractBinaryTree;
import com.github.kilianB.datastructures.tree.NodeInfo;
import com.github.kilianB.datastructures.tree.Result;
import com.github.kilianB.hash.Hash;

/**
 * A not thread safe binary tree implementation used to quickly compute the
 * <a href="https://en.wikipedia.org/wiki/Hamming_distance">hamming distance</a>
 * of multiple hashes. The tree can be used to keep all hashes in memory, if a
 * persistent storage is required take a look at the database examples.
 *
 * <p>
 * To keep storage space minimal the tree is lazily populated and creates nodes
 * on the fly only if the node is required to represent a hash.
 * </p>
 *
 * <b>Example</b>
 *
 * Hash(1011011)
 * <ul>
 * <li>The left child of a node represents a 1 bit</li>
 * <li>The right child of a node represents a 0 bit</li>
 * <li>Padding bit 1 is ignored as it's the same</li>
 * </ul>
 *
 * <pre>
 *            root
 *                 0
 *              1
 *           1
 *              0
 *           1
 *        leaf
 * </pre>
 *
 * Using a tree like structure allows to prune searches once the distance of the
 * current branch deviates further away than the threshold would allow.
 *
 * <p>
 * Currently the tree only allows traversal from the root node allowing to
 * search all hashes which are within a given distance from a needle. A more
 * performant optimization might save the leaves in a hash structure and keep a
 * reference to the parent nodes allowing to start searching from the leaf
 * instead.
 * </p>
 *
 * @param <T> type of the values stored in the leaves of the tree
 * @author Kilian
 */
public class BinaryTree<T> extends AbstractBinaryTree<T> implements Serializable {

    private static final long serialVersionUID = 4193396415197848158L;

    /**
     * Create an empty tree.
     *
     * @param ensureHashConsistency If true adding and matching hashes will check
     *                              whether they are generated by the same
     *                              algorithm as the first hash added to the tree
     */
    public BinaryTree(boolean ensureHashConsistency) {
        super(ensureHashConsistency);
    }

    /**
     * No-argument constructor which delegates to the superclass default
     * constructor.
     */
    protected BinaryTree() {
    }

    /**
     * Insert a value into the tree under the given hash. This simply forwards to
     * the superclass implementation in order to expose it as a public method.
     *
     * @param hash  The hash the value is associated with
     * @param value The value to store
     */
    public void addHash(Hash hash, T value) {
        // Expose method
        super.addHash(hash, value);
    }

    /**
     * Return all elements of the tree whose hamming distance is smaller or equal
     * than the supplied max distance.
     *
     * If the tree is configured to ensureHashConsistency this function will throw
     * an unchecked IllegalStateException if the checked hash does not comply with
     * the first hash added to the tree.
     *
     * @param hash        The hash to search for
     * @param maxDistance The maximal hamming distance deviation all found hashes
     *                    may possess. A distance of 0 will return all objects added
     *                    whose hash is exactly the hash supplied as the first
     *                    argument
     *
     * @return Search results contain objects and distances matching the search
     *         criteria. The results returned are ordered to return the closest
     *         match first.
     * @throws IllegalStateException if hash consistency is enforced and the
     *                               algorithm id of the supplied hash differs from
     *                               the one recorded by the tree
     */
    @Override
    public PriorityQueue<Result<T>> getElementsWithinHammingDistance(Hash hash, int maxDistance) {

        requireCompatibleHash(hash);

        // Iterative implementation. Recursion might get too expensive if the key
        // length increases and we need to be aware of the stack depth.
        PriorityQueue<Result<T>> result = new PriorityQueue<Result<T>>();

        BigInteger hashValue = hash.getHashValue();
        int treeDepth = hash.getBitResolution();

        // Breadth first search: entries are appended at the tail and polled from
        // the head of the deque.
        ArrayDeque<NodeInfo<T>> queue = new ArrayDeque<>();

        // Begin search at the root
        queue.add(new NodeInfo<T>(root, 0, treeDepth));

        while (!queue.isEmpty()) {

            NodeInfo<T> info = queue.poll();

            // We reached a leaf. Every value stored in it is a match.
            if (info.depth == 0) {
                @SuppressWarnings("unchecked")
                Leaf<T> leaf = (Leaf<T>) info.node;
                for (T o : leaf.getData()) {
                    result.add(new Result<T>(o, info.distance, info.distance / (double) treeDepth));
                }
                continue;
            }

            // Next bit of the needle, consumed from the most significant position
            // downwards.
            boolean bit = hashValue.testBit(info.depth - 1);

            // The child matching the needle's bit does not increase the distance
            // and is therefore always followed if it exists.
            Node correctChild = info.node.getChild(bit);
            if (correctChild != null) {
                queue.add(new NodeInfo<T>(correctChild, info.distance, info.depth - 1));
            }

            // The mismatching child costs one bit of distance. Only descend if
            // the budget permits it.
            if (info.distance + 1 <= maxDistance) {
                Node failedChild = info.node.getChild(!bit);
                // Maybe the child does not exist
                if (failedChild != null) {
                    queue.add(new NodeInfo<T>(failedChild, info.distance + 1, info.depth - 1));
                }
            }
        }
        return result;
    }

    /**
     * Retrieve the hash that is the most similar to the queried hash. The closest
     * hash is the hash with the smallest distance.
     *
     * @param hash to search the neighbor for.
     * @return the closest hash saved in this tree. If several leaves share the
     *         smallest distance the values of all of them are returned. The list
     *         is empty if no leaf could be reached.
     * @throws IllegalStateException if hash consistency is enforced and the
     *                               algorithm id of the supplied hash differs from
     *                               the one recorded by the tree
     * @since 3.0.0
     */
    @Override
    public List<Result<T>> getNearestNeighbour(Hash hash) {

        requireCompatibleHash(hash);

        BigInteger hashValue = hash.getHashValue();
        int treeDepth = hash.getBitResolution();

        ArrayDeque<NodeInfo<T>> queue = new ArrayDeque<>();
        List<Result<T>> result = new ArrayList<>();

        double curBestDistance = Double.MAX_VALUE;

        // Depth first search with aggressive pruning
        // Begin search at the root
        queue.add(new NodeInfo<T>(root, 0, treeDepth));

        while (!queue.isEmpty()) {

            NodeInfo<T> info = queue.removeLast();

            // A better result has been found since this entry was queued. Ignore it.
            if (info.distance > curBestDistance) {
                continue;
            }

            // We reached a leaf
            if (info.depth == 0) {
                // Strictly better than everything seen so far: discard old results.
                if (curBestDistance > info.distance) {
                    result.clear();
                    curBestDistance = info.distance;
                }
                @SuppressWarnings("unchecked")
                Leaf<T> leaf = (Leaf<T>) info.node;
                for (T o : leaf.getData()) {
                    result.add(new Result<T>(o, info.distance, info.distance / (double) treeDepth));
                }
                continue;
            }

            // TODO (original author's note, translated): this is not a depth first
            // search!
            // NOTE(review): entries are appended with add() and taken with
            // removeLast(), i.e. the deque is used as a stack - confirm whether the
            // TODO above is still relevant.

            // Next bit
            boolean bit = hashValue.testBit(info.depth - 1);

            // The mismatching child is pushed first so that the matching child,
            // pushed afterwards, is taken from the deque before it.
            if (info.distance + 1 <= curBestDistance) {
                Node failedChild = info.node.getChild(!bit);
                // Maybe the child does not exist
                if (failedChild != null) {
                    queue.add(new NodeInfo<T>(failedChild, info.distance + 1, info.depth - 1));
                }
            }

            Node correctChild = info.node.getChild(bit);
            if (correctChild != null) {
                queue.add(new NodeInfo<T>(correctChild, info.distance, info.depth - 1));
            }
        }
        return result;
    }

    /**
     * Guard used by the search methods: reject a needle whose algorithm id does
     * not match the one of the tree if consistency checks are enabled.
     *
     * @param hash the hash about to be used as search needle
     * @throws IllegalStateException if the check is enabled and the ids differ
     */
    private void requireCompatibleHash(Hash hash) {
        if (ensureHashConsistency && algoId != hash.getAlgorithmId()) {
            throw new IllegalStateException("Tried to search the binary tree with an incompatible hash");
        }
    }
}