// Java source code of CrfUtilities

package com.asher_stern.crf.crf;

import java.lang.reflect.Array;
import java.math.BigDecimal;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;

import com.asher_stern.crf.crf.filters.CrfFeaturesAndFilters;
import com.asher_stern.crf.crf.filters.CrfFilteredFeature;
import com.asher_stern.crf.crf.filters.Filter;
import com.asher_stern.crf.utilities.CrfException;
import com.asher_stern.crf.utilities.ArithmeticUtilities;
import com.asher_stern.crf.utilities.TaggedToken;

import static com.asher_stern.crf.utilities.ArithmeticUtilities.*;

/**
 * A collection of static functions needed by CRF.
 * 
 * @author Asher Stern
 * Date: Nov 8, 2014
 *
 */
public class CrfUtilities
{
	/**
	 * Tolerance used by {@link #roughlyEqual(BigDecimal, BigDecimal)} when it compares two values by their absolute gap:
	 * the values are accepted only if the absolute value of their difference does not exceed this constant.
	 */
	public static final BigDecimal ROUGHLY_EQUAL_DISTANCE_FROM_ZERO = big(0.001);
	/**
	 * Tolerance used by {@link #roughlyEqual(BigDecimal, BigDecimal)} when it compares two values by their ratio:
	 * the values are accepted only if {@link #relativeDifference(BigDecimal, BigDecimal)} minus one does not exceed this constant.
	 */
	public static final BigDecimal ROUGHLY_EQUAL_DEVIATION_FROM_ONE = big(0.01);

	
	/**
	 * Looks up which tags may be assigned to the token that comes right before position <code>index</code>, under the
	 * assumption that the token at <code>index</code> itself carries <code>currentTag</code>.
	 * <BR>
	 * For <code>index==0</code> the answer is taken from <code>crfTags.getPrecedeWhenFirst()</code>; for any positive index it
	 * is taken from <code>crfTags.getCanPrecedeNonNull()</code>. The result of that lookup is returned as is.
	 * <BR>
	 * Note that the sentence itself is not inspected by this method.
	 * 
	 * @param sentence a sequence of tokens
	 * @param index the index of the "current token"
	 * @param currentTag the tag of the "current token"
	 * @param crfTags A data-structure that holds all the tags in the training corpus, and the restrictions over them.
	 * @return the tags that may be assigned to the token preceding the current one, given that the current token is
	 * tagged with "currentTag".
	 * @throws CrfException if <code>index</code> is negative.
	 */
	public static <K,G> Set<G> getPreviousTags(K[] sentence, int index, G currentTag, CrfTags<G> crfTags)
	{
		if (index<0)
		{
			throw new CrfException("Error: no tag can precede the virtual token that precedes the first token.");
		}
		if (index==0)
		{
			// The current token opens the sentence, so the first-token table applies.
			return crfTags.getPrecedeWhenFirst().get(currentTag);
		}
		return crfTags.getCanPrecedeNonNull().get(currentTag);
	}
	
	/**
	 * Copies every element of "fromCollection" into "intoCollection", one element at a time, in the iteration order of
	 * "fromCollection".
	 * <BR>
	 * This is intentionally not delegated to Collection.addAll(): the original author reports a bug in some
	 * implementations of that method in some versions of J2SE, and preferred an explicit loop to be on the safe side.
	 * 
	 * @param intoCollection the collection that receives the elements
	 * @param fromCollection the collection whose elements are copied
	 */
	public static <T> void addAll(Collection<T> intoCollection, Collection<? extends T> fromCollection)
	{
		for (java.util.Iterator<? extends T> source = fromCollection.iterator(); source.hasNext(); )
		{
			intoCollection.add(source.next());
		}
	}
	
	/**
	 * Collects the indexes of the features that <b>might</b> return a non-zero value for the given token and tags.<BR>
	 * Typically in CRF, most of the features return 0 for most inputs. For example, a feature that returns 1 if the token
	 * is "the" and its tag is "DETERMINER" returns 0 for most of the words in the corpus, and for most of the tags.<BR>
	 * The returned set is built from two sources:
	 * the indexes of the features that have no filter, followed by the indexes registered (in the map of active features)
	 * for each of the filters that the filter-factory creates for this token and tags. Filters that have no entry in that
	 * map contribute nothing.
	 * <BR>
	 * See also {@link Filter}
	 * 
	 * @param features the features together with their filters
	 * @param sentence a sentence (sequence of tokens)
	 * @param tokenIndex index of the token in the sentence
	 * @param currentTag tag of the token in tokenIndex
	 * @param previousTag tag of the token in tokenIndex-1
	 * @return a newly created set, in insertion order, of the indexes of the features that might be non-zero.
	 */
	public static <K,G> Set<Integer> getActiveFeatureIndexes(CrfFeaturesAndFilters<K,G> features, K[] sentence, int tokenIndex, G currentTag, G previousTag)
	{
		Set<Integer> collected = new LinkedHashSet<Integer>();
		
		// Features without a filter can never be ruled out, so they are always included.
		addAll(collected, features.getIndexesOfFeaturesWithNoFilter());
		
		for (Filter<K, G> filter : features.getFilterFactory().createFilters(sentence, tokenIndex, currentTag, previousTag))
		{
			Set<Integer> indexesOfThisFilter = features.getMapActiveFeatures().get(filter);
			if (null==indexesOfThisFilter)
			{
				continue;
			}
			addAll(collected, indexesOfThisFilter);
		}
		
		return collected;
	}
	
	/**
	 * Computes \Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}, where k is the number of features, \theta_i is parameter number i,
	 * f_i is feature number i, x is the given sentence, j is the index of the token, s is the tag of token number j,
	 * and s' is the tag of token number j-1.
	 * <BR>
	 * This is a convenience overload: it first determines the possibly-active features by
	 * {@link #getActiveFeatureIndexes(CrfFeaturesAndFilters, Object[], int, Object, Object)} and then delegates to the
	 * overload that accepts that set explicitly.
	 * 
	 * @param model the CRF model: holds the features and the parameters.
	 * @param sentence a sentence (sequence of tokens)
	 * @param tokenIndex token index
	 * @param currentTag tag of the token in tokenIndex
	 * @param previousTag tag of the token in tokenIndex-1
	 * @return \Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}
	 */
	public static <K,G> BigDecimal oneTokenSumWeightedFeatures(CrfModel<K, G> model, K[] sentence, int tokenIndex, G currentTag, G previousTag)
	{
		return oneTokenSumWeightedFeatures(
				model, sentence, tokenIndex, currentTag, previousTag,
				getActiveFeatureIndexes(model.getFeatures(), sentence, tokenIndex, currentTag, previousTag)
				);
	}

	/**
	 * Computes \Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}, where k is the number of features, \theta_i is parameter number i,
	 * f_i is feature number i, x is the given sentence, j is the index of the token, s is the tag of token number j,
	 * and s' is the tag of token number j-1.
	 * <BR>
	 * Only the features whose indexes appear in "knownActiveFeatureIndexes" take part in the summation; these are the
	 * features for which <b>it is not known</b> that they return zero for (x,j,s,s').
	 * See {@link #getActiveFeatureIndexes(CrfFeaturesAndFilters, Object[], int, Object, Object)}.
	 * <BR>
	 * A feature that is flagged as "when not filtered is always one" is not evaluated; its value is taken to be 1.
	 *  
	 * @param model the CRF model: holds the features and the parameters.
	 * @param sentence a sentence (sequence of tokens)
	 * @param tokenIndex token index
	 * @param currentTag tag of the token in tokenIndex
	 * @param previousTag tag of the token in tokenIndex-1
	 * @param knownActiveFeatureIndexes a set of features for which <b>it is not known</b> that they return zero for (x,j,s,s').
	 * @return \Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}
	 */
	public static <K,G> BigDecimal oneTokenSumWeightedFeatures(CrfModel<K, G> model, K[] sentence, int tokenIndex, G currentTag, G previousTag, Set<Integer> knownActiveFeatureIndexes)
	{
		BigDecimal total = BigDecimal.ZERO;
		for (int featureIndex : knownActiveFeatureIndexes)
		{
			CrfFilteredFeature<K, G> filteredFeature = model.getFeatures().getFilteredFeatures()[featureIndex];
			
			// Skip the actual evaluation for features that are known to yield 1 whenever they pass their filter.
			BigDecimal value = filteredFeature.isWhenNotFilteredIsAlwaysOne()
					? BigDecimal.ONE
					: big(filteredFeature.getFeature().value(sentence,tokenIndex,currentTag,previousTag));
			
			total = safeAdd(total, safeMultiply(model.getParameters().get(featureIndex), value));
		}
		return total;
	}
	
	/**
	 * Computes e^{\Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}}, where k is the number of features, \theta_i is parameter number i,
	 * f_i is feature number i, x is the given sentence, j is the index of the token, s is the tag of token number j,
	 * and s' is the tag of token number j-1.
	 * <BR>
	 * This is a convenience overload: it first determines the possibly-active features by
	 * {@link #getActiveFeatureIndexes(CrfFeaturesAndFilters, Object[], int, Object, Object)} and then delegates to the
	 * overload that accepts that set explicitly.
	 * 
	 * @param model the CRF model: holds the features and the parameters.
	 * @param sentence a sentence (sequence of tokens)
	 * @param tokenIndex token index
	 * @param currentTag tag of the token in tokenIndex
	 * @param previousTag tag of the token in tokenIndex-1
	 * @return e^{\Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}}
	 */
	public static <K,G> BigDecimal oneTokenFormula(CrfModel<K, G> model, K[] sentence, int tokenIndex, G currentTag, G previousTag)
	{
		return oneTokenFormula(
				model, sentence, tokenIndex, currentTag, previousTag,
				getActiveFeatureIndexes(model.getFeatures(), sentence, tokenIndex, currentTag, previousTag)
				);
	}
	
	/**
	 * Computes e^{\Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}}, where k is the number of features, \theta_i is parameter number i,
	 * f_i is feature number i, x is the given sentence, j is the index of the token, s is the tag of token number j,
	 * and s' is the tag of token number j-1.
	 * <BR>
	 * The exponent is computed by
	 * {@link #oneTokenSumWeightedFeatures(CrfModel, Object[], int, Object, Object, Set)}, restricted to the given set of
	 * features for which <b>it is not known</b> that they return zero for (x,j,s,s').
	 * See {@link #getActiveFeatureIndexes(CrfFeaturesAndFilters, Object[], int, Object, Object)}.
	 * 
	 * @param model the CRF model: holds the features and the parameters.
	 * @param sentence a sentence (sequence of tokens)
	 * @param tokenIndex token index
	 * @param currentTag tag of the token in tokenIndex
	 * @param previousTag tag of the token in tokenIndex-1
	 * @param knownActiveFeatureIndexes a set of features for which <b>it is not known</b> that they return zero for (x,j,s,s').
	 * @return e^{\Sum_{i=0}^{k-1}{\theta_i*f_i(x,j,s,s')}}
	 */
	public static <K,G> BigDecimal oneTokenFormula(CrfModel<K, G> model, K[] sentence, int tokenIndex, G currentTag, G previousTag,Set<Integer> knownActiveFeatureIndexes)
	{
		BigDecimal exponent = oneTokenSumWeightedFeatures(model,sentence,tokenIndex,currentTag,previousTag,knownActiveFeatureIndexes);
		return ArithmeticUtilities.exp(exponent);
	}
	
	
	/**
	 * Strips the tags from a tagged sentence: returns an array that holds only the tokens, in their original order.
	 * <BR>
	 * The component type of the returned array is the runtime class of the <b>first</b> token of the sentence.
	 * NOTE(review): consequently the first token must not be null, and every other token is expected to be
	 * assignable to that class - confirm that callers guarantee this.
	 * 
	 * @param sentence a sentence, given as a list of tagged tokens. Must be neither null nor empty.
	 * @return the tokens of the given sentence as an array.
	 * @throws CrfException if the given sentence is null or contains no tokens.
	 */
	public static <K> K[] extractSentence(List<? extends TaggedToken<K, ?>> sentence)
	{
		if ( (sentence==null) || (sentence.size()<1) )
		{
			throw new CrfException("The input is an empty sentence.");
		}
		
		Class<?> tokenClass = sentence.iterator().next().getToken().getClass();
		@SuppressWarnings("unchecked")
		K[] tokens = (K[]) Array.newInstance(tokenClass, sentence.size());
		
		int filled = 0;
		for (TaggedToken<K, ?> taggedToken : sentence)
		{
			tokens[filled] = taggedToken.getToken();
			filled++;
		}
		
		// Sanity check: the number of tokens copied must match the array length.
		if (filled!=tokens.length) {throw new CrfException("BUG");}
		return tokens;
	}
	

	
	/**
	 * Returns the ratio between the magnitudes of the two given numbers, always dividing the larger magnitude by the
	 * smaller one: if |value1|&gt;|value2| returns |value1|/|value2|, otherwise returns |value2|/|value1|.
	 * <BR>
	 * If the two numbers are numerically equal, 1 is returned without performing any division. The equality test is
	 * numeric (by {@link BigDecimal#compareTo(BigDecimal)}), so it also holds for numbers that differ only in scale,
	 * for example 2.0 and 2.00, or 0 and 0.0.
	 * <BR>
	 * NOTE(review): if exactly one of the numbers is zero, the divisor handed to safeDivide is zero; the outcome in
	 * that case is whatever safeDivide does for a zero divisor - confirm against ArithmeticUtilities.
	 * 
	 * @param value1 the first number
	 * @param value2 the second number
	 * @return the larger of |value1|,|value2| divided by the smaller of them, or 1 if the numbers are numerically equal.
	 */
	public static BigDecimal relativeDifference(BigDecimal value1, BigDecimal value2)
	{
		// BigDecimal.equals() also compares the scale, so it would miss e.g. 0 vs. 0.0 and let them reach the
		// division below with a zero divisor. compareTo() compares the numeric value only.
		if (value1.compareTo(value2)==0)  {return BigDecimal.ONE;}
		
		BigDecimal magnitude1 = value1.abs();
		BigDecimal magnitude2 = value2.abs();
		
		BigDecimal smaller;
		BigDecimal larger;
		if (magnitude1.compareTo(magnitude2)<0)
		{
			smaller = magnitude1;
			larger = magnitude2;
		}
		else
		{
			smaller = magnitude2;
			larger = magnitude1;
		}
		
		return safeDivide(larger, smaller);
	}
	
	/**
	 * Returns true if the two given numbers are roughly equal.
	 * <BR>
	 * Two criteria are used:
	 * <UL>
	 * <LI>If the numbers have different signs, or at least one of them is exactly zero, they are "roughly equal" if the
	 * absolute value of their difference is no larger than {@link #ROUGHLY_EQUAL_DISTANCE_FROM_ZERO} (0.001).
	 * A ratio is meaningless in these cases (and with a zero it would require a division by zero).</LI>
	 * <LI>Otherwise (both strictly positive or both strictly negative), they are "roughly equal" if dividing the
	 * (absolute value of the) larger by the (absolute value of the) smaller is around 1, where "around 1" means that it
	 * exceeds 1 by no more than {@link #ROUGHLY_EQUAL_DEVIATION_FROM_ONE} (0.01).</LI>
	 * </UL>
	 *  
	 * @param value1 the first number
	 * @param value2 the second number
	 * @return true if the numbers are roughly equal according to the criteria above, false otherwise.
	 */
	public static boolean roughlyEqual(BigDecimal value1, BigDecimal value2)
	{
		final int sign1 = value1.signum();
		final int sign2 = value2.signum();
		
		// Exactly one of the numbers is negative (zero counts as non-negative here).
		final boolean differentSigns = ( (sign1<0) != (sign2<0) );
		// A zero cannot serve in a ratio: relativeDifference() would have to divide by it.
		final boolean involvesZero = ( (sign1==0) || (sign2==0) );
		
		if (differentSigns || involvesZero)
		{
			BigDecimal gap = safeSubtract(value1, value2).abs();
			return (gap.compareTo(ROUGHLY_EQUAL_DISTANCE_FROM_ZERO) <= 0);
		}
		
		BigDecimal deviationFromOne = safeSubtract(relativeDifference(value1,value2),BigDecimal.ONE);
		return (deviationFromOne.compareTo(ROUGHLY_EQUAL_DEVIATION_FROM_ONE) <= 0);
	}

	
	/**
	 * Registers "value" under "key" in a map whose values are sets.
	 * <BR>
	 * If the map already holds a (non-null) set for the key, the value is added to that set. Otherwise a new
	 * insertion-ordered set is created, stored in the map under the key, and the value is added to it.
	 * 
	 * @param map a map from K to set of V
	 * @param key the key under which the value is to be registered
	 * @param value the value to add to the set of the key
	 */
	public static <K,V> void putInMapSet(Map<K, Set<V>> map, K key, V value)
	{
		Set<V> existing = map.get(key);
		if (existing!=null)
		{
			existing.add(value);
			return;
		}
		
		// First value for this key: register a fresh set before filling it.
		Set<V> created = new LinkedHashSet<V>();
		map.put(key, created);
		created.add(value);
	}

	// Logger of this class. It is not referenced by any of the methods of this class, hence the "unused" suppression.
	@SuppressWarnings("unused")
	private static final Logger logger = Logger.getLogger(CrfUtilities.class);
}