package edu.uncc.cs.watsonsim;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import edu.stanford.nlp.util.CacheMap;

/**
 * String normalization and tokenization helpers.
 *
 * @author Jagan Vujjini
 */
public class StringUtils extends org.apache.commons.lang3.StringUtils {
	private static final Analyzer analyzer = new StandardAnalyzer();
	//private static Database db = new Database(); // Used for semantic distribution
	public static final int CONTEXT_LENGTH = 1000;
	
	//private static final int CONTEXT_HASH_COUNT = 20;
	private static final int CACHE_SIZE = 256;
	private static final CacheMap<String, ArrayList<Double>> context_cache_map = new CacheMap<>(CACHE_SIZE);
	
	/**
	 * Try to canonicalize a string somewhat conservatively.
	 * Specifically, we:
	 *    ignore case
	 *    ignore punctuation such as (){}\/[]—<>;:,."'“”‘’«»「」…-
	 *        Dropping the ' is debatable. The rest are generally
	 *        inaudible, so they would not have counted in a spoken
	 *        question anyway.
	 *    ignore stopwords and some stems
	 *        (the effect of Lucene's analysis filters).
	 */
	public static String canonicalize(String dirty) {
		dirty = dirty
				.toLowerCase()
				.replaceAll("[(){}\\\\/\\[\\]—<>;:,.\"'“”‘’«»「」…-]", "")
				.trim();
		
		StringBuilder clean = new StringBuilder();
		for (String token : tokenize(dirty)) {
			clean.append(token);
			clean.append(' ');
		}
		return clean.toString().trim();
	}
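	// Example (illustrative; the exact output depends on the analyzer's stop-word set):
	//   canonicalize("The \"Taj Mahal's\" Height")  =>  "taj mahals height"
	// Quotes and the apostrophe are stripped, "the" is stop-filtered, and tokens are lower-cased.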
	
	/**
	 * Remove all characters except alphanumerics and space.
	 * 
	 * This is a pretty wild thing to do since it clears out tons of usually
	 * useful stuff, like accent marks, punctuation, capitals..
	 */
	public static String sanitize(String input) {
		return input.replaceAll("[^A-Za-z0-9 ]", " ");
	}
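	// Example: every character outside [A-Za-z0-9 ] becomes a space, so
	//   sanitize("Hello, World!")  =>  "Hello  World "
	// (the comma and the '!' each turn into a single space; capitals and digits are kept).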
	
	/** Splits the given string into lower-cased tokens using the Lucene analyzer. */
	public static List<String> tokenize(String text) {
		List<String> tokens = new ArrayList<>();
		
		try (TokenStream tokenStream = analyzer.tokenStream("text", text)) {
			//TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(text));
			//tokenStream = new org.apache.lucene.analysis.core.StopFilter(Version.LUCENE_46, tokenStream, EnglishAnalyzer.getDefaultStopSet());
			CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
			
			// On the fence whether it is better to error here or not. Suggestions?
			tokenStream.reset();
		
			while (tokenStream.incrementToken()) {
				tokens.add(token.toString());
			}
		} catch (IOException e) {
			// If tokenization fails, just return whatever tokens we collected so far.
			e.printStackTrace();
		}
		return tokens;
	}
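	// Example (illustrative; whether "the" is dropped depends on the analyzer's stop-word set):
	//   tokenize("The Quick, Brown Fox")  =>  [quick, brown, fox]
	// Punctuation is discarded by the tokenizer and tokens are lower-cased.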
	
	/** Conservatively normalize a string while splitting it into tokens.
	 *  Unlike {@link #tokenize(String)} this does not use Lucene; it just
	 *  lower-cases and splits on whitespace and common punctuation. */
	public static List<String> conservativeTokenize(String text) {
		// '-' is escaped so it is a literal delimiter rather than accidentally
		// forming a character range (which would also have split on digits).
		String[] token_arr = text.toLowerCase().split("[ \t~`@#$%^&\\*\\(\\)_\\+\\-=\\{\\}\\[\\]:\";'<>\\?,./\\|\\\\]+");
		return Arrays.asList(token_arr);
	}
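	// Example: splitting is purely lexical, so apostrophes break words apart while digits survive:
	//   conservativeTokenize("IBM's Watson, 2011")  =>  [ibm, s, watson, 2011]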
	
	
	/** Returns true if every non-stopword token from candidate is found in reference */
	public static boolean matchSubset(String candidate, String reference) {
		// Match these two sets in linear (or linearithmic) time
		HashSet<String> reference_terms = new HashSet<>();
		reference_terms.addAll(StringUtils.tokenize(reference));
		return reference_terms.containsAll(StringUtils.tokenize(candidate));
	}
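	// Examples (illustrative, assuming English stop-word filtering in the analyzer):
	//   matchSubset("Taj Mahal", "the Taj Mahal in Agra")  =>  true
	//   matchSubset("the Taj Mahal in Agra", "Taj Mahal")  =>  false ("agra" is not in the reference)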
    
	/**
	 * Fetch and merge the phrase contexts from a database.
	 * "Safe" here means that on failure it may return an incomplete (or
	 * all-zero) vector, but it will not throw an exception.
	 * @param phrase
	 * @return the merged phrase vector, or a partial result if an error occurred.
	 */
	/*public static ArrayList<Double> getPhraseContextSafe(String phrase) {
		ArrayList<Double> merged_context = context_cache_map.get(	phrase);
		if (merged_context == null) {
			merged_context = new ArrayList<>();
			for (int i=0; i<CONTEXT_LENGTH; i++) merged_context.add(0.0);
			
			// Filter repeated words
			// word_set = S.toList $ S.fromList $ words phrase 
			PreparedStatement context_retriever = db.prep("SELECT context, count FROM rindex WHERE word == ?;");
			HashSet<String> word_set = new HashSet<String>();
			word_set.addAll(StringUtils.conservativeTokenize(phrase));
			
			// Sum the context vectors
			// foldl' (V.zipWith (+)) (V.replicate 1000) context_vectors
			try {
				for (String word : word_set) {
					context_retriever.setString(1, word);
					ResultSet sql_context = context_retriever.executeQuery();
					if (sql_context.next()) {
						java.nio.DoubleBuffer buffer = java.nio.ByteBuffer.wrap(sql_context.getBytes(1)).asDoubleBuffer();
						double total = 0;
						// Normalize each word so that they have the same weight when combined
						for (int i=0; i<CONTEXT_LENGTH; i++)
							total += buffer.get(i);
						for (int i=0; i<CONTEXT_LENGTH; i++)
							merged_context.set(i, merged_context.get(i) + (buffer.get(i) / total));
						
					}
				}
			} catch (SQLException e) {} // At worst, return what we have so far. Maybe nothing.
		}
		context_cache_map.put(phrase, merged_context);
		return merged_context;
	}*/
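	// Illustrative sketch of the merging rule above, with toy 3-dimensional vectors rather
	// than CONTEXT_LENGTH-sized ones: if "taj" has raw context counts [2, 0, 2] and "mahal"
	// has [0, 5, 5], each is first normalized to sum to 1 ([0.5, 0, 0.5] and [0, 0.5, 0.5])
	// and then summed, giving [0.5, 0.5, 1.0]; every word contributes equal total weight.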
    
	/**
	 * Find the cosine similarity between two vectors: x·y / (|x||y|),
	 * computed only over dimensions where at least one component exceeds
	 * 0.1 in magnitude. 1 means identical, 0 means orthogonal.
	 * Synonyms are usually between 0.6 and 0.8.
	 * @param vec1
	 * @param vec2
	 * @return a double between 0 and 1 for non-negative input vectors
	 */
	public static double getCosineSimilarity(ArrayList<Double> vec1, ArrayList<Double> vec2) {
		double xy = 0;
		double xsquared = 0;
		double ysquared = 0;
		int length = Math.min(vec1.size(), vec2.size());
		for (int i=0; i<length; i++) {
			double x = vec1.get(i);
			double y = vec2.get(i);
			// Ignore uncertain dimensions
			// This little kludge makes a big difference
			if (Math.max(Math.abs(x), Math.abs(y)) > 0.1) {
				xy += x * y;
				xsquared += x * x;
				ysquared += y * y;	
			}
		}
		// Double.MIN_NORMAL guards against division by zero when a vector is all (near-)zero
		return xy / (Math.sqrt(xsquared) * Math.sqrt(ysquared) + Double.MIN_NORMAL);
	}
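	// Worked example: for vec1 = [1.0, 0.0, 1.0] and vec2 = [1.0, 1.0, 0.0],
	// xy = 1, xsquared = 2, ysquared = 2, giving 1 / (sqrt(2) * sqrt(2)) ≈ 0.5.
	// (Every dimension exceeds the 0.1 threshold in at least one vector, so none are skipped.)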
}