java source code of DistinctTokenCount

/**
 * 
 */
package codemining.languagetools.tui;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.logging.Logger;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import org.apache.commons.io.filefilter.RegexFileFilter;
import org.apache.commons.lang.exception.ExceptionUtils;

import codemining.languagetools.ITokenizer;
import codemining.languagetools.TokenizerUtils;

import com.google.common.collect.Multiset.Entry;
import com.google.common.collect.TreeMultiset;

/**
 * Print to stdout the total count of all unique tokens in the text.
 * 
 * Used to answer the question: Do we have a zipf-ian distribution of tokens in
 * Java Code?
 * 
 * @author Miltos Allamanis <[email protected]>
 * 
 */
public class DistinctTokenCount {

	private static final Logger LOGGER = Logger
			.getLogger(DistinctTokenCount.class.getName());

	/**
	 * @param args
	 * @throws ClassNotFoundException
	 * @throws IllegalAccessException
	 * @throws InstantiationException
	 */
	public static void main(final String[] args) throws InstantiationException,
			IllegalAccessException, ClassNotFoundException {

		if (args.length != 2) {
			System.err.println("Usage: <directory> <tokenizerClass>");
			return;
		}

		final DistinctTokenCount tokCount = new DistinctTokenCount(args[1]);
		for (final File fi : FileUtils.listFiles(new File(args[0]),
				new RegexFileFilter(".*\\.java$"),
				DirectoryFileFilter.DIRECTORY)) {
			try {
				tokCount.addTokens(fi);
			} catch (final IOException e) {
				LOGGER.warning(ExceptionUtils.getFullStackTrace(e));
			}
		}

		tokCount.printCounts();
	}

	private final TreeMultiset<String> allTokens = TreeMultiset.create();

	private final ITokenizer tokenizer;

	public DistinctTokenCount(final String tokenizerClass)
			throws InstantiationException, IllegalAccessException,
			ClassNotFoundException {
		tokenizer = TokenizerUtils.tokenizerForClass(tokenizerClass);
	}

	public void addTokens(final File file) throws IOException {
		LOGGER.finer("Reading file " + file.getAbsolutePath());
		final char[] code = FileUtils.readFileToString(file).toCharArray();
		final List<String> tokens = tokenizer.tokenListFromCode(code);
		allTokens.addAll(tokens);

	}

	/**
	 * Prints the counts.
	 */
	public void printCounts() {
		for (final Entry<String> token : allTokens.entrySet()) {
			System.out.println(token.getCount());
		}
	}

}