package renaming.tools;

import static com.google.common.base.Preconditions.checkArgument;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.Collection;
import java.util.logging.Logger;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;

import renaming.ngram.IdentifierNeighborsNGramLM;
import codemining.java.tokenizers.JavaTokenizer;
import codemining.languagetools.ITokenizer;
import codemining.lm.ngram.AbstractNGramLM;
import codemining.lm.ngram.smoothing.StupidBackoff;
import codemining.util.serialization.ISerializationStrategy.SerializationException;
import codemining.util.serialization.Serializer;

/**
 * Command-line tool for building an n-gram language model (LM).
 *
 * <p>
 * The tool trains an {@link IdentifierNeighborsNGramLM} on the files found
 * under a training directory, wraps it in a {@link StupidBackoff} smoother
 * and serializes the result to a file.
 *
 * @author Miltos Allamanis <[email protected]>
 *
 */
public class IdentifierNGramModelBuilder {

	private static final Logger LOGGER = Logger
			.getLogger(IdentifierNGramModelBuilder.class.getName());

	/**
	 * Entry point. Expects three required options:
	 * <ul>
	 * <li>{@code -n}: the n-gram size, a positive integer;</li>
	 * <li>{@code -t} / {@code --trainDir}: the directory containing the
	 * training files;</li>
	 * <li>{@code -o} / {@code --output}: the file to write the serialized
	 * model to.</li>
	 * </ul>
	 * If the options cannot be parsed, the parser's message is printed to
	 * standard error, the usage help is printed, and the method returns
	 * without building anything.
	 *
	 * @param args
	 *            the command-line arguments
	 * @throws IllegalArgumentException
	 *             if the n-gram size is not a positive integer or the
	 *             training path is not a directory
	 * @throws SerializationException
	 *             presumably propagated from the serializer when the model
	 *             cannot be written
	 * @throws IOException
	 *             declared for the calls made here; the raising call is not
	 *             identifiable from this file alone
	 * @throws InstantiationException
	 *             see {@code IOException}
	 * @throws IllegalAccessException
	 *             see {@code IOException}
	 * @throws ClassNotFoundException
	 *             see {@code IOException}
	 * @throws InvocationTargetException
	 *             see {@code IOException}
	 * @throws NoSuchMethodException
	 *             see {@code IOException}
	 * @throws SecurityException
	 *             see {@code IOException}
	 */
	public static void main(final String[] args)
			throws InstantiationException, IllegalAccessException,
			ClassNotFoundException, IOException, IllegalArgumentException,
			InvocationTargetException, NoSuchMethodException,
			SecurityException, SerializationException {
		final CommandLineParser parser = new PosixParser();
		final Options options = buildOptions();

		final CommandLine parse;
		try {
			parse = parser.parse(options, args);
		} catch (ParseException ex) {
			// Bad or missing options: explain and show the usage, then stop.
			System.err.println(ex.getMessage());
			final HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp("buildlm", options);
			return;
		}

		final ITokenizer tokenizer = new JavaTokenizer();

		final int n = parseNGramOrder(parse.getOptionValue("n"));

		final File trainDirectory = new File(parse.getOptionValue("t"));
		// Fail early, and say which path was rejected.
		checkArgument(trainDirectory.isDirectory(),
				"The training path %s is not an existing directory.",
				trainDirectory.getAbsolutePath());

		final String targetSerFile = parse.getOptionValue("o");

		final IdentifierNeighborsNGramLM dict = new IdentifierNeighborsNGramLM(
				n, tokenizer);

		LOGGER.info("NGramLM Model builder started with " + n
				+ "-gram for files in " + trainDirectory.getAbsolutePath());

		// The model supplies the file filter; DirectoryFileFilter.DIRECTORY
		// is passed as the directory filter.
		final Collection<File> files = FileUtils.listFiles(trainDirectory,
				dict.modelledFilesFilter(), DirectoryFileFilter.DIRECTORY);
		dict.trainModel(files);

		LOGGER.info("Ngram model build. Adding Smoother...");
		final AbstractNGramLM ng = new StupidBackoff(dict);

		LOGGER.info("Ngram model build. Serializing...");
		Serializer.getSerializer().serialize(ng, targetSerFile);
	}

	/**
	 * Declares the command-line options accepted by {@link #main(String[])}.
	 *
	 * @return the option set containing the required {@code n}, {@code t}
	 *         and {@code o} options
	 */
	private static Options buildOptions() {
		final Options options = new Options();

		options.addOption(OptionBuilder.isRequired(true)
				.withDescription("n-gram n parameter. The size of n.").hasArg()
				.create("n"));

		options.addOption(OptionBuilder
				.isRequired(true)
				.withLongOpt("trainDir")
				.hasArg()
				.withDescription("The directory containing the training files.")
				.create("t"));

		options.addOption(OptionBuilder.isRequired(true).withLongOpt("output")
				.hasArg()
				.withDescription("File to output the serialized n-gram model.")
				.create("o"));

		return options;
	}

	/**
	 * Converts the value of the {@code -n} option to a positive n-gram size.
	 *
	 * @param value
	 *            the raw option value
	 * @return the parsed n-gram size, always greater than zero
	 * @throws NumberFormatException
	 *             if {@code value} is not a valid integer
	 * @throws IllegalArgumentException
	 *             if the parsed value is zero or negative
	 */
	private static int parseNGramOrder(final String value) {
		final int n;
		try {
			n = Integer.parseInt(value);
		} catch (NumberFormatException e) {
			// Same exception type as before, but naming the offending option.
			final NumberFormatException explained = new NumberFormatException(
					"The n-gram size (-n) must be an integer, but was \""
							+ value + "\".");
			explained.initCause(e);
			throw explained;
		}
		checkArgument(n > 0,
				"The n-gram size (-n) must be positive, but was %s.", n);
		return n;
	}
}