package slp.core.lexing.util;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.stream.Collectors;

import slp.core.lexing.Lexer;
import slp.core.lexing.code.JavaLexer;
import slp.core.translating.Vocabulary;
import slp.core.translating.VocabularyRunner;
import slp.core.util.Util;

/**
 * Converts trees of Java source files into "ints" files: whitespace-separated
 * vocabulary indices, chunked into lines of roughly {@link #LINE_CUTOFF}
 * tokens, each line terminated by the {@link Vocabulary#EOS} marker.
 * <p>
 * Including for legacy reasons only!
 *
 * @author Vincent Hellendoorn
 */
public class IntsCreator {

    /**
     * Target number of indices per output line. A line may run up to 10% over
     * this before being split, so no tiny trailing chunk is emitted.
     */
    private static final int LINE_CUTOFF = 1000;

    /**
     * Entry point. Expects one argument: a root directory containing
     * {@code Train}, {@code Test} and {@code Valid} sub-directories. Writes
     * {@code ix-train}, {@code ix-test} and {@code ix-valid} files into the
     * same root.
     *
     * @param args {@code args[0]} is the dataset root directory
     * @throws IOException if lexing the sources or writing an index file fails
     */
    public static void main(String[] args) throws IOException {
        // Fail with a clear usage message instead of ArrayIndexOutOfBoundsException.
        if (args.length < 1) {
            System.err.println("Usage: IntsCreator <root-dir>");
            return;
        }
        File root = new File(args[0]);
        File trainDir = new File(root, "Train");
        File testDir = new File(root, "Test");
        File validDir = new File(root, "Valid");
        File trainOut = new File(root, "ix-train");
        File testOut = new File(root, "ix-test");
        File validOut = new File(root, "ix-valid");
        // NOTE(review): "vocab.out" is resolved against the current working
        // directory, not against root -- confirm that is intentional.
        VocabularyRunner.read(new File("vocab.out"));
        writeIXs(trainDir, trainOut);
        writeIXs(testDir, testOut);
        writeIXs(validDir, validOut);
    }

    /**
     * Lexes every file under {@code dir}, translates tokens to vocabulary
     * indices, and writes them to {@code out} in LINE_CUTOFF-sized lines.
     *
     * @param dir directory whose files are lexed (recursively, via Util.getFiles)
     * @param out destination index file, written as UTF-8
     * @throws IOException if reading a source file or writing the output fails
     */
    private static void writeIXs(File dir, File out) throws IOException {
        Lexer lexer = new JavaLexer();
        // NOTE(review): this is a fresh, empty Vocabulary, while main() reads
        // "vocab.out" via VocabularyRunner -- verify the read vocabulary is the
        // one actually consulted by toIndices, otherwise the read is dead code.
        Vocabulary vocabulary = new Vocabulary();
        // Files.newBufferedWriter pins the output charset to UTF-8 and buffers
        // the many small append() calls; a bare FileWriter would use the
        // platform-default charset (pre-Java 18) and be unbuffered.
        try (Writer fw = Files.newBufferedWriter(out.toPath(), StandardCharsets.UTF_8)) {
            List<File> files = Util.getFiles(dir);
            for (File file : files) {
                List<String> tokens = lexer.lexFile(file)
                        .flatMap(vocabulary::toIndices)
                        .map(String::valueOf)
                        .collect(Collectors.toList());
                // Split into LINE_CUTOFF-sized lines, but let the final chunk
                // run up to 10% over so we never emit a tiny trailing line.
                while (tokens.size() > 1.1 * LINE_CUTOFF) {
                    write(fw, tokens.subList(0, LINE_CUTOFF));
                    tokens = tokens.subList(LINE_CUTOFF, tokens.size());
                }
                write(fw, tokens);
            }
        }
    }

    /**
     * Writes one output line: each token followed by a single space, then the
     * end-of-sentence marker and a newline.
     *
     * @param fw     open writer for the index file
     * @param tokens stringified vocabulary indices for this line
     * @throws IOException if the underlying writer fails
     */
    private static void write(Writer fw, List<String> tokens) throws IOException {
        for (String token : tokens) {
            fw.append(token);
            fw.append(" ");
        }
        fw.append(Vocabulary.EOS);
        fw.append("\n");
    }
}