codemining.languagetools.ITokenizer Java Examples

The following examples show how to use codemining.languagetools.ITokenizer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CommonNameRenamingEvaluator.java    From naturalize with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Command-line entry point: evaluates a folder with a
 * {@code CommonNameRenamingEvaluator} and prints the results.
 *
 * @param args
 *            {@code args[0]} is the folder to evaluate; every argument after
 *            it is added to the set of junk names passed to the evaluator.
 *            With fewer than five arguments only the usage line is printed.
 */
public static void main(String[] args) {
	if (args.length < 5) {
		System.err.println("Usage <folder> <junkNames> ...");
		return;
	}

	final File folder = new File(args[0]);
	final ITokenizer javaTokenizer = new JavaTokenizer();
	final Class<? extends AbstractNGramLM> lmClass = StupidBackoff.class;

	// Everything after the folder argument is treated as a junk name.
	final Set<String> junkNames = Sets.newTreeSet(Arrays.asList(args)
			.subList(1, args.length));

	final CommonNameRenamingEvaluator renamingEvaluator = new CommonNameRenamingEvaluator(
			folder, javaTokenizer, lmClass, junkNames);
	renamingEvaluator.evaluate();
	renamingEvaluator.printResults();
}
 
Example #2
Source File: ScopedIdentifierRenaming.java    From api-mining with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Crudely rename the name of an identifier by searching for similarly named
 * tokens.
 *
 * The snippet is tokenized; each token that is a key of the mapping is
 * replaced by its mapped value, sentence start/end marker tokens are dropped,
 * and all other tokens are kept. Every emitted token is followed by a single
 * space, so a non-empty result ends with a space.
 *
 * @param snippet
 *            the code whose tokens are rewritten
 * @param variableMapping
 *            from, to
 * @return the space-separated token text with mapped tokens replaced
 */
private String renameVariableInSnippet(final String snippet,
		final Map<String, String> variableMapping) {
	final List<String> tokens = tokenizer.tokenListFromCode(snippet
			.toCharArray());

	// The accumulator never leaves this method, so the unsynchronized
	// StringBuilder is used instead of StringBuffer.
	final StringBuilder renamed = new StringBuilder();
	for (final String token : tokens) {
		final String emitted;
		// The mapping is consulted first, exactly as before, so a mapped
		// marker token would still be replaced rather than dropped.
		if (variableMapping.containsKey(token)) {
			emitted = variableMapping.get(token);
		} else if (token.equals(ITokenizer.SENTENCE_START)
				|| token.equals(ITokenizer.SENTENCE_END)) {
			continue;
		} else {
			emitted = token;
		}
		renamed.append(emitted).append(" ");
	}
	return renamed.toString();
}
 
Example #3
Source File: JavaASTExtractor.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Hacky way to compare snippets: re-emits the snippet as its Java tokens,
 * leaving out the sentence start/end marker tokens.
 *
 * @param snippet
 *            the code to normalize
 * @return the token text, each emitted token followed by one space (so a
 *         non-empty result ends with a space)
 */
private String normalizeCode(final char[] snippet) {
	final List<String> tokens = (new JavaTokenizer())
			.tokenListFromCode(snippet);

	// The accumulator never leaves this method, so the unsynchronized
	// StringBuilder is used instead of StringBuffer.
	final StringBuilder normalized = new StringBuilder();
	for (final String token : tokens) {
		final boolean isMarker = token.equals(ITokenizer.SENTENCE_START)
				|| token.equals(ITokenizer.SENTENCE_END);
		if (!isMarker) {
			normalized.append(token).append(" ");
		}
	}
	return normalized.toString();
}
 
Example #4
Source File: HumanEvaluationOutput.java    From naturalize with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Command-line entry point: builds a {@code HumanEvaluationOutput} from the
 * arguments and calls {@code getOutput()} on it.
 *
 * @param args
 *            project directory, tokenizer class name, scope extractor name,
 *            number of examples, renamer class name and, optionally, a
 *            renamer parameter ({@code null} is passed when exactly five
 *            arguments are given)
 * @throws IllegalAccessException
 *             if the tokenizer class cannot be instantiated reflectively
 * @throws InstantiationException
 *             if the tokenizer class cannot be instantiated reflectively
 * @throws ClassNotFoundException
 *             if the tokenizer class name cannot be resolved
 */
public static void main(String[] args) throws InstantiationException,
		IllegalAccessException, ClassNotFoundException {
	if (args.length < 5) {
		System.err
				.println("Usage: <projectDir> <tokenizerClass> variable|method|class examplesToGenerate <renamerClass> [renamerParams]");
		return;
	}

	final File projectDir = new File(args[0]);
	final IScopeExtractor scopeExtractor = ScopesTUI
			.getScopeExtractorByName(args[2]);
	final long examplesToGenerate = Long.parseLong(args[3]);

	// The tokenizer is created reflectively from its class name.
	final Class<? extends ITokenizer> tokenizerClass = (Class<? extends ITokenizer>) Class
			.forName(args[1]);
	final ITokenizer tokenizer = tokenizerClass.newInstance();

	final String renamerClass = args[4];
	final String renamerParams = args.length == 5 ? null : args[5];

	final HumanEvaluationOutput output = new HumanEvaluationOutput(
			projectDir, scopeExtractor, renamerClass, renamerParams,
			tokenizer, examplesToGenerate);
	output.getOutput();
}
 
Example #5
Source File: SegmentRenamingSuggestion.java    From naturalize with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Ranks variable renaming suggestions for one file, using a renaming model
 * built from the other files found under a directory.
 *
 * @param currentFile
 *            the file to rank suggestions for; it is removed from the
 *            training files before the model is built
 * @param directory
 *            where training files are listed, using the Java tokenizer's
 *            file filter
 * @param useUNK
 *            passed through to the {@code SegmentRenamingSuggestion}
 *            constructor
 * @return the result of {@code rankSuggestions(currentFile)}
 * @throws IOException
 *             declared by this method's signature
 */
public static SortedSet<Suggestion> getVariableSuggestions(
		final File currentFile, final File directory, final boolean useUNK)
		throws IOException {
	final ITokenizer javaTokenizer = new JavaTokenizer();
	final AbstractIdentifierRenamings renamingModel = new BaseIdentifierRenamings(
			javaTokenizer);

	// Train on every listed file except the one being evaluated.
	final Collection<File> otherFiles = FileUtils.listFiles(directory,
			javaTokenizer.getFileFilter(), DirectoryFileFilter.DIRECTORY);
	otherFiles.remove(currentFile);
	renamingModel.buildRenamingModel(otherFiles);

	return new SegmentRenamingSuggestion(renamingModel,
			new VariableScopeExtractor.VariableScopeSnippetExtractor(),
			useUNK).rankSuggestions(currentFile);
}
 
Example #6
Source File: SegmentRenamingSuggestion.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Command-line entry point: trains a reflectively created renamer on a
 * directory and prints the ranked suggestions for a test file.
 *
 * @param args
 *            test file, training directory, renamer class name (must have a
 *            constructor taking an {@code ITokenizer}) and scope extractor
 *            name
 */
public static void main(final String[] args)
		throws IllegalArgumentException, SecurityException,
		InstantiationException, IllegalAccessException,
		InvocationTargetException, NoSuchMethodException,
		ClassNotFoundException, IOException {
	if (args.length < 4) {
		System.err
				.println("Usage <TestFile> <TrainDirectory> <renamerClass> variable|method");
		return;
	}

	final ITokenizer javaTokenizer = new JavaTokenizer();

	// Instantiate the renamer through its (ITokenizer) constructor.
	final Class<?> renamerType = Class.forName(args[2]);
	final AbstractIdentifierRenamings renamingModel = (AbstractIdentifierRenamings) renamerType
			.getDeclaredConstructor(ITokenizer.class).newInstance(
					javaTokenizer);

	final Collection<File> trainingFiles = FileUtils.listFiles(new File(
			args[1]), javaTokenizer.getFileFilter(),
			DirectoryFileFilter.DIRECTORY);
	renamingModel.buildRenamingModel(trainingFiles);

	final IScopeExtractor extractor = ScopesTUI
			.getScopeExtractorByName(args[3]);
	final SegmentRenamingSuggestion ranker = new SegmentRenamingSuggestion(
			renamingModel, extractor, true);

	final File testFile = new File(args[0]);
	System.out.println(ranker.rankSuggestions(testFile));
}
 
Example #7
Source File: CDTTokenizer.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Converts a token string into a {@code FullToken}.
 *
 * The sentence start and end markers map to a {@code FullToken} built with
 * the marker as both constructor arguments. Any other string is tokenized
 * and the element at index 1 of the resulting list is returned (index 0 is
 * presumably the sentence-start marker -- TODO confirm).
 *
 * @param token
 *            the token text; must not be {@code null}
 * @return the corresponding {@code FullToken}
 */
@Override
public FullToken getTokenFromString(final String token) {
	final FullToken fullToken;
	if (token.equals(ITokenizer.SENTENCE_START)) {
		fullToken = new FullToken(ITokenizer.SENTENCE_START,
				ITokenizer.SENTENCE_START);
	} else if (token.equals(ITokenizer.SENTENCE_END)) {
		fullToken = new FullToken(ITokenizer.SENTENCE_END,
				ITokenizer.SENTENCE_END);
	} else {
		fullToken = getTokenListFromCode(token.toCharArray()).get(1);
	}
	return fullToken;
}
 
Example #8
Source File: JavaTokenizer.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Converts a token string into a {@code FullToken}.
 *
 * A sentence start or end marker yields a {@code FullToken} built with that
 * marker as both constructor arguments. Anything else is tokenized and the
 * element at index 1 of the token list is returned (index 0 is presumably
 * the sentence-start marker -- TODO confirm).
 *
 * @param token
 *            the token text; must not be {@code null}
 * @return the corresponding {@code FullToken}
 */
@Override
public FullToken getTokenFromString(final String token) {
	return token.equals(ITokenizer.SENTENCE_START)
			? new FullToken(ITokenizer.SENTENCE_START,
					ITokenizer.SENTENCE_START)
			: token.equals(ITokenizer.SENTENCE_END)
					? new FullToken(ITokenizer.SENTENCE_END,
							ITokenizer.SENTENCE_END)
					: getTokenListFromCode(token.toCharArray()).get(1);
}
 
Example #9
Source File: TokenizeJavaCodeTest.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Asserts that the first {@code TOKENS_SAMPLE2.length} tokens produced for
 * {@code CODE_SAMPLE2} match {@code TOKENS_SAMPLE2} position by position.
 * The total number of produced tokens is not checked.
 *
 * @param tokenizer
 *            the tokenizer under test
 */
protected void testSample2(ITokenizer tokenizer) {
	final List<String> producedTokens = tokenizer
			.tokenListFromCode(CODE_SAMPLE2);

	int index = 0;
	while (index < TOKENS_SAMPLE2.length) {
		assertEquals(producedTokens.get(index), TOKENS_SAMPLE2[index]);
		index++;
	}
}
 
Example #10
Source File: TokenizeJavaCodeTest.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Checks the {@code FullToken} that a {@code JavaTokenizer} returns for an
 * identifier and for an opening brace.
 */
@Test
public void testTokenTypes() {
	final ITokenizer javaTokenizer = new JavaTokenizer();

	// An identifier carries the tokenizer's identifier type.
	final ITokenizer.FullToken identifierToken = javaTokenizer
			.getTokenFromString("hello");
	final ITokenizer.FullToken identifierReference = new ITokenizer.FullToken(
			"hello", javaTokenizer.getIdentifierType());
	assertEquals(identifierToken, identifierReference);

	// A brace carries the LBRACE terminal symbol id, rendered as a string.
	final ITokenizer.FullToken braceToken = javaTokenizer
			.getTokenFromString("{");
	final String lbraceType = Integer
			.toString(ITerminalSymbols.TokenNameLBRACE);
	assertEquals(braceToken, new ITokenizer.FullToken("{", lbraceType));
}
 
Example #11
Source File: TokenizeJavascriptCodeTest.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Asserts that tokenizing {@code CODE_SAMPLE3} yields exactly the tokens in
 * {@code TOKENS_SAMPLE1}: same content at each position and same count.
 *
 * @param tokenizer
 *            the tokenizer under test
 */
protected void testSample3(final ITokenizer tokenizer) {
	final List<String> producedTokens = tokenizer
			.tokenListFromCode(CODE_SAMPLE3);

	int index = 0;
	while (index < TOKENS_SAMPLE1.length) {
		assertEquals(producedTokens.get(index), TOKENS_SAMPLE1[index]);
		index++;
	}

	// No extra tokens may follow the expected ones.
	assertEquals(producedTokens.size(), TOKENS_SAMPLE1.length);
}
 
Example #12
Source File: TokenizeJavaCodeTest.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Asserts that tokenizing {@code CODE_SAMPLE3} yields exactly the tokens in
 * {@code TOKENS_SAMPLE1}: same content at each position and same count.
 *
 * @param tokenizer
 *            the tokenizer under test
 */
protected void testSample3(ITokenizer tokenizer) {
	final List<String> producedTokens = tokenizer
			.tokenListFromCode(CODE_SAMPLE3);
	final int expectedCount = TOKENS_SAMPLE1.length;

	for (int pos = 0; pos < expectedCount; pos++) {
		assertEquals(producedTokens.get(pos), TOKENS_SAMPLE1[pos]);
	}

	// No extra tokens may follow the expected ones.
	assertEquals(producedTokens.size(), TOKENS_SAMPLE1.length);
}
 
Example #13
Source File: DynamicRangeEval.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Creates an evaluator over the files listed under a directory.
 *
 * @param directory
 *            where files are listed, using the tokenizer's file filter
 * @param tokenizer
 *            used here only for its file filter
 * @param ex
 *            the scope extractor stored for later use
 */
public DynamicRangeEval(final File directory, final ITokenizer tokenizer,
		final IScopeExtractor ex) {
	this.scopeExtractor = ex;
	this.allFiles = FileUtils.listFiles(directory,
			tokenizer.getFileFilter(), DirectoryFileFilter.DIRECTORY);
}
 
Example #14
Source File: PerturbationEvaluator.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Creates an evaluator over the files listed under a directory.
 *
 * @param directory
 *            where files are listed, using the tokenizer's file filter
 * @param tokenizer
 *            stored, and used for its file filter
 * @param scopeExtractor
 *            stored, and handed to the {@code ScopedIdentifierRenaming}
 *            created here
 * @param renamerClass
 *            stored as given
 */
public PerturbationEvaluator(final File directory,
		final ITokenizer tokenizer, final IScopeExtractor scopeExtractor,
		final String renamerClass) {
	this.tokenizer = tokenizer;
	this.scopeExtractor = scopeExtractor;
	this.renamerClass = renamerClass;

	this.allFiles = FileUtils.listFiles(directory,
			tokenizer.getFileFilter(), DirectoryFileFilter.DIRECTORY);
	this.varRenamer = new ScopedIdentifierRenaming(scopeExtractor,
			ParseType.COMPILATION_UNIT);
}
 
Example #15
Source File: JavascriptTokenizer.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Converts a token string into a {@code FullToken}.
 *
 * When the string is the sentence start or end marker, the result is a
 * {@code FullToken} built with that marker as both constructor arguments.
 * Otherwise the string is tokenized and the element at index 1 of the list
 * is returned (index 0 is presumably the sentence-start marker -- TODO
 * confirm).
 *
 * @param token
 *            the token text; must not be {@code null}
 * @return the corresponding {@code FullToken}
 */
@Override
public FullToken getTokenFromString(final String token) {
	// Work out which sentence marker, if any, the string represents.
	final String marker;
	if (token.equals(ITokenizer.SENTENCE_START)) {
		marker = ITokenizer.SENTENCE_START;
	} else if (token.equals(ITokenizer.SENTENCE_END)) {
		marker = ITokenizer.SENTENCE_END;
	} else {
		return getTokenListFromCode(token.toCharArray()).get(1);
	}
	return new FullToken(marker, marker);
}
 
Example #16
Source File: JavaMethodDeclarationBindingExtractor.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Creates a binding extractor.
 *
 * @param tokenizer
 *            passed to the superclass constructor
 * @param includeOverrides
 *            stored as given; when {@code false}, project type information
 *            is built from {@code inputFolder}, otherwise it is left
 *            {@code null}
 * @param inputFolder
 *            the folder used to build the project type information
 */
public JavaMethodDeclarationBindingExtractor(final ITokenizer tokenizer,
		final boolean includeOverrides, final File inputFolder) {
	super(tokenizer);
	this.includeOverrides = includeOverrides;
	// Type information is only built when overrides are excluded.
	pti = includeOverrides ? null
			: buildProjectTypeInformation(inputFolder);
}
 
Example #17
Source File: JavaTokenizer.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Converts a token string into a {@code FullToken}.
 *
 * A sentence start or end marker yields a {@code FullToken} built with that
 * marker as both constructor arguments. Anything else is tokenized and the
 * element at index 1 of the token list is returned (index 0 is presumably
 * the sentence-start marker -- TODO confirm).
 *
 * @param token
 *            the token text; must not be {@code null}
 * @return the corresponding {@code FullToken}
 */
@Override
public FullToken getTokenFromString(final String token) {
	final boolean isStart = token.equals(ITokenizer.SENTENCE_START);
	if (isStart) {
		return new FullToken(ITokenizer.SENTENCE_START,
				ITokenizer.SENTENCE_START);
	}
	return token.equals(ITokenizer.SENTENCE_END)
			? new FullToken(ITokenizer.SENTENCE_END,
					ITokenizer.SENTENCE_END)
			: getTokenListFromCode(token.toCharArray()).get(1);
}
 
Example #18
Source File: CommonNameRenamingEvaluator.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Creates an evaluator over the files listed under a directory.
 *
 * @param directory
 *            where files are listed, using the tokenizer's file filter
 * @param tokenizer
 *            stored, and used for its file filter
 * @param smoothedNgramClass
 *            stored as given
 * @param junkVariables
 *            stored as given
 */
public CommonNameRenamingEvaluator(File directory, ITokenizer tokenizer,
		Class<? extends AbstractNGramLM> smoothedNgramClass,
		Set<String> junkVariables) {
	this.tokenizer = tokenizer;
	this.smoothedNgramClass = smoothedNgramClass;
	this.junkVariables = junkVariables;
	this.allFiles = FileUtils.listFiles(directory,
			tokenizer.getFileFilter(), DirectoryFileFilter.DIRECTORY);
}
 
Example #19
Source File: CppWhitespaceTokenizer.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Converts a token string into a {@code FullToken}.
 *
 * The sentence start and end markers map to a {@code FullToken} built from
 * the marker constants. Any other string is tokenized and the element at
 * index 1 of the resulting list is returned (index 0 is presumably the
 * sentence-start marker -- TODO confirm).
 *
 * @param token
 *            the token text; must not be {@code null}
 * @return the corresponding {@code FullToken}
 */
@Override
public FullToken getTokenFromString(final String token) {
	final FullToken fullToken;
	if (token.equals(ITokenizer.SENTENCE_START)) {
		fullToken = new FullToken(ITokenizer.SENTENCE_START, SENTENCE_START);
	} else if (token.equals(ITokenizer.SENTENCE_END)) {
		fullToken = new FullToken(ITokenizer.SENTENCE_END, SENTENCE_END);
	} else {
		fullToken = getTokenListFromCode(token.toCharArray()).get(1);
	}
	return fullToken;
}
 
Example #20
Source File: BaseIdentifierRenamings.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * 
 */
public BaseIdentifierRenamings(final ITokenizer tokenizer) {
	super();
	this.tokenizer = tokenizer;
	try {
		smoothedNgramClass = (Class<? extends AbstractNGramLM>) Class
				.forName(SettingsLoader.getStringSetting(
						"ngramSmootherClass",
						"codemining.lm.ngram.smoothing.StupidBackoff"));
	} catch (final ClassNotFoundException e) {
		LOGGER.severe(ExceptionUtils.getFullStackTrace(e));
		throw new IllegalArgumentException(e);
	}
}
 
Example #21
Source File: LeaveOneOutEvaluator.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public LeaveOneOutEvaluator(final File directory,
		final ITokenizer tokenizer,
		final Class<? extends AbstractNGramLM> smoother) {
	allFiles = FileUtils.listFiles(directory, tokenizer.getFileFilter(),
			DirectoryFileFilter.DIRECTORY);
	this.tokenizer = tokenizer;
	for (int i = 0; i < data.length; i++) {
		data[i] = new ResultObject();
	}
}
 
Example #22
Source File: LeaveOneOutEvaluator.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Command-line entry point: builds a {@code LeaveOneOutEvaluator} from the
 * arguments and runs {@code performEvaluation} on it.
 *
 * @param args
 *            folder, tokenizer class name, n-gram language model class name
 *            (called {@code <wrapperClass>} in the usage line), scope
 *            extractor name, renaming class name and, optionally, renamer
 *            constructor parameters. Only the first optional parameter
 *            ({@code args[5]}) is forwarded; {@code null} is passed when
 *            there is none.
 * @throws IllegalAccessException
 *             if the tokenizer class cannot be instantiated reflectively
 * @throws InstantiationException
 *             if the tokenizer class cannot be instantiated reflectively
 * @throws ClassNotFoundException
 *             if the tokenizer or language model class name cannot be
 *             resolved
 * @throws SerializationException
 *             declared by this method's signature
 */
public static void main(String[] args) throws InstantiationException,
		IllegalAccessException, ClassNotFoundException,
		SerializationException {
	if (args.length < 5) {
		System.err
				.println("Usage <folder> <tokenizerClass> <wrapperClass> variable|method <renamingClass> [<renamerConstrParams> ..]");
		return;
	}

	final File directory = new File(args[0]);

	final Class<? extends ITokenizer> tokenizerName = (Class<? extends ITokenizer>) Class
			.forName(args[1]);
	final ITokenizer tokenizer = tokenizerName.newInstance();

	final Class<? extends AbstractNGramLM> smoothedNgramClass = (Class<? extends AbstractNGramLM>) Class
			.forName(args[2]);

	final LeaveOneOutEvaluator eval = new LeaveOneOutEvaluator(directory,
			tokenizer, smoothedNgramClass);

	final IScopeExtractor scopeExtractor = ScopesTUI
			.getScopeExtractorByName(args[3]);

	// Forward the first optional parameter whenever one is present. Testing
	// for exactly six arguments would silently drop it as soon as further
	// parameters follow, which the usage line explicitly allows.
	final String renamerParams = args.length > 5 ? args[5] : null;
	eval.performEvaluation(scopeExtractor, args[4], renamerParams);
}
 
Example #23
Source File: TokenizeJavaCodeTest.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Asserts that the position-to-token map produced for {@code CODE_SAMPLE1}
 * has, at every position in {@code TOKEN_POS_SAMPLE1}, the token at the same
 * index of {@code TOKENS_SAMPLE1}, and that it has no other entries.
 *
 * @param tokenizer
 *            the tokenizer under test
 */
protected void testSample1Position(ITokenizer tokenizer) {
	final Map<Integer, String> tokensByPosition = tokenizer
			.tokenListWithPos(CODE_SAMPLE1);

	int index = 0;
	while (index < TOKEN_POS_SAMPLE1.length) {
		assertTrue(tokensByPosition.containsKey(TOKEN_POS_SAMPLE1[index]));
		assertEquals(tokensByPosition.get(TOKEN_POS_SAMPLE1[index]),
				TOKENS_SAMPLE1[index]);
		index++;
	}

	// The map must hold exactly one entry per expected token.
	assertEquals(tokensByPosition.size(), TOKENS_SAMPLE1.length);
}
 
Example #24
Source File: SelectionSuggestionEval.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Creates an evaluator over the files listed under a directory.
 *
 * @param directory
 *            where files are listed, using the tokenizer's file filter
 * @param codeTokenizer
 *            stored, and used for its file filter
 * @param extractor
 *            the scope extractor stored for later use
 */
public SelectionSuggestionEval(final File directory,
		final ITokenizer codeTokenizer, final IScopeExtractor extractor) {
	this.tokenizer = codeTokenizer;
	this.scopeExtractor = extractor;
	this.allFiles = FileUtils.listFiles(directory,
			codeTokenizer.getFileFilter(), DirectoryFileFilter.DIRECTORY);
}
 
Example #25
Source File: TokenizeJavascriptCodeTest.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Checks the {@code FullToken} that a {@code JavascriptTokenizer} returns
 * for an identifier and for an opening brace.
 */
@Test
public void testTokenTypes() {
	final ITokenizer jsTokenizer = new JavascriptTokenizer();

	// An identifier carries the tokenizer's identifier type.
	final ITokenizer.FullToken identifierToken = jsTokenizer
			.getTokenFromString("hello");
	final ITokenizer.FullToken identifierReference = new ITokenizer.FullToken(
			"hello", jsTokenizer.getIdentifierType());
	assertEquals(identifierToken, identifierReference);

	// A brace carries the LBRACE terminal symbol id, rendered as a string.
	final ITokenizer.FullToken braceToken = jsTokenizer
			.getTokenFromString("{");
	final String lbraceType = Integer
			.toString(ITerminalSymbols.TokenNameLBRACE);
	assertEquals(braceToken, new ITokenizer.FullToken("{", lbraceType));
}
 
Example #26
Source File: TokenizeJavascriptCodeTest.java    From api-mining with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Checks the {@code FullToken} that a {@code JavascriptTokenizer} returns
 * for an identifier and for an opening brace.
 */
@Test
public void testTokenTypes() {
	final ITokenizer jsTokenizer = new JavascriptTokenizer();

	// Identifier: expected type comes from the tokenizer itself.
	final ITokenizer.FullToken helloToken = jsTokenizer
			.getTokenFromString("hello");
	assertEquals(helloToken, new ITokenizer.FullToken("hello",
			jsTokenizer.getIdentifierType()));

	// Brace: expected type is the LBRACE terminal symbol id as a string.
	final ITokenizer.FullToken lbraceToken = jsTokenizer
			.getTokenFromString("{");
	assertEquals(lbraceToken, new ITokenizer.FullToken("{",
			Integer.toString(ITerminalSymbols.TokenNameLBRACE)));
}
 
Example #27
Source File: TokenizeJavascriptCodeTest.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Asserts that the first {@code TOKENS_SAMPLE2.length} tokens produced for
 * {@code CODE_SAMPLE2} match {@code TOKENS_SAMPLE2} position by position.
 * The total number of produced tokens is not checked.
 *
 * @param tokenizer
 *            the tokenizer under test
 */
protected void testSample2(final ITokenizer tokenizer) {
	final List<String> producedTokens = tokenizer
			.tokenListFromCode(CODE_SAMPLE2);
	final int expectedCount = TOKENS_SAMPLE2.length;

	for (int pos = 0; pos < expectedCount; pos++) {
		assertEquals(producedTokens.get(pos), TOKENS_SAMPLE2[pos]);
	}
}
 
Example #28
Source File: TokenizeJavascriptCodeTest.java    From api-mining with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Asserts that the position-to-token map produced for {@code CODE_SAMPLE1}
 * has, at every position in {@code TOKEN_POS_SAMPLE1}, the token at the same
 * index of {@code TOKENS_SAMPLE1}, and that it has no other entries.
 *
 * @param tokenizer
 *            the tokenizer under test
 */
protected void testSample1Position(final ITokenizer tokenizer) {
	final Map<Integer, String> tokensByPosition = tokenizer
			.tokenListWithPos(CODE_SAMPLE1);
	final int expectedCount = TOKEN_POS_SAMPLE1.length;

	for (int pos = 0; pos < expectedCount; pos++) {
		assertTrue(tokensByPosition.containsKey(TOKEN_POS_SAMPLE1[pos]));
		assertEquals(tokensByPosition.get(TOKEN_POS_SAMPLE1[pos]),
				TOKENS_SAMPLE1[pos]);
	}

	// The map must hold exactly one entry per expected token.
	assertEquals(tokensByPosition.size(), TOKENS_SAMPLE1.length);
}
 
Example #29
Source File: TokenizeJavaCodeTest.java    From api-mining with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Checks the {@code FullToken} that a {@code JavaTokenizer} returns for an
 * identifier and for an opening brace.
 */
@Test
public void testTokenTypes() {
	ITokenizer javaTokenizer = new JavaTokenizer();

	// Identifier: expected type comes from the tokenizer itself.
	final ITokenizer.FullToken helloToken = javaTokenizer
			.getTokenFromString("hello");
	assertEquals(helloToken, new ITokenizer.FullToken("hello",
			javaTokenizer.getIdentifierType()));

	// Brace: expected type is the LBRACE terminal symbol id as a string.
	final ITokenizer.FullToken lbraceToken = javaTokenizer
			.getTokenFromString("{");
	assertEquals(lbraceToken, new ITokenizer.FullToken("{",
			Integer.toString(ITerminalSymbols.TokenNameLBRACE)));
}
 
Example #30
Source File: TokenCounter.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Command-line entry point: counts the tokens of every file listed under a
 * directory and prints the total.
 *
 * Files that cannot be read are logged as warnings and skipped.
 *
 * @param args
 *            code directory and tokenizer class name; any other argument
 *            count only prints the usage line
 * @throws IOException
 *             declared by this method's signature
 * @throws ClassNotFoundException
 *             declared by this method's signature
 * @throws IllegalAccessException
 *             declared by this method's signature
 * @throws InstantiationException
 *             declared by this method's signature
 */
public static void main(final String[] args) throws IOException,
		InstantiationException, IllegalAccessException,
		ClassNotFoundException {
	if (args.length != 2) {
		System.err.println("Usage <codeDir> <TokenizerClass>");
		return;
	}

	final ITokenizer tokenizer = TokenizerUtils.tokenizerForClass(args[1]);
	long total = 0;

	for (final File sourceFile : FileUtils.listFiles(new File(args[0]),
			tokenizer.getFileFilter(), DirectoryFileFilter.DIRECTORY)) {
		try {
			final char[] contents = FileUtils.readFileToString(sourceFile)
					.toCharArray();
			final int withMarkers = tokenizer.tokenListFromCode(contents)
					.size();
			// Remove sentence start/end
			total += withMarkers - 2;
		} catch (final IOException readFailure) {
			LOGGER.warning(ExceptionUtils.getFullStackTrace(readFailure));
		}
	}

	System.out.println("Tokens: " + total);
}