codemining.languagetools.ITokenizer.FullToken Java Examples

The following examples show how to use codemining.languagetools.ITokenizer.FullToken. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TokenizerUtils.java    From api-mining with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Appends the text of every token to the supplied buffer, writing a single
 * space after each one. A trailing space is therefore left after the last
 * token.
 * 
 * @param tokens
 *            the tokens whose {@code token} field is appended, in list
 *            order
 * @param sb
 *            the buffer that is appended to; it is modified in place
 * @return the same {@code sb} instance that was passed in
 */
public final static StringBuffer joinFullTokens(
		final List<FullToken> tokens, final StringBuffer sb) {
	for (final FullToken fullToken : tokens) {
		sb.append(fullToken.token).append(" ");
	}
	return sb;
}
 
Example #2
Source File: TokenizerUtils.java    From api-mining with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Remove the sentence start/end FullTokens.
 * 
 * The first element must carry {@link ITokenizer#SENTENCE_START} and the
 * last element {@link ITokenizer#SENTENCE_END}. Both conditions are verified
 * before anything is removed, so a failed check leaves the list untouched.
 * 
 * @param tokenSequence
 *            the token list to strip in place; must contain at least the
 *            two marker tokens
 * @throws IllegalArgumentException
 *             if the list has fewer than two elements or either marker is
 *             missing (assuming {@code checkArgument} is Guava's
 *             Preconditions method)
 */
public static final void removeSentenceStartEndFullTokens(
		final List<FullToken> tokenSequence) {
	// A start and an end marker are two distinct elements.
	checkArgument(tokenSequence.size() >= 2,
			"Token sequence must contain sentence start and end tokens");
	final int lastIndex = tokenSequence.size() - 1;

	// Validate both ends first so that we never half-strip the list.
	checkArgument(tokenSequence.get(0).token
			.equals(ITokenizer.SENTENCE_START));
	checkArgument(tokenSequence.get(lastIndex).token
			.equals(ITokenizer.SENTENCE_END));

	// Drop the tail before the head so lastIndex is still valid.
	tokenSequence.remove(lastIndex);
	tokenSequence.remove(0);
}
 
Example #3
Source File: TokenizerUtils.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Remove the sentence start/end FullTokens.
 * 
 * Expects the first element's token to equal
 * {@link ITokenizer#SENTENCE_START} and the last element's token to equal
 * {@link ITokenizer#SENTENCE_END}. All checks run before any removal, so
 * the list is left unchanged when a check fails.
 * 
 * @param tokenSequence
 *            the list to modify in place; needs at least two elements
 * @throws IllegalArgumentException
 *             when the list is too short or a marker token is not where it
 *             is expected (assuming {@code checkArgument} is Guava's
 *             Preconditions method)
 */
public static final void removeSentenceStartEndFullTokens(
		final List<FullToken> tokenSequence) {
	final int size = tokenSequence.size();
	// Without two elements there cannot be both a start and an end marker.
	checkArgument(size >= 2,
			"Token sequence must contain sentence start and end tokens");

	// Check both markers up front; mutating between the checks would leave
	// the list partially stripped if the second check failed.
	checkArgument(tokenSequence.get(0).token
			.equals(ITokenizer.SENTENCE_START));
	checkArgument(tokenSequence.get(size - 1).token
			.equals(ITokenizer.SENTENCE_END));

	// Remove the last element first so the index computed above stays valid.
	tokenSequence.remove(size - 1);
	tokenSequence.remove(0);
}
 
Example #4
Source File: IdentifierNeighborsNGramLM.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Adds the n-grams of the given token list whose window contains at least
 * one token of the tokenizer's identifier type. N-grams of size one or less
 * are not added.
 * 
 * @param lst
 *            the tokens of one sentence, in order
 */
public void addRelevantNGrams(final List<FullToken> lst) {
	// Indices (into lst) of the tokens whose type is the identifier type.
	final SortedSet<Integer> identifierPositions = new TreeSet<Integer>();
	final List<String> sentence = Lists.newArrayList();

	int position = 0;
	for (final FullToken current : lst) {
		sentence.add(current.token);
		if (current.tokenType.equals(tokenizer.getIdentifierType())) {
			identifierPositions.add(position);
		}
		position++;
	}

	for (int end = 0; end < sentence.size(); end++) {
		// Identifier positions inside the window [end - n + 1, end]; subSet
		// treats its upper bound as exclusive, hence end + 1.
		final SortedSet<Integer> identifiersInWindow = identifierPositions
				.subSet(end - getN() + 1, end + 1);
		if (!identifiersInWindow.isEmpty()) {
			final NGram<String> ngram = NGram.constructNgramAt(end,
					sentence, getN());
			if (ngram.size() > 1) {
				addNgram(ngram, false);
			}
		}
	}
}
 
Example #5
Source File: IdentifierNeighborsNGramLM.java    From naturalize with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Tokenizes {@code codeFile} and feeds the resulting tokens to
 * {@code addRelevantNGrams}. An {@link IOException} is not propagated; its
 * stack trace is logged at warning level instead.
 */
@Override
public void run() {
	LOGGER.finer("Reading file " + codeFile.getAbsolutePath());
	try {
		addRelevantNGrams(tokenizer.getTokenListFromCode(codeFile));
	} catch (final IOException ioe) {
		LOGGER.warning(ExceptionUtils.getFullStackTrace(ioe));
	}
}
 
Example #6
Source File: AbstractJavaNameBindingsExtractor.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Computes the token name bindings for an AST node and the file it belongs
 * to.
 * 
 * The node bindings come from the single-argument
 * {@code getNameBindings(node)}. The token positions come from
 * {@code tokenizer.tokenListWithPos(file)} with each value mapped through
 * {@code FullToken.TOKEN_NAME_CONVERTER}. Both are handed to
 * {@code getTokenBindings}.
 * 
 * @param node
 *            the AST node whose bindings are extracted
 * @param file
 *            the file that is tokenized to obtain token positions
 * @return the result of {@code getTokenBindings}
 * @throws IOException
 *             declared by this method; presumably raised when the file
 *             cannot be tokenized
 */
public final List<TokenNameBinding> getNameBindings(final ASTNode node,
		final File file) throws IOException {
	final Set<Set<ASTNode>> boundNodeSets = getNameBindings(node);

	final SortedMap<Integer, String> tokenNamesByPosition = Maps
			.transformValues(tokenizer.tokenListWithPos(file),
					FullToken.TOKEN_NAME_CONVERTER);

	return getTokenBindings(tokenNamesByPosition, boundNodeSets);
}
 
Example #7
Source File: AbstractJavaNameBindingsExtractor.java    From api-mining with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Returns the token-level name bindings of {@code node}, using
 * {@code file} as the source of token positions.
 * 
 * @param node
 *            AST node passed to the single-argument
 *            {@code getNameBindings}
 * @param file
 *            file passed to {@code tokenizer.tokenListWithPos}
 * @return whatever {@code getTokenBindings} produces for the converted
 *         token positions and the node bindings
 * @throws IOException
 *             declared by this method; presumably propagated from the
 *             tokenizer
 */
public final List<TokenNameBinding> getNameBindings(final ASTNode node,
		final File file) throws IOException {
	// Resolve the AST side first, then the token side.
	final Set<Set<ASTNode>> astBindings = getNameBindings(node);
	final SortedMap<Integer, String> positionedTokenNames = Maps
			.transformValues(tokenizer.tokenListWithPos(file),
					FullToken.TOKEN_NAME_CONVERTER);
	return getTokenBindings(positionedTokenNames, astBindings);
}
 
Example #8
Source File: TokenizerUtils.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Writes the tokens into a buffer as one space-separated run. A space is
 * written after every token, including the last one.
 * 
 * @param tokens
 *            the tokens to write; only their {@code token} field is used
 * @param sb
 *            the destination buffer, appended to in place
 * @return {@code sb}, to allow call chaining
 */
public final static StringBuffer joinFullTokens(
		final List<FullToken> tokens, final StringBuffer sb) {
	final String separator = " ";
	for (final FullToken current : tokens) {
		sb.append(current.token);
		sb.append(separator);
	}
	return sb;
}
 
Example #9
Source File: TokenizerUtils.java    From tassal with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Remove the sentence start/end FullTokens.
 * 
 * The list must begin with a token equal to
 * {@link ITokenizer#SENTENCE_START} and end with a token equal to
 * {@link ITokenizer#SENTENCE_END}. Validation happens before any element
 * is removed, so the list is only modified when both markers are present.
 * 
 * @param tokenSequence
 *            the token list to strip in place; at least two elements are
 *            required
 * @throws IllegalArgumentException
 *             if the list holds fewer than two elements or does not start
 *             and end with the expected markers (assuming
 *             {@code checkArgument} is Guava's Preconditions method)
 */
public static final void removeSentenceStartEndFullTokens(
		final List<FullToken> tokenSequence) {
	// Guard against empty and single-element lists, which would otherwise
	// fail with an index error instead of an argument error.
	checkArgument(tokenSequence.size() >= 2,
			"Token sequence must contain sentence start and end tokens");
	final int endIndex = tokenSequence.size() - 1;

	final FullToken first = tokenSequence.get(0);
	final FullToken last = tokenSequence.get(endIndex);
	checkArgument(first.token.equals(ITokenizer.SENTENCE_START));
	checkArgument(last.token.equals(ITokenizer.SENTENCE_END));

	// Both markers verified: remove the end first so endIndex stays valid.
	tokenSequence.remove(endIndex);
	tokenSequence.remove(0);
}
 
Example #10
Source File: TokenizerUtils.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Concatenates the token texts onto {@code sb}, putting one space after
 * each token. No attempt is made to trim the final space.
 * 
 * @param tokens
 *            the tokens to concatenate, read in iteration order
 * @param sb
 *            the buffer receiving the text; mutated by this call
 * @return the buffer passed in as {@code sb}
 */
public final static StringBuffer joinFullTokens(
		final List<FullToken> tokens, final StringBuffer sb) {
	for (final FullToken item : tokens) {
		sb.append(item.token).append(' ');
	}

	return sb;
}
 
Example #11
Source File: AbstractJavaNameBindingsExtractor.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Builds the list of token name bindings for the given AST node.
 * 
 * Token positions are read from {@code file} through
 * {@code tokenizer.tokenListWithPos}, and their values are converted with
 * {@code FullToken.TOKEN_NAME_CONVERTER}. They are then combined with the
 * node-level bindings by {@code getTokenBindings}.
 * 
 * @param node
 *            the AST node to analyse
 * @param file
 *            the file supplying the token positions
 * @return the bindings returned by {@code getTokenBindings}
 * @throws IOException
 *             declared by this method; presumably thrown while reading
 *             {@code file}
 */
public final List<TokenNameBinding> getNameBindings(final ASTNode node,
		final File file) throws IOException {
	final Set<Set<ASTNode>> bindingsPerName = getNameBindings(node);
	final SortedMap<Integer, String> namesAtPositions = Maps
			.transformValues(tokenizer.tokenListWithPos(file),
					FullToken.TOKEN_NAME_CONVERTER);

	return getTokenBindings(namesAtPositions, bindingsPerName);
}
 
Example #12
Source File: TokenizerTUI.java    From tassal with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Command-line entry point that prints the tokens of one file or of every
 * matching file in a directory.
 * 
 * Expected arguments: {@code <codeDir> <TokenizerClass> [TokenizerArgs]}.
 * With fewer than two arguments a usage line is written to standard error
 * and the method returns.
 * 
 * @param args
 *            the command-line arguments described above
 */
public static void main(final String[] args) throws InvalidInputException,
		IOException, InstantiationException, IllegalAccessException,
		ClassNotFoundException, IllegalArgumentException,
		SecurityException, InvocationTargetException, NoSuchMethodException {
	if (args.length < 2) {
		System.err
				.println("Usage <codeDir> <TokenizerClass> [TokenizerArgs]");
		return;
	}

	// The optional third argument is forwarded to the tokenizer factory.
	final String tokenizerClass = args[1];
	final ITokenizer tok = args.length == 2
			? TokenizerUtils.tokenizerForClass(tokenizerClass)
			: TokenizerUtils.tokenizerForClass(tokenizerClass, args[2]);

	// A directory is searched with the tokenizer's file filter; anything
	// else is treated as a single input file.
	final File baseFile = new File(args[0]);
	final Collection<File> allFiles;
	if (!baseFile.isDirectory()) {
		allFiles = Lists.newArrayList(baseFile);
	} else {
		allFiles = FileUtils.listFiles(baseFile, tok.getFileFilter(),
				DirectoryFileFilter.DIRECTORY);
	}

	for (final File sourceFile : allFiles) {
		// One token per line, then a blank line between files.
		final StringBuffer dump = new StringBuffer();
		for (final FullToken token : tok.getTokenListFromCode(sourceFile)) {
			dump.append(token).append(
					System.getProperty("line.separator"));
		}

		System.out.println(dump.toString());
		System.out.println();
	}
}
 
Example #13
Source File: CodePrinter.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Renders the contents of {@code codeFile} as an HTML document in which
 * every token is wrapped in a coloured {@code <span>}.
 * 
 * The colours are taken from {@code coloredTokens}, indexed by the token's
 * position in the tokenizer output. The list is therefore expected to line
 * up one-to-one with the actual tokens. The first tokenizer entry and any
 * entry keyed {@code Integer.MAX_VALUE} are skipped (presumably the
 * sentence start/end markers) but still count towards that index.
 * 
 * @param coloredTokens
 *            the colour/style information for each token
 * @param codeFile
 *            the file whose code is rendered
 * @return a buffer holding the complete HTML document
 */
public StringBuffer getHTMLwithColors(
		final List<ColoredToken> coloredTokens, final File codeFile)
		throws IOException, InstantiationException, IllegalAccessException {
	final String code = FileUtils.readFileToString(codeFile);
	lineNumber = 1;

	final StringBuffer html = new StringBuffer();

	final SortedMap<Integer, FullToken> toks = tokenizer
			.fullTokenListWithPos(code.toCharArray());

	// Document head and opening body tag.
	html.append("<html>\n<head>\n<link href='http://fonts.googleapis.com/css?family=Source+Code+Pro:300,400,500,600,700,800,900' rel='stylesheet' type='text/css'>\n");
	html.append(CSS_STYLE);
	html.append("</head>\n<body style='background-color:rgb(")
			.append(documentBackgroundColor.getRed()).append(",")
			.append(documentBackgroundColor.getGreen()).append(",")
			.append(documentBackgroundColor.getBlue()).append(")'>");
	appendLineDiv(html, false);

	int nextIndex = 0;
	int prevPos = 0;
	for (final Entry<Integer, FullToken> entry : toks.entrySet()) {
		// Every entry consumes one index, including the skipped ones.
		final int index = nextIndex++;
		if (index == 0 || entry.getKey() == Integer.MAX_VALUE) {
			continue;
		}

		// Emit the text between the previous token and this one.
		addSlack(code.substring(prevPos, entry.getKey()), html);
		final ColoredToken tok = coloredTokens.get(index);

		html.append("<span style='background-color:rgba(")
				.append(tok.bgColor.getRed()).append(",")
				.append(tok.bgColor.getGreen()).append(",")
				.append(tok.bgColor.getBlue()).append(",")
				.append(ignoreTokBG ? "0" : "1")
				.append("); color:rgb(")
				.append(tok.fontColor.getRed()).append(",")
				.append(tok.fontColor.getGreen()).append(",")
				.append(tok.fontColor.getBlue())
				.append("); " + tok.extraStyle + "'>")
				.append(StringEscapeUtils.escapeHtml(entry.getValue().token))
				.append("</span>");

		prevPos = entry.getKey() + entry.getValue().token.length();
	}
	html.append("</div></body></html>");
	return html;
}
 
Example #14
Source File: TokenizerTUI.java    From codemining-core with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Prints, for each input file, the tokens produced by the chosen tokenizer.
 * 
 * Usage: {@code <codeDir> <TokenizerClass> [TokenizerArgs]}. If fewer than
 * two arguments are supplied, the usage text goes to standard error and
 * nothing else happens.
 * 
 * @param args
 *            the path to a file or directory, the tokenizer class name,
 *            and optionally the tokenizer arguments
 */
public static void main(final String[] args) throws InvalidInputException,
		IOException, InstantiationException, IllegalAccessException,
		ClassNotFoundException, IllegalArgumentException,
		SecurityException, InvocationTargetException, NoSuchMethodException {
	final boolean hasRequiredArguments = args.length >= 2;
	if (!hasRequiredArguments) {
		System.err
				.println("Usage <codeDir> <TokenizerClass> [TokenizerArgs]");
		return;
	}

	final String tokenizerClass = args[1];
	final ITokenizer tok;
	if (args.length > 2) {
		// Extra argument present: pass it through to the tokenizer factory.
		tok = TokenizerUtils.tokenizerForClass(tokenizerClass, args[2]);
	} else {
		tok = TokenizerUtils.tokenizerForClass(tokenizerClass);
	}

	final File baseFile = new File(args[0]);
	final Collection<File> allFiles;
	if (baseFile.isDirectory()) {
		// Walk the directory, keeping the files the tokenizer accepts.
		allFiles = FileUtils.listFiles(baseFile, tok.getFileFilter(),
				DirectoryFileFilter.DIRECTORY);
	} else {
		allFiles = Lists.newArrayList(baseFile);
	}

	for (final File input : allFiles) {
		final StringBuffer tokenLines = new StringBuffer();
		for (final FullToken token : tok.getTokenListFromCode(input)) {
			tokenLines.append(token);
			tokenLines.append(System.getProperty("line.separator"));
		}

		// The token listing, followed by an empty separator line.
		System.out.println(tokenLines.toString());
		System.out.println();
	}
}
 
Example #15
Source File: CodePrinter.java    From tassal with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Produces an HTML page showing the code in {@code codeFile} with each
 * token coloured according to {@code coloredTokens}.
 * 
 * {@code coloredTokens} is indexed by the running position of the token in
 * the tokenizer's output, so it should correspond one-to-one with the
 * actual tokens. The first entry and entries whose key is
 * {@code Integer.MAX_VALUE} are not rendered (presumably sentence start/end
 * markers), yet they still advance the index.
 * 
 * @param coloredTokens
 *            the per-token background colour, font colour and extra style
 * @param codeFile
 *            the source file to render
 * @return the generated HTML as a {@link StringBuffer}
 */
public StringBuffer getHTMLwithColors(
		final List<ColoredToken> coloredTokens, final File codeFile)
		throws IOException, InstantiationException, IllegalAccessException {
	final String code = FileUtils.readFileToString(codeFile);
	lineNumber = 1;

	final StringBuffer buf = new StringBuffer();

	final SortedMap<Integer, FullToken> toks = tokenizer
			.fullTokenListWithPos(code.toCharArray());

	int tokenIndex = 0;
	int prevPos = 0;
	buf.append("<html>\n<head>\n<link href='http://fonts.googleapis.com/css?family=Source+Code+Pro:300,400,500,600,700,800,900' rel='stylesheet' type='text/css'>\n");
	buf.append(CSS_STYLE);
	final String pageBackground = "rgb("
			+ documentBackgroundColor.getRed() + ","
			+ documentBackgroundColor.getGreen() + ","
			+ documentBackgroundColor.getBlue() + ")";
	buf.append("</head>\n<body style='background-color:" + pageBackground
			+ "'>");
	appendLineDiv(buf, false);
	for (final Entry<Integer, FullToken> entry : toks.entrySet()) {
		final boolean skipped = tokenIndex == 0
				|| entry.getKey() == Integer.MAX_VALUE;
		if (!skipped) {
			final int tokenStart = entry.getKey();
			final String tokenText = entry.getValue().token;

			// Text between the previous token and this one.
			addSlack(code.substring(prevPos, tokenStart), buf);
			final ColoredToken tok = coloredTokens.get(tokenIndex);

			// The alpha channel is 0 when token backgrounds are ignored.
			final String background = "rgba(" + tok.bgColor.getRed() + ","
					+ tok.bgColor.getGreen() + "," + tok.bgColor.getBlue()
					+ "," + (ignoreTokBG ? "0" : "1") + ")";
			final String foreground = "rgb(" + tok.fontColor.getRed() + ","
					+ tok.fontColor.getGreen() + ","
					+ tok.fontColor.getBlue() + ")";

			buf.append("<span style='background-color:" + background
					+ "; color:" + foreground + "; " + tok.extraStyle
					+ "'>" + StringEscapeUtils.escapeHtml(tokenText)
					+ "</span>");
			prevPos = tokenStart + tokenText.length();
		}
		// Skipped entries consume an index too.
		tokenIndex++;
	}
	buf.append("</div></body></html>");
	return buf;
}
 
Example #16
Source File: CodePrinter.java    From api-mining with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Builds an HTML rendering of {@code codeFile} in which each token is
 * emitted inside a {@code <span>} styled from the matching entry of
 * {@code coloredTokens}.
 * 
 * Entries of {@code coloredTokens} are looked up by the token's ordinal in
 * the tokenizer output, so the list should be in one-to-one correspondence
 * with the actual tokens. The very first tokenizer entry and entries keyed
 * {@code Integer.MAX_VALUE} produce no output (presumably they are the
 * sentence start/end markers), but the ordinal still advances past them.
 * 
 * @param coloredTokens
 *            the styling information, one element per token
 * @param codeFile
 *            the file to read and render
 * @return the HTML document accumulated in a {@link StringBuffer}
 */
public StringBuffer getHTMLwithColors(
		final List<ColoredToken> coloredTokens, final File codeFile)
		throws IOException, InstantiationException, IllegalAccessException {
	final String code = FileUtils.readFileToString(codeFile);
	lineNumber = 1;

	final StringBuffer out = new StringBuffer();

	final SortedMap<Integer, FullToken> toks = tokenizer
			.fullTokenListWithPos(code.toCharArray());

	out.append("<html>\n<head>\n<link href='http://fonts.googleapis.com/css?family=Source+Code+Pro:300,400,500,600,700,800,900' rel='stylesheet' type='text/css'>\n");
	out.append(CSS_STYLE);
	out.append("</head>\n<body style='background-color:rgb(");
	out.append(documentBackgroundColor.getRed()).append(",");
	out.append(documentBackgroundColor.getGreen()).append(",");
	out.append(documentBackgroundColor.getBlue()).append(")'>");
	appendLineDiv(out, false);

	int ordinal = 0;
	int prevPos = 0;
	for (final Entry<Integer, FullToken> entry : toks.entrySet()) {
		final int current = ordinal;
		ordinal++;

		// Guard clauses: first entry, then end-of-input sentinel key.
		if (current == 0) {
			continue;
		}
		if (entry.getKey() == Integer.MAX_VALUE) {
			continue;
		}

		addSlack(code.substring(prevPos, entry.getKey()), out);
		final ColoredToken tok = coloredTokens.get(current);

		// Background colour; alpha depends on ignoreTokBG.
		out.append("<span style='background-color:rgba(");
		out.append(tok.bgColor.getRed()).append(",");
		out.append(tok.bgColor.getGreen()).append(",");
		out.append(tok.bgColor.getBlue()).append(",");
		out.append(ignoreTokBG ? "0" : "1");
		// Font colour.
		out.append("); color:rgb(");
		out.append(tok.fontColor.getRed()).append(",");
		out.append(tok.fontColor.getGreen()).append(",");
		out.append(tok.fontColor.getBlue());
		// Extra style, then the escaped token text.
		out.append("); " + tok.extraStyle + "'>");
		out.append(StringEscapeUtils.escapeHtml(entry.getValue().token));
		out.append("</span>");

		prevPos = entry.getKey() + entry.getValue().token.length();
	}
	out.append("</div></body></html>");
	return out;
}
 
Example #17
Source File: TokenizerTUI.java    From api-mining with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Text UI for a tokenizer: tokenizes the given file, or every accepted file
 * below the given directory, and writes the tokens to standard output one
 * per line.
 * 
 * Arguments: {@code <codeDir> <TokenizerClass> [TokenizerArgs]}. When fewer
 * than two are given, only the usage line is printed to standard error.
 * 
 * @param args
 *            the command-line arguments
 */
public static void main(final String[] args) throws InvalidInputException,
		IOException, InstantiationException, IllegalAccessException,
		ClassNotFoundException, IllegalArgumentException,
		SecurityException, InvocationTargetException, NoSuchMethodException {
	if (args.length < 2) {
		System.err
				.println("Usage <codeDir> <TokenizerClass> [TokenizerArgs]");
		return;
	}

	final String tokenizerClass = args[1];
	final boolean noTokenizerArguments = args.length == 2;
	final ITokenizer tok;
	if (noTokenizerArguments) {
		tok = TokenizerUtils.tokenizerForClass(tokenizerClass);
	} else {
		final String tokenizerArguments = args[2];
		tok = TokenizerUtils.tokenizerForClass(tokenizerClass,
				tokenizerArguments);
	}

	// Either enumerate the directory through the tokenizer's filter or
	// wrap the single file in a list.
	final File baseFile = new File(args[0]);
	final Collection<File> allFiles;
	if (baseFile.isDirectory()) {
		allFiles = FileUtils.listFiles(baseFile, tok.getFileFilter(),
				DirectoryFileFilter.DIRECTORY);
	} else {
		allFiles = Lists.newArrayList(baseFile);
	}

	for (final File codeFile : allFiles) {
		final StringBuffer listing = new StringBuffer();
		for (final FullToken token : tok.getTokenListFromCode(codeFile)) {
			listing.append(token).append(
					System.getProperty("line.separator"));
		}

		// Print the listing, then a blank line before the next file.
		System.out.println(listing.toString());
		System.out.println();
	}
}