package codemining.java.tokenizers; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.SortedMap; import java.util.logging.Logger; import org.apache.commons.io.FileUtils; import org.apache.commons.io.filefilter.AbstractFileFilter; import org.apache.commons.io.filefilter.RegexFileFilter; import org.apache.commons.lang.exception.ExceptionUtils; import org.eclipse.jdt.core.compiler.ITerminalSymbols; import org.eclipse.jdt.core.compiler.InvalidInputException; import org.eclipse.jdt.internal.core.util.PublicScanner; import codemining.languagetools.ITokenizer; import com.google.common.collect.Lists; import com.google.common.collect.Maps; /** * A Java Code tokenizer using Eclipse JDT. * * @author Miltos Allamanis <[email protected]> * */ public class JavaTokenizer implements ITokenizer { private static final long serialVersionUID = 505587999946057082L; private static final Logger LOGGER = Logger.getLogger(JavaTokenizer.class .getName()); /** * A filter for the files being tokenized. */ public static final RegexFileFilter javaCodeFileFilter = new RegexFileFilter( ".*\\.java$"); private final boolean tokenizeComments; public static final String IDENTIFIER_ID = Integer .toString(ITerminalSymbols.TokenNameIdentifier); public static final String[] KEYWORD_TYPE_IDs = new String[] { Integer.toString(ITerminalSymbols.TokenNameboolean), Integer.toString(ITerminalSymbols.TokenNamebyte), Integer.toString(ITerminalSymbols.TokenNamechar), Integer.toString(ITerminalSymbols.TokenNamedouble), Integer.toString(ITerminalSymbols.TokenNamefloat), Integer.toString(ITerminalSymbols.TokenNameint), Integer.toString(ITerminalSymbols.TokenNamelong), Integer.toString(ITerminalSymbols.TokenNameshort), Integer.toString(ITerminalSymbols.TokenNamevoid) }; public static final String[] STRING_LITERAL_IDs = new String[] { Integer.toString(ITerminalSymbols.TokenNameStringLiteral), Integer.toString(ITerminalSymbols.TokenNameCharacterLiteral) }; public static final String[] NUMBER_LITERAL_IDs = new String[] { Integer.toString(ITerminalSymbols.TokenNameDoubleLiteral), Integer.toString(ITerminalSymbols.TokenNameFloatingPointLiteral), Integer.toString(ITerminalSymbols.TokenNameIntegerLiteral), Integer.toString(ITerminalSymbols.TokenNameLongLiteral) }; public static final String[] COMMENT_IDs = new String[] { Integer.toString(ITerminalSymbols.TokenNameCOMMENT_BLOCK), Integer.toString(ITerminalSymbols.TokenNameCOMMENT_JAVADOC), Integer.toString(ITerminalSymbols.TokenNameCOMMENT_LINE) }; public static final String[] OPERATOR_IDs = new String[] { Integer.toString(ITerminalSymbols.TokenNameAND), Integer.toString(ITerminalSymbols.TokenNameAND_AND), Integer.toString(ITerminalSymbols.TokenNameAND_EQUAL), Integer.toString(ITerminalSymbols.TokenNameCOLON), Integer.toString(ITerminalSymbols.TokenNameCOMMA), Integer.toString(ITerminalSymbols.TokenNameDIVIDE), Integer.toString(ITerminalSymbols.TokenNameDIVIDE_EQUAL), Integer.toString(ITerminalSymbols.TokenNameDOT), Integer.toString(ITerminalSymbols.TokenNameELLIPSIS), Integer.toString(ITerminalSymbols.TokenNameEQUAL), Integer.toString(ITerminalSymbols.TokenNameEQUAL_EQUAL), Integer.toString(ITerminalSymbols.TokenNameGREATER), Integer.toString(ITerminalSymbols.TokenNameGREATER_EQUAL), Integer.toString(ITerminalSymbols.TokenNameLBRACKET), Integer.toString(ITerminalSymbols.TokenNameLEFT_SHIFT), Integer.toString(ITerminalSymbols.TokenNameLEFT_SHIFT_EQUAL), Integer.toString(ITerminalSymbols.TokenNameLESS), Integer.toString(ITerminalSymbols.TokenNameLESS_EQUAL), Integer.toString(ITerminalSymbols.TokenNameLPAREN), Integer.toString(ITerminalSymbols.TokenNameMINUS), Integer.toString(ITerminalSymbols.TokenNameMINUS_EQUAL), Integer.toString(ITerminalSymbols.TokenNameMINUS_MINUS), Integer.toString(ITerminalSymbols.TokenNameMULTIPLY), Integer.toString(ITerminalSymbols.TokenNameMULTIPLY_EQUAL), Integer.toString(ITerminalSymbols.TokenNameNOT), Integer.toString(ITerminalSymbols.TokenNameNOT_EQUAL), Integer.toString(ITerminalSymbols.TokenNameOR), Integer.toString(ITerminalSymbols.TokenNameOR_EQUAL), Integer.toString(ITerminalSymbols.TokenNameOR_OR), Integer.toString(ITerminalSymbols.TokenNamePLUS), Integer.toString(ITerminalSymbols.TokenNamePLUS_EQUAL), Integer.toString(ITerminalSymbols.TokenNamePLUS_PLUS), Integer.toString(ITerminalSymbols.TokenNameQUESTION), Integer.toString(ITerminalSymbols.TokenNameRBRACKET), Integer.toString(ITerminalSymbols.TokenNameREMAINDER), Integer.toString(ITerminalSymbols.TokenNameREMAINDER_EQUAL), Integer.toString(ITerminalSymbols.TokenNameRIGHT_SHIFT), Integer.toString(ITerminalSymbols.TokenNameRIGHT_SHIFT_EQUAL), Integer.toString(ITerminalSymbols.TokenNameRPAREN), Integer.toString(ITerminalSymbols.TokenNameSEMICOLON), Integer.toString(ITerminalSymbols.TokenNameTWIDDLE), Integer.toString(ITerminalSymbols.TokenNameUNSIGNED_RIGHT_SHIFT), Integer.toString(ITerminalSymbols.TokenNameUNSIGNED_RIGHT_SHIFT_EQUAL), Integer.toString(ITerminalSymbols.TokenNameXOR), Integer.toString(ITerminalSymbols.TokenNameXOR_EQUAL) }; public static final String[] BRACE_IDs = new String[] { Integer.toString(ITerminalSymbols.TokenNameLBRACE), Integer.toString(ITerminalSymbols.TokenNameRBRACE), }; public static final String[] SYNTAX_IDs = { Integer.toString(ITerminalSymbols.TokenNameCOMMA), Integer.toString(ITerminalSymbols.TokenNameDOT), Integer.toString(ITerminalSymbols.TokenNameELLIPSIS), Integer.toString(ITerminalSymbols.TokenNameSEMICOLON), Integer.toString(ITerminalSymbols.TokenNameLBRACE), Integer.toString(ITerminalSymbols.TokenNameRBRACE), Integer.toString(ITerminalSymbols.TokenNameLPAREN), Integer.toString(ITerminalSymbols.TokenNameRPAREN), Integer.toString(ITerminalSymbols.TokenNameLBRACKET), Integer.toString(ITerminalSymbols.TokenNameRBRACKET) }; public JavaTokenizer() { tokenizeComments = false; } public JavaTokenizer(final boolean tokenizeComments) { this.tokenizeComments = tokenizeComments; } @Override public SortedMap<Integer, FullToken> fullTokenListWithPos(final char[] code) { // TODO Duplicate Code final PublicScanner scanner = prepareScanner(); final SortedMap<Integer, FullToken> tokens = Maps.newTreeMap(); tokens.put(-1, new FullToken(SENTENCE_START, SENTENCE_START)); tokens.put(Integer.MAX_VALUE, new FullToken(SENTENCE_END, SENTENCE_END)); scanner.setSource(code); while (!scanner.atEnd()) { do { try { final int token = scanner.getNextToken(); if (token == ITerminalSymbols.TokenNameEOF) { break; } final String nxtToken = transformToken(token, scanner.getCurrentTokenString()); final int position = scanner.getCurrentTokenStartPosition(); tokens.put(position, new FullToken(nxtToken, Integer.toString(token))); } catch (final InvalidInputException e) { LOGGER.warning(ExceptionUtils.getFullStackTrace(e)); } } while (!scanner.atEnd()); } return tokens; } @Override public AbstractFileFilter getFileFilter() { return javaCodeFileFilter; } @Override public String getIdentifierType() { return IDENTIFIER_ID; } @Override public Collection<String> getKeywordTypes() { return Arrays.asList(KEYWORD_TYPE_IDs); } @Override public Collection<String> getLiteralTypes() { final List<String> allLiterals = Lists.newArrayList(Arrays .asList(NUMBER_LITERAL_IDs)); allLiterals.addAll(Arrays.asList(STRING_LITERAL_IDs)); return allLiterals; } @Override public FullToken getTokenFromString(final String token) { if (token.equals(ITokenizer.SENTENCE_START)) { return new FullToken(ITokenizer.SENTENCE_START, ITokenizer.SENTENCE_START); } if (token.equals(ITokenizer.SENTENCE_END)) { return new FullToken(ITokenizer.SENTENCE_END, ITokenizer.SENTENCE_END); } return getTokenListFromCode(token.toCharArray()).get(1); } @Override public List<FullToken> getTokenListFromCode(final char[] code) { final List<FullToken> tokens = Lists.newArrayList(); tokens.add(new FullToken(SENTENCE_START, SENTENCE_START)); final PublicScanner scanner = prepareScanner(); scanner.setSource(code); do { try { final int token = scanner.getNextToken(); if (token == ITerminalSymbols.TokenNameEOF) { break; } final String nxtToken = transformToken(token, scanner.getCurrentTokenString()); tokens.add(new FullToken(stripTokenIfNeeded(nxtToken), Integer .toString(token))); } catch (final InvalidInputException e) { LOGGER.warning(ExceptionUtils.getFullStackTrace(e)); } catch (final StringIndexOutOfBoundsException e) { LOGGER.warning(ExceptionUtils.getFullStackTrace(e)); } } while (!scanner.atEnd()); tokens.add(new FullToken(SENTENCE_END, SENTENCE_END)); return tokens; } @Override public List<FullToken> getTokenListFromCode(final File codeFile) throws IOException { return getTokenListFromCode(FileUtils.readFileToString(codeFile) .toCharArray()); } /** * Create the scanner. * * @return */ protected PublicScanner prepareScanner() { final PublicScanner scanner = new PublicScanner(); scanner.tokenizeComments = tokenizeComments; return scanner; } /** * @param token * @return */ protected String stripTokenIfNeeded(final String token) { return token.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ') .replace("\n", " ").replace("\t", " ").replace("\r", " ") .replace("\'\\\\\'", "\'|\'").replace("\\", "|"); } /* * (non-Javadoc) * * @see uk.ac.ed.inf.javacodeutils.ITokenizer#tokenListFromCode(char[]) */ @Override public List<String> tokenListFromCode(final char[] code) { final PublicScanner scanner = prepareScanner(); final List<String> tokens = Lists.newArrayList(); tokens.add(SENTENCE_START); scanner.setSource(code); do { try { final int token = scanner.getNextToken(); if (token == ITerminalSymbols.TokenNameEOF) { break; } final String nxtToken = transformToken(token, scanner.getCurrentTokenString()); tokens.add(stripTokenIfNeeded(nxtToken)); } catch (final InvalidInputException e) { LOGGER.warning(ExceptionUtils.getFullStackTrace(e)); } catch (final StringIndexOutOfBoundsException e) { LOGGER.warning(ExceptionUtils.getFullStackTrace(e)); } } while (!scanner.atEnd()); tokens.add(SENTENCE_END); return tokens; } @Override public List<String> tokenListFromCode(final File codeFile) throws IOException { return tokenListFromCode(FileUtils.readFileToString(codeFile) .toCharArray()); } /* * (non-Javadoc) * * @see uk.ac.ed.inf.javacodeutils.ITokenizer#tokenListWithPos(char[]) */ @Override public SortedMap<Integer, String> tokenListWithPos(final char[] code) { final PublicScanner scanner = prepareScanner(); final SortedMap<Integer, String> tokens = Maps.newTreeMap(); tokens.put(-1, SENTENCE_START); tokens.put(Integer.MAX_VALUE, SENTENCE_END); scanner.setSource(code); while (!scanner.atEnd()) { do { try { final int token = scanner.getNextToken(); if (token == ITerminalSymbols.TokenNameEOF) { break; } final String nxtToken = transformToken(token, scanner.getCurrentTokenString()); final int position = scanner.getCurrentTokenStartPosition(); tokens.put(position, stripTokenIfNeeded(nxtToken)); } catch (final InvalidInputException e) { LOGGER.warning(ExceptionUtils.getFullStackTrace(e)); } } while (!scanner.atEnd()); } return tokens; } @Override public SortedMap<Integer, FullToken> tokenListWithPos(final File file) throws IOException { return fullTokenListWithPos(FileUtils.readFileToString(file) .toCharArray()); } /** * Function used to transform the tokens. Useful when overriding some tokens * in subclasses. * * @param tokenType * @param token * @return */ protected String transformToken(final int tokenType, final String token) { return token; } }