package codemining.python.codeutils;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.SortedMap;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.AbstractFileFilter;
import org.apache.commons.io.filefilter.RegexFileFilter;
import org.apache.commons.lang.NotImplementedException;
import org.python.pydev.parser.grammarcommon.ITokenManager;
import org.python.pydev.parser.jython.FastCharStream;
import org.python.pydev.parser.jython.Token;

import codemining.languagetools.ITokenizer;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * An abstract python tokenizer using the PyDev interface.
 *
 * Subclasses supply the concrete PyDev token manager through
 * {@link #getPythonTokenizer(FastCharStream)}; every tokenizing method in this
 * class pulls tokens from that manager until a token of kind {@code 0} is
 * returned, dropping the tokens rejected by {@link #shouldAdd(Token)}.
 *
 * @author Miltos Allamanis <[email protected]>
 *
 */
public abstract class AbstractPythonTokenizer implements ITokenizer {

    private static final long serialVersionUID = 5009530263783901964L;

    /**
     * A filter for the files being tokenized.
     */
    private static final RegexFileFilter pythonCodeFilter = new RegexFileFilter(
            ".*\\.py$");

    /**
     * Token kind that ends every tokenizing loop in this class (presumably the
     * grammar's end-of-file kind -- TODO confirm against the PyDev grammar
     * constants).
     */
    private static final int END_OF_STREAM_KIND = 0;

    /**
     * Multiplier applied to the line number when building a position key.
     */
    private static final int POSITION_LINE_STRIDE = 500;

    public AbstractPythonTokenizer() {
        super();
    }

    /**
     * Tokenize the code into a map from position key to token.
     *
     * @param code
     *            the source text to tokenize
     * @return the kept tokens (image plus kind rendered as a decimal string),
     *         sorted by the position key described at
     *         {@link #positionKey(Token)}; a later token with the same key
     *         replaces the earlier one
     */
    @Override
    public SortedMap<Integer, FullToken> fullTokenListWithPos(final char[] code) {
        final ITokenManager mng = openTokenStream(code);
        final SortedMap<Integer, FullToken> tokens = Maps.newTreeMap();
        for (Token tok = nextKeptToken(mng); tok.kind != END_OF_STREAM_KIND; tok = nextKeptToken(mng)) {
            tokens.put(positionKey(tok), toFullToken(tok));
        }
        return tokens;
    }

    /**
     * @return the filter accepting paths that end in {@code .py}
     */
    @Override
    public AbstractFileFilter getFileFilter() {
        return pythonCodeFilter;
    }

    /**
     * @return the token type reported for identifiers, as a string
     */
    @Override
    public String getIdentifierType() {
        // TODO: derive this from the grammar constants instead of hard coding
        // it (presumably the PyDev kind for names -- confirm).
        return "92";
    }

    /*
     * (non-Javadoc)
     *
     * Not supported: always throws NotImplementedException.
     *
     * @see codemining.languagetools.ITokenizer#getKeywordTypes()
     */
    @Override
    public Collection<String> getKeywordTypes() {
        throw new NotImplementedException();
    }

    /*
     * (non-Javadoc)
     *
     * Not supported: always throws NotImplementedException.
     *
     * @see codemining.languagetools.ITokenizer#getLiteralTypes()
     */
    @Override
    public Collection<String> getLiteralTypes() {
        throw new NotImplementedException();
    }

    /**
     * Create the PyDev token manager that reads from the given stream.
     *
     * @param stream
     *            the character stream wrapping the code to tokenize
     * @return the token manager used by all tokenizing methods of this class
     */
    public abstract ITokenManager getPythonTokenizer(final FastCharStream stream);

    /**
     * Tokenize a string and return only its first token.
     *
     * No filtering through {@link #shouldAdd(Token)} is applied, and the first
     * token is returned even when its kind is {@code 0}.
     *
     * @param token
     *            the text to tokenize
     * @return the first token produced for the text
     */
    @Override
    public FullToken getTokenFromString(final String token) {
        final ITokenManager mng = openTokenStream(token.toCharArray());
        return toFullToken(mng.getNextToken());
    }

    /**
     * Tokenize the code into a list of tokens with their kinds.
     *
     * @param code
     *            the source text to tokenize
     * @return the kept tokens, in the order the token manager produced them
     */
    @Override
    public List<FullToken> getTokenListFromCode(final char[] code) {
        final ITokenManager mng = openTokenStream(code);
        final List<FullToken> tokens = Lists.newArrayList();
        for (Token tok = nextKeptToken(mng); tok.kind != END_OF_STREAM_KIND; tok = nextKeptToken(mng)) {
            tokens.add(toFullToken(tok));
        }
        return tokens;
    }

    /**
     * Read a file and tokenize its content; see
     * {@link #getTokenListFromCode(char[])}.
     *
     * @param codeFile
     *            the file to read
     * @return the kept tokens of the file
     * @throws IOException
     *             if the file cannot be read
     */
    @Override
    public List<FullToken> getTokenListFromCode(final File codeFile)
            throws IOException {
        // NOTE(review): no charset is passed, so the decoding is whatever the
        // single-argument readFileToString defaults to -- confirm this is
        // intended for non-ASCII sources.
        return getTokenListFromCode(FileUtils.readFileToString(codeFile)
                .toCharArray());
    }

    /**
     * Decide whether a token is kept in the output of the tokenizing methods.
     *
     * @param nextToken
     *            the token to test
     * @return {@code false} for tokens of kind 6, 13, 14 or 115, {@code true}
     *         for every other kind
     */
    public boolean shouldAdd(final Token nextToken) {
        // disallow whitespace, indent and docstrings
        // (which numeric kind is which is not visible here -- TODO replace the
        // literals with the PyDev grammar constants)
        return nextToken.kind != 6 && nextToken.kind != 14
                && nextToken.kind != 13 && nextToken.kind != 115;
    }

    /**
     * Tokenize the code into a list of token images.
     *
     * @param code
     *            the source text to tokenize
     * @return the images of the kept tokens, in the order the token manager
     *         produced them
     */
    @Override
    public List<String> tokenListFromCode(final char[] code) {
        final ITokenManager mng = openTokenStream(code);
        final List<String> tokens = Lists.newArrayList();
        for (Token tok = nextKeptToken(mng); tok.kind != END_OF_STREAM_KIND; tok = nextKeptToken(mng)) {
            tokens.add(tok.image);
        }
        return tokens;
    }

    /**
     * Read a file and tokenize its content; see
     * {@link #tokenListFromCode(char[])}.
     *
     * @param codeFile
     *            the file to read
     * @return the images of the kept tokens of the file
     * @throws IOException
     *             if the file cannot be read
     */
    @Override
    public List<String> tokenListFromCode(final File codeFile)
            throws IOException {
        return tokenListFromCode(FileUtils.readFileToString(codeFile)
                .toCharArray());
    }

    /**
     * Tokenize the code into a map from position key to token image.
     *
     * @param code
     *            the source text to tokenize
     * @return the images of the kept tokens, sorted by the position key
     *         described at {@link #positionKey(Token)}; a later token with the
     *         same key replaces the earlier one
     */
    @Override
    public SortedMap<Integer, String> tokenListWithPos(final char[] code) {
        final ITokenManager mng = openTokenStream(code);
        final SortedMap<Integer, String> tokens = Maps.newTreeMap();
        for (Token tok = nextKeptToken(mng); tok.kind != END_OF_STREAM_KIND; tok = nextKeptToken(mng)) {
            tokens.put(positionKey(tok), tok.image);
        }
        return tokens;
    }

    /**
     * Read a file and tokenize its content; see
     * {@link #fullTokenListWithPos(char[])}. Unlike the {@code char[]}
     * overload of the same name, this returns full tokens rather than images.
     *
     * @param file
     *            the file to read
     * @return the kept tokens of the file, keyed by position
     * @throws IOException
     *             if the file cannot be read
     */
    @Override
    public SortedMap<Integer, FullToken> tokenListWithPos(final File file)
            throws IOException {
        return fullTokenListWithPos(FileUtils.readFileToString(file)
                .toCharArray());
    }

    /**
     * Wrap the code in a character stream and obtain the token manager for it.
     */
    private ITokenManager openTokenStream(final char[] code) {
        return getPythonTokenizer(new FastCharStream(code));
    }

    /**
     * Advance the manager to the next token accepted by
     * {@link #shouldAdd(Token)}, or to the terminating token (kind
     * {@code 0}), whichever comes first. The terminating token itself is never
     * passed to {@code shouldAdd}.
     */
    private Token nextKeptToken(final ITokenManager mng) {
        Token candidate = mng.getNextToken();
        while (candidate.kind != END_OF_STREAM_KIND && !shouldAdd(candidate)) {
            candidate = mng.getNextToken();
        }
        return candidate;
    }

    /**
     * Compute the integer key used to order a token by its position.
     */
    private static int positionKey(final Token tok) {
        // TODO: Bad heuristic. A token starting at column 500 or beyond gets a
        // key that overlaps with the keys of the following line(s).
        return tok.getBeginLine() * POSITION_LINE_STRIDE + tok.getBeginCol();
    }

    /**
     * Convert a PyDev token to a FullToken: same image, kind as decimal text.
     */
    private static FullToken toFullToken(final Token tok) {
        return new FullToken(tok.image, Integer.toString(tok.kind));
    }
}