package pitt.search.lucene; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.core.LowerCaseTokenizer; import org.apache.lucene.analysis.en.PorterStemFilter; import java.io.*; import java.util.logging.Logger; import static pitt.search.semanticvectors.LuceneUtils.LUCENE_VERSION; public class PorterAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String s) { Tokenizer source = new LowerCaseTokenizer(); return new TokenStreamComponents(source, new PorterStemFilter(source)); } /** * Performs Porter stemming on a query String passed as a parameter * @param query string * @return query string with each word replaced with a stemmed version */ public String stemQuery(String query) { Logger logger = Logger.getLogger("pitt.search.lucene"); String stemmedQuery = ""; TokenStream theTS = createComponents(query).getTokenStream(); try { while (theTS.incrementToken()) { String theTS_s = theTS.toString().replaceAll(".*term=", ""); stemmedQuery += theTS_s.substring(0, theTS_s.length()-1) + " "; } } catch (IOException e) { logger.info("Error while stemming query "+query); } return stemmedQuery; } /** * convenience method: takes text file name as argument, produces stemmed version of this text file * as command line output * @param args : name of text file */ public static void main(String[] args) throws Exception { PorterAnalyzer thePorterAnalyzer = new PorterAnalyzer(); System.err.println("Attempting to perform Porter stemming on file "+args[0]); BufferedReader inReader = new BufferedReader(new FileReader(args[0])); String inLine = inReader.readLine(); while (inLine != null) { System.out.println(thePorterAnalyzer.stemQuery(inLine)); inLine = inReader.readLine(); } thePorterAnalyzer.close(); inReader.close(); } }