package org.apache.lucene.search; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.AttributeImpl; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.TreeSet; import lucene4ir.Lucene4IRConstants; /** * Abstract class that provides common functionalities to perform batch retrieval. * This class is intended for implementation of retrieval models by Apache Lucene * index API. * * In order to implement a new retrieval model and/or a new document matching procedure * the abstract method {@link #runQuery(String, String) runQuery} should be implemented. * See {@link RetrieverOkapiBM25} for an implementation of Okapi BM25. * * Created by dibuccio on 09/09/2016. */ public abstract class Retriever { RetrieverParams p; IndexReader reader; IndexSearcher searcher; Analyzer analyzer; QueryParser parser; String fieldToQuery = "content"; public Retriever(RetrieverParams retrievalParams){ this.p=retrievalParams; try { reader = DirectoryReader.open(FSDirectory.open( new File(p.indexName).toPath()) ); searcher = new IndexSearcher(reader); analyzer = Lucene4IRConstants.ANALYZER; parser = new QueryParser("content", analyzer); } catch (Exception e){ System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } } /** * Returns the list of tokens extracted from the query string using the specified analyzer. * * @param field document field. * * @param queryTerms query string. * * @param distinctTokens if true, return the distinct tokens in the query string. * * @return the list of tokens extracted from the given query. * * @throws IOException */ List<String> getTokens(String field, String queryTerms, boolean distinctTokens) throws IOException { List<String> tokens = new ArrayList<String>(); StringReader topicTitleReader = new StringReader(queryTerms); Set<String> seenTokens = new TreeSet<String>(); TokenStream tok; tok = analyzer.tokenStream(field, topicTitleReader); tok.reset(); while (tok.incrementToken()) { Iterator<AttributeImpl> atts = tok.getAttributeImplsIterator(); AttributeImpl token = atts.next(); String text = "" + token; if (seenTokens.contains(text) && distinctTokens) { continue; } seenTokens.add(text); tokens.add(text); } tok.close(); return tokens; } /** * Process the query file specified in the retrieval parameters (queryFile) * and print the results in trec_eval format in a text file (resultFile). */ public void processQueryFile(){ try { BufferedReader br = new BufferedReader(new FileReader(p.queryFile)); File file = new File(p.resultFile); FileWriter fw = new FileWriter(file); try { String line = br.readLine(); while (line != null){ String[] parts = line.split(" "); String qno = parts[0]; String queryTerms = ""; for (int i=1; i<parts.length; i++) queryTerms = queryTerms + " " + parts[i]; ScoreDoc[] scored = runQuery(qno, queryTerms); int n = Math.min(p.maxResults, scored.length); for(int i=0; i<n; i++){ Document doc = searcher.doc(scored[i].doc); String docno = doc.get("docnum"); fw.write(qno + " QO " + docno + " " + (i+1) + " " + scored[i].score + " " + p.runTag); fw.write(System.lineSeparator()); } line = br.readLine(); } } finally { br.close(); fw.close(); } } catch (Exception e){ System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); e.printStackTrace(); } } /** * Performs document matching for the query (queryTerms) with the given identifier (qno). * * @param qno identifier / number of the query. * * @param queryTerms string containing the query content. * * @return array of {@link ScoreDoc} containing the ranked result list. * */ public abstract ScoreDoc[] runQuery(String qno, String queryTerms); }