package edu.uncc.cs.watsonsim.search; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import edu.uncc.cs.watsonsim.Environment; import edu.uncc.cs.watsonsim.Passage; import edu.uncc.cs.watsonsim.Question; import edu.uncc.cs.watsonsim.Score; import edu.uncc.cs.watsonsim.scorers.Merge; /** * @author Phani Rahul */ public class LuceneSearcher extends Searcher { private final IndexSearcher lucene; public LuceneSearcher(Environment env) { super(env); lucene = env.lucene; Score.register("LUCENE_ANSWER_RANK", -1, Merge.Mean); Score.register("LUCENE_ANSWER_SCORE", -1, Merge.Mean); Score.register("LUCENE_ANSWER_PRESENT", 0.0, Merge.Sum); } /** * Create a Lucene query using the bigrams in the given text * @param text */ public BooleanQuery queryFromSkipBigrams(String text) { BooleanQuery q = new BooleanQuery(); String prev_word = null; for (String word : text.split("\\W+")) { if (prev_word != null) { PhraseQuery pq = new PhraseQuery(); pq.setSlop(1); pq.add(new Term("text", prev_word)); pq.add(new Term("text", word)); q.add(pq, BooleanClause.Occur.SHOULD); } q.add(new TermQuery(new Term("text", word)), BooleanClause.Occur.SHOULD); prev_word = word; } return q; } public List<Passage> query(Question question) { List<Passage> results = new ArrayList<>(); try { //ScoreDoc[] hits = env.simpleLuceneQuery(question.text, MAX_RESULTS); ScoreDoc[] hits = lucene.search( queryFromSkipBigrams( question.text + " " + question.getCategory()), MAX_RESULTS).scoreDocs; // This isn't range based because we need the rank for (int i=0; i < hits.length; i++) { ScoreDoc s = hits[i]; Document doc = lucene.doc(s.doc); results.add(new edu.uncc.cs.watsonsim.Passage( "lucene", // Engine "", // Title - filled in by shared db "", // Text - filled in by shared db doc.get("docno")) // Reference .score("LUCENE_ANSWER_RANK", (double) i) // Rank .score("LUCENE_ANSWER_SCORE", (double) s.score) // Source .score("LUCENE_ANSWER_PRESENT", 1.0) ); } } catch (IOException e) { System.out.println("Failed to query Lucene. Is the index in the correct location?"); e.printStackTrace(); } // Fill any missing full text from sources return fillFromSources(results); } }