package edu.uncc.cs.watsonsim.search; import java.io.IOException; import java.util.ArrayList; import java.util.List; import pitt.search.semanticvectors.CloseableVectorStore; import pitt.search.semanticvectors.FlagConfig; import pitt.search.semanticvectors.LuceneUtils; import pitt.search.semanticvectors.SearchResult; import pitt.search.semanticvectors.VectorSearcher; import pitt.search.semanticvectors.VectorStoreReader; import pitt.search.semanticvectors.vectors.ZeroVectorException; import edu.uncc.cs.watsonsim.Environment; import edu.uncc.cs.watsonsim.Passage; import edu.uncc.cs.watsonsim.Question; import edu.uncc.cs.watsonsim.Score; import edu.uncc.cs.watsonsim.scorers.Merge; /** * A variant on LuceneSearcher called SemanticVectors (D. Widdow's project) * that uses distributional semantics to score & rank the results of a Lucene * query. * @author Sean */ public class SemanticVectorSearcher extends Searcher { private FlagConfig fconfig; private CloseableVectorStore queryVecReader; private CloseableVectorStore resultsVecReader; private LuceneUtils luceneUtils; public SemanticVectorSearcher(Environment env) { super(env); try { // How to use SemanticVectors comes from their Wiki. // The search function takes many arguments, which are what we are // storing as fields here. fconfig = FlagConfig.getFlagConfig( new String[]{"-luceneindexpath", env.getConfOrDie("lucene_index"), "-docvectorsfile", "data/semanticvectors/docvectors.bin", "-termvectorsfile", "data/semanticvectors/termvectors.bin"}); queryVecReader = VectorStoreReader.openVectorStore( fconfig.termvectorsfile(), fconfig); resultsVecReader = VectorStoreReader.openVectorStore( fconfig.docvectorsfile(), fconfig); luceneUtils = new LuceneUtils(fconfig); } catch (IOException e) { e.printStackTrace(); } Score.register("SEMVEC_RANK", -1, Merge.Mean); Score.register("SEMVEC_SCORE", -1, Merge.Mean); Score.register("SEMVEC_PRESENT", 0.0, Merge.Sum); } public List<Passage> query(Question question) { List<Passage> passages = new ArrayList<>(); VectorSearcher[] sv_searchers; try { sv_searchers = new VectorSearcher[]{ new VectorSearcher.VectorSearcherCosine( queryVecReader, resultsVecReader, luceneUtils, fconfig, question.getTokens().toArray(new String[]{})), /*new VectorSearcher.VectorSearcherLucene(luceneUtils, fconfig, question.getTokens().toArray(new String[]{})), new VectorSearcher.VectorSearcherMaxSim( queryVecReader, resultsVecReader, luceneUtils, fconfig, question.getTokens().toArray(new String[]{})),*/ new VectorSearcher.VectorSearcherMinSim( queryVecReader, resultsVecReader, luceneUtils, fconfig, question.getTokens().toArray(new String[]{})), /*new VectorSearcher.VectorSearcherSubspaceSim( queryVecReader, resultsVecReader, luceneUtils, fconfig, question.getTokens().toArray(new String[]{})),*/ }; System.out.println("sv_searchers = " + sv_searchers); for (VectorSearcher sv_searcher : sv_searchers) if (sv_searcher != null) { List<SearchResult> results = sv_searcher.getNearestNeighbors(10); System.out.println("result = " + results); int rank = 0; for (SearchResult result: results) { passages.add(new Passage( "semvec", // Engine "", // Title "", // Text result.getObjectVector().getObject().toString()) // Reference .score("SEMVEC_RANK", (double) rank++) // Rank .score("SEMVEC_SCORE", (double) result.getScore()) // Score .score("SEMVEC_PRESENT", 1.0) ); } } /*sv_searcher = new VectorSearcher.VectorSearcherCosine( queryVecReader, resultsVecReader, luceneUtils, fconfig, question.tokens.toArray(new String[]{}));*/ } catch (ZeroVectorException e) { // TODO: Under what circumstances does this happen? e.printStackTrace(); } return fillFromSources(passages); } }