org.apache.lucene.queries.mlt.MoreLikeThis Java Examples
The following examples show how to use
org.apache.lucene.queries.mlt.MoreLikeThis.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SearchImpl.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Builds a "More Like This" query for the given document id.
 *
 * <p>Configures a fresh {@link MoreLikeThis} instance from the supplied
 * {@code mltConfig} (field names, min/max doc frequency, min term frequency)
 * and the given analyzer, then delegates to {@link MoreLikeThis#like(int)}.
 *
 * @param docid the Lucene internal document id to find similar documents for
 * @param mltConfig source of the MLT tuning parameters
 * @param analyzer analyzer used to tokenize the document's field content
 * @return the generated similarity query
 * @throws LukeException if reading the document's terms fails
 */
@Override
public Query mltQuery(int docid, MLTConfig mltConfig, Analyzer analyzer) {
  MoreLikeThis mlt = new MoreLikeThis(reader);
  mlt.setAnalyzer(analyzer);
  mlt.setFieldNames(mltConfig.getFieldNames());
  mlt.setMinDocFreq(mltConfig.getMinDocFreq());
  mlt.setMaxDocFreq(mltConfig.getMaxDocFreq());
  mlt.setMinTermFreq(mltConfig.getMinTermFreq());
  try {
    return mlt.like(docid);
  } catch (IOException e) {
    // Chain the cause: the original dropped `e`, losing the underlying
    // I/O failure's stack trace and message.
    throw new LukeException("Failed to create MLT query for doc: " + docid, e);
  }
}
Example #2
Source File: KNearestNeighborClassifier.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Creates a {@link KNearestNeighborClassifier}.
 *
 * @param indexReader the reader on the index to be used for classification
 * @param similarity the {@link Similarity} for the underlying {@link IndexSearcher}, or
 *                   {@code null} to fall back to {@link org.apache.lucene.search.similarities.BM25Similarity}
 * @param analyzer an {@link Analyzer} used to analyze unseen text
 * @param query an optional {@link Query} restricting the docs used for training, or
 *              {@code null} to use every indexed doc
 * @param k how many MLT results to consider when picking the nearest neighbors
 * @param minDocsFreq {@link MoreLikeThis#minDocFreq} parameter; ignored unless positive
 * @param minTermFreq {@link MoreLikeThis#minTermFreq} parameter; ignored unless positive
 * @param classFieldName the field holding the classifier's output class
 * @param textFieldNames the input fields; may carry boost suffixes, e.g. {@code title^10}
 */
public KNearestNeighborClassifier(IndexReader indexReader, Similarity similarity, Analyzer analyzer, Query query, int k, int minDocsFreq, int minTermFreq, String classFieldName, String... textFieldNames) {
  this.textFieldNames = textFieldNames;
  this.classFieldName = classFieldName;
  this.mlt = new MoreLikeThis(indexReader);
  this.mlt.setAnalyzer(analyzer);
  this.mlt.setFieldNames(textFieldNames);
  this.indexSearcher = new IndexSearcher(indexReader);
  // Null similarity means "use Lucene's default scoring" (BM25).
  this.indexSearcher.setSimilarity(similarity != null ? similarity : new BM25Similarity());
  // Non-positive frequency thresholds leave MoreLikeThis at its own defaults.
  if (minDocsFreq > 0) {
    this.mlt.setMinDocFreq(minDocsFreq);
  }
  if (minTermFreq > 0) {
    this.mlt.setMinTermFreq(minTermFreq);
  }
  this.query = query;
  this.k = k;
}
Example #3
Source File: MoreLikeThisHandler.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Builds an MLT helper from request parameters.
 *
 * <p>Reads the required similarity-field list ({@code mlt.fl}), flattens and
 * validates it, then configures a {@link MoreLikeThis} instance with every
 * tunable read from {@code params}, falling back to Lucene's defaults.
 * Throws a BAD_REQUEST {@link SolrException} when no similarity field is given.
 */
public MoreLikeThisHelper( SolrParams params, SolrIndexSearcher searcher )
{
  this.searcher = searcher;
  this.reader = searcher.getIndexReader();
  this.uniqueKeyField = searcher.getSchema().getUniqueKeyField();
  // A doc set is only needed when faceting is requested on the MLT results.
  this.needDocSet = params.getBool(FacetParams.FACET,false);

  // SIMILARITY_FIELDS is mandatory: params.required() throws if absent.
  SolrParams required = params.required();
  String[] fl = required.getParams(MoreLikeThisParams.SIMILARITY_FIELDS);
  List<String> list = new ArrayList<>();
  // The param may appear multiple times, and each value may itself be a
  // delimited list; flatten everything into one array, skipping blanks.
  for (String f : fl) {
    if (!StringUtils.isEmpty(f)) {
      String[] strings = splitList.split(f);
      for (String string : strings) {
        if (!StringUtils.isEmpty(string)) {
          list.add(string);
        }
      }
    }
  }
  String[] fields = list.toArray(new String[list.size()]);
  if( fields.length < 1 ) {
    throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
        "MoreLikeThis requires at least one similarity field: "+MoreLikeThisParams.SIMILARITY_FIELDS );
  }

  this.mlt = new MoreLikeThis( reader ); // TODO -- after LUCENE-896, we can use , searcher.getSimilarity() );
  mlt.setFieldNames(fields);
  mlt.setAnalyzer( searcher.getSchema().getIndexAnalyzer() );

  // Configurable params — each falls back to the MoreLikeThis default constant.
  mlt.setMinTermFreq(       params.getInt(MoreLikeThisParams.MIN_TERM_FREQ,         MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
  mlt.setMinDocFreq(        params.getInt(MoreLikeThisParams.MIN_DOC_FREQ,          MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
  mlt.setMaxDocFreq(        params.getInt(MoreLikeThisParams.MAX_DOC_FREQ,          MoreLikeThis.DEFAULT_MAX_DOC_FREQ));
  mlt.setMinWordLen(        params.getInt(MoreLikeThisParams.MIN_WORD_LEN,          MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
  mlt.setMaxWordLen(        params.getInt(MoreLikeThisParams.MAX_WORD_LEN,          MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
  mlt.setMaxQueryTerms(     params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS,       MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
  mlt.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
  mlt.setBoost(            params.getBool(MoreLikeThisParams.BOOST, false ) );

  // There is no default for maxDocFreqPct. Also, it's a bit oddly expressed
  // as an integer value (percentage of the collection's documents count).
  // We keep Lucene's convention here.
  if (params.getInt(MoreLikeThisParams.MAX_DOC_FREQ_PCT) != null) {
    mlt.setMaxDocFreqPct(params.getInt(MoreLikeThisParams.MAX_DOC_FREQ_PCT));
  }

  boostFields = SolrPluginUtils.parseFieldBoosts(params.getParams(MoreLikeThisParams.QF));
}
Example #4
Source File: MoreLikeThisHandler.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Returns the configured {@link MoreLikeThis} instance backing this helper. */
public MoreLikeThis getMoreLikeThis() { return mlt; }
Example #5
Source File: LuceneIndexer.java From MtgDesktopCompanion with GNU General Public License v3.0 | 4 votes |
/**
 * Finds cards similar to {@code mc} via a MoreLikeThis query on the Lucene index.
 *
 * <p>First looks up the card by exact name; if found, builds an MLT query from
 * that document and maps each hit back to a {@link MagicCard} with its score.
 * Returns an empty map when {@code mc} is null, the card is not indexed, or
 * the name query fails to parse (the ParseException is logged, not rethrown).
 *
 * @param mc the card to find neighbors for; may be null
 * @return insertion-ordered map of similar cards to their MLT scores
 * @throws IOException if the index cannot be opened or read
 */
public Map<MagicCard,Float> similarity(MagicCard mc) throws IOException {
  Map<MagicCard,Float> ret = new LinkedHashMap<>();

  if(mc==null)
    return ret;

  // Lazily open the index directory on first use.
  if(dir==null)
    open();

  logger.debug("search similar cards for " + mc);

  try (IndexReader indexReader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = new IndexSearcher(indexReader);
    // Exact-name phrase query to locate the card's own document.
    Query query = new QueryParser("text", analyzer).parse("name:\""+mc.getName()+"\"");
    logger.trace(query);
    TopDocs top = searcher.search(query, 1);

    if(top.totalHits.value>0) {
      MoreLikeThis mlt = new MoreLikeThis(indexReader);
      mlt.setFieldNames(getArray(FIELDS));
      mlt.setAnalyzer(analyzer);
      mlt.setMinTermFreq(getInt(MIN_TERM_FREQ));
      mlt.setBoost(getBoolean(BOOST));

      ScoreDoc d = top.scoreDocs[0];
      logger.trace("found doc id="+d.doc);
      Query like = mlt.like(d.doc);
      logger.trace("mlt="+Arrays.asList(mlt.retrieveInterestingTerms(d.doc)));
      logger.trace("Like query="+like);

      TopDocs likes = searcher.search(like,getInt(MAX_RESULTS));
      // Cards are stored as JSON in the "data" field; deserialize each hit.
      for(ScoreDoc l : likes.scoreDocs)
        ret.put(serializer.fromJson(searcher.doc(l.doc).get("data"),MagicCard.class),l.score);

      logger.debug("found " + likes.scoreDocs.length + " results");
      // NOTE(review): close() (presumably closing this indexer's resources) is
      // only invoked on the success path, while indexReader is still open via
      // try-with-resources — confirm this asymmetry is intentional.
      close();
    }
    else {
      logger.error("can't found "+mc);
    }
  } catch (ParseException e) {
    // Best-effort: an unparseable name yields an empty result, not a failure.
    logger.error(e);
  }
  return ret;
}
Example #6
Source File: ContextAnalyzerIndex.java From modernmt with Apache License 2.0 | 4 votes |
/**
 * Computes a context vector for {@code queryDocument}: the memories whose indexed
 * content is most similar to it, weighted by retrieval score.
 *
 * <p>Builds a MoreLikeThis query over the language-direction content field,
 * restricts it to documents visible to {@code user} (public ones, plus the
 * user's own when {@code user} is non-null), optionally rescores the hits,
 * and folds the top results into a {@link ContextVector}.
 *
 * @param user requesting user, or null for public-only matching
 * @param direction language direction selecting the content field
 * @param queryDocument the document whose context is being computed
 * @param limit maximum number of entries in the returned vector
 * @param rescorer optional post-retrieval rescorer; may be null
 * @throws IOException on index access failure
 */
public ContextVector getContextVector(UUID user, LanguageDirection direction, Corpus queryDocument,
                                      int limit, Rescorer rescorer) throws IOException {
    String contentFieldName = DocumentBuilder.makeContentFieldName(direction);

    IndexSearcher searcher = this.getIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    // Get matching documents — fetch at least MIN_RESULT_BATCH so rescoring
    // has enough candidates even for small limits.
    int rawLimit = limit < MIN_RESULT_BATCH ? MIN_RESULT_BATCH : limit;

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setFieldNames(new String[]{contentFieldName});
    mlt.setMinDocFreq(0);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(2);
    mlt.setBoost(true);
    mlt.setAnalyzer(analyzer);

    TopScoreDocCollector collector = TopScoreDocCollector.create(rawLimit, true);

    Reader queryDocumentReader = queryDocument.getRawContentReader();

    try {
        Query mltQuery = mlt.like(contentFieldName, queryDocumentReader);
        // Ownership filter: anonymous users see only public documents; a known
        // user additionally matches their own (at least one clause must hit).
        BooleanQuery ownerQuery = new BooleanQuery();

        if (user == null) {
            ownerQuery.add(DocumentBuilder.makePublicOwnerMatchingQuery(), BooleanClause.Occur.MUST);
        } else {
            ownerQuery.add(DocumentBuilder.makePublicOwnerMatchingQuery(), BooleanClause.Occur.SHOULD);
            ownerQuery.add(DocumentBuilder.makeOwnerMatchingQuery(user), BooleanClause.Occur.SHOULD);
            ownerQuery.setMinimumNumberShouldMatch(1);
        }

        FilteredQuery query = new FilteredQuery(mltQuery, new QueryWrapperFilter(ownerQuery));
        searcher.search(query, collector);
    } finally {
        // Always release the corpus reader, even if query building fails.
        IOUtils.closeQuietly(queryDocumentReader);
    }

    ScoreDoc[] topDocs = collector.topDocs().scoreDocs;

    // Rescore result against the original query document, when requested.
    if (rescorer != null) {
        Document referenceDocument = DocumentBuilder.newInstance(direction, queryDocument);
        rescorer.rescore(reader, this.analyzer, topDocs, referenceDocument, contentFieldName);
    }

    // Build result: one (memory, score) entry per retained hit, capped at limit.
    ContextVector.Builder resultBuilder = new ContextVector.Builder(topDocs.length);
    resultBuilder.setLimit(limit);

    for (ScoreDoc topDocRef : topDocs) {
        Document topDoc = searcher.doc(topDocRef.doc);

        long memory = DocumentBuilder.getMemory(topDoc);
        resultBuilder.add(memory, topDocRef.score);
    }

    return resultBuilder.build();
}