org.apache.lucene.search.similarities.DefaultSimilarity Java Examples

The following examples show how to use org.apache.lucene.search.similarities.DefaultSimilarity. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage examples in the sidebar.
Example #1
Source File: MoreLikeThisQuery.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
@Override
public Query rewrite(IndexReader reader) throws IOException {
    // Fall back to Lucene's DefaultSimilarity when none was configured.
    final XMoreLikeThis moreLikeThis;
    if (similarity == null) {
        moreLikeThis = new XMoreLikeThis(reader, new DefaultSimilarity());
    } else {
        moreLikeThis = new XMoreLikeThis(reader, similarity);
    }

    // Transfer every tuning knob from this query onto the MLT helper.
    moreLikeThis.setFieldNames(moreLikeFields);
    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setMinTermFreq(minTermFrequency);
    moreLikeThis.setMinDocFreq(minDocFreq);
    moreLikeThis.setMaxDocFreq(maxDocFreq);
    moreLikeThis.setMaxQueryTerms(maxQueryTerms);
    moreLikeThis.setMinWordLen(minWordLen);
    moreLikeThis.setMaxWordLen(maxWordLen);
    moreLikeThis.setStopWords(stopWords);
    moreLikeThis.setBoost(boostTerms);
    moreLikeThis.setBoostFactor(boostTermsFactor);

    // "unlike" text/fields, when present, are handled separately.
    if (this.unlikeText != null || this.unlikeFields != null) {
        handleUnlike(moreLikeThis, this.unlikeText, this.unlikeFields);
    }

    return createQuery(moreLikeThis);
}
 
Example #2
Source File: ContextAnalyzerIndex.java    From modernmt with Apache License 2.0 6 votes vote down vote up
public ContextAnalyzerIndex(Directory directory, Rescorer rescorer) throws IOException {
    this.indexDirectory = directory;
    this.analyzer = new CorpusAnalyzer();
    this.rescorer = rescorer;

    // Similarity that disables length normalization: lengthNorm() is a
    // constant 1, so field length does not influence scoring.
    DefaultSimilarity flatLengthNormSimilarity = new DefaultSimilarity() {
        @Override
        public float lengthNorm(FieldInvertState state) {
            return 1.f;
        }
    };

    // Index writer setup
    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_4_10_4, this.analyzer);
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writerConfig.setSimilarity(flatLengthNormSimilarity);

    this.indexWriter = new IndexWriter(this.indexDirectory, writerConfig);

    // Commit immediately so a fresh directory becomes a valid, readable index.
    if (!DirectoryReader.indexExists(directory))
        this.indexWriter.commit();
}
 
Example #3
Source File: TermVectorsFilter.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
    // Per-document term vectors plus the index-wide view used for statistics.
    this.fields = termVectorsByField;
    this.topLevelFields = topLevelFields;
    this.selectedFields = selectedFields;
    this.dfs = dfs;

    // Working state: per-term scores, per-field sizes, and the scoring model.
    this.scoreTerms = new HashMap<>();
    this.sizes = AtomicLongMap.create();
    this.similarity = new DefaultSimilarity();
}
 
Example #4
Source File: DefaultSimilarityProvider.java    From Elasticsearch with Apache License 2.0 4 votes vote down vote up
/**
 * {@inheritDoc}
 * <p>
 * Returns the {@link DefaultSimilarity} instance held by this provider.
 */
@Override
public DefaultSimilarity get() {
    return similarity;
}
 
Example #5
Source File: XMoreLikeThis.java    From Elasticsearch with Apache License 2.0 4 votes vote down vote up
/**
 * Constructor requiring an IndexReader.
 * <p>
 * Delegates to the two-argument constructor, supplying a new
 * {@link DefaultSimilarity} for term weighting.
 */
public XMoreLikeThis(IndexReader ir) {
    this(ir, new DefaultSimilarity());
}
 
Example #6
Source File: FullTextIndexTupleSerializer.java    From database with GNU General Public License v2.0 4 votes vote down vote up
/**
 * De-serializes a tuple from the full-text index.
 * <p>
 * The key layout written by {@code serializeKey()} is
 * {unicode sort key(term), termWeight, docId[, fieldId]}. Only the
 * fixed-width fields at the tail of the key are decoded here; the unicode
 * sort key for the term itself is not reversible, so the token is reported
 * as {@code null}.
 *
 * @param tuple
 *            The tuple to decode.
 * @param keyOnly
 *            When <code>true</code>, only the key components are
 *            materialized and a {@link ReadOnlyTermDocKey} is returned.
 *
 * @return The decoded key (or full record when {@code keyOnly} is
 *         <code>false</code>).
 */
protected ITermDocKey<V> deserialize(final ITuple tuple,
            final boolean keyOnly) {

        final ByteArrayBuffer kbuf = tuple.getKeyBuffer();

        /*
         * The byte offset of the docId in the key.
         * 
         * Note: This is also the byte length of the match on the unicode sort
         * key (plus the compact term weight), which appears at the head of
         * the key.
         */
        final int docIdOffset = kbuf.limit() - Bytes.SIZEOF_LONG /* docId */
                - (fieldsEnabled ? Bytes.SIZEOF_INT/* fieldId */: 0);

        // Decode the document identifier. The double cast through Object is
        // required because V is a generic type parameter here.
        @SuppressWarnings("unchecked")
        final V docId = (V) (Object) Long.valueOf(KeyBuilder.decodeLong(kbuf.array(),
                docIdOffset));

        // Decode the fieldId when fields are enabled; -1 marks "no field".
        final int fieldId;
        if (fieldsEnabled) {
            /*
             * NOTE(review): decodeShort() reads fewer bytes than the
             * SIZEOF_INT reserved for the fieldId in the key layout —
             * confirm this matches the encoding used by serializeKey().
             */
            fieldId = KeyBuilder.decodeShort(kbuf.array(), kbuf.limit()
                    - Bytes.SIZEOF_INT);
        } else {
            fieldId = -1;
        }

        // The one-byte compact term weight immediately precedes the docId.
        final int termWeightOffset = docIdOffset - Bytes.SIZEOF_BYTE;

        final byte termWeightCompact = kbuf.getByte(termWeightOffset);

        /*
         * See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html
         * 
         * For more information on the round-trip of the normalized term
         * weight through Lucene's compact norm encoding.
         */
        final DefaultSimilarity similarity = new DefaultSimilarity();

        final double termWeight = similarity.decodeNormValue(termWeightCompact);

        if (keyOnly) {

            return new ReadOnlyTermDocKey(docId, fieldId, termWeight);
            
        }

        // The token is not recoverable from the unicode sort key: pass null.
        return new ReadOnlyTermDocRecord<V>(null/* token */, docId, fieldId,
                /* termFreq, */ termWeight);

    }
 
Example #7
Source File: RDFFullTextIndexTupleSerializer.java    From database with GNU General Public License v2.0 4 votes vote down vote up
/**
 * De-serializes a tuple from the RDF full-text index.
 * <p>
 * The key layout is {unicode sort key(term), termWeight, docId(IV)}. The
 * variable byte length of the docId {@link IV} is stored at the head of the
 * tuple's value, so the value stream must be consumed even when only the
 * key is of interest. The token itself is not recoverable from the unicode
 * sort key and is reported as {@code null}.
 *
 * @param tuple
 *            The tuple to decode.
 * @param keyOnly
 *            When <code>true</code>, only the key components are
 *            materialized and a {@link ReadOnlyTermDocKey} is returned.
 *
 * @return The decoded key (or full record when {@code keyOnly} is
 *         <code>false</code>).
 */
protected ITermDocKey deserialize(final ITuple tuple, final boolean keyOnly) {

        final ByteArrayBuffer kbuf = tuple.getKeyBuffer();

        // The byte length of the docId IV, stored at the head of the value.
        // This read is required (it positions the value stream) even when
        // keyOnly is true.
        final int byteLength;
        try {
            byteLength = ShortPacker.unpackShort((DataInput) tuple
            		.getValueStream());
        } catch (IOException ex) {
            // Wrap: this method's signature does not declare IOException.
            throw new RuntimeException(ex);
        }
        
        final int docIdOffset = kbuf.limit() - byteLength;

        // Decode the document identifier IV from the tail of the key.
        final IV docId = (IV) IVUtility.decodeFromOffset(kbuf.array(),
                docIdOffset);

        // The one-byte compact term weight immediately precedes the docId.
        final int termWeightOffset = docIdOffset - Bytes.SIZEOF_BYTE;
        
        final byte termWeightCompact = kbuf.getByte(termWeightOffset);
        
        /*
         * See: http://lucene.apache.org/core/old_versioned_docs/versions/3_0_2/api/all/org/apache/lucene/search/Similarity.html
         * 
         * For more information on the round-trip of the normalized term
         * weight through Lucene's compact norm encoding.
         */
        final DefaultSimilarity similarity = new DefaultSimilarity(); 

        final double termWeight = similarity.decodeNormValue(termWeightCompact);

        if (keyOnly) {

            return new ReadOnlyTermDocKey(docId, NO_FIELD, termWeight);

        }

        // The token is not recoverable from the unicode sort key: pass null.
        return new ReadOnlyTermDocRecord(null/* token */, docId, NO_FIELD,
                /* termFreq, */ termWeight);

    }
 
Example #8
Source File: FullTextIndexTupleSerializer.java    From database with GNU General Public License v2.0 2 votes vote down vote up
/**
 * Forms the key for the full-text index entry:
 * {unicode sort key(term), termWeight, docId[, fieldId]}.
 */
@Override
public byte[] serializeKey(final Object obj) {

    @SuppressWarnings("unchecked")
    final ITermDocKey<V> entry = (ITermDocKey<V>) obj;

    final String token = entry.getToken();

    final double weight = entry.getLocalTermWeight();

    /*
     * Compress the local term weight into Lucene's compact norm encoding.
     *
     * See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html
     *
     * For more information on the round-trip of normalized term weight.
     */
    final long compactWeight = new DefaultSimilarity()
            .encodeNormValue((float) weight);

    final V documentId = entry.getDocId();

    final IKeyBuilder keyBuilder = getKeyBuilder();
    keyBuilder.reset();

    // The unicode sort key for the token text heads the key.
    keyBuilder.appendText(token, true/* unicode */, false/* successor */);

    keyBuilder.append(compactWeight);

    keyBuilder.append(documentId);

    // The fieldId is only part of the key when fields are enabled.
    if (fieldsEnabled)
        keyBuilder.append(entry.getFieldId());

    final byte[] key = keyBuilder.getKey();

    if (log.isDebugEnabled()) {
        log.debug("{" + token + "," + documentId
                + (fieldsEnabled ? "," + entry.getFieldId() : "")
                + "}, key=" + BytesUtil.toString(key));
    }

    return key;

}
 
Example #9
Source File: RDFFullTextIndexTupleSerializer.java    From database with GNU General Public License v2.0 2 votes vote down vote up
/**
 * Forms the key for the RDF full-text index entry:
 * {unicode sort key(term), termWeight, docId(IV)}.
 */
@Override
public byte[] serializeKey(final Object obj) {

    final ITermDocKey entry = (ITermDocKey) obj;

    final String token = entry.getToken();

    final double weight = entry.getLocalTermWeight();

    /*
     * Compress the local term weight into Lucene's compact norm encoding.
     *
     * See: http://lucene.apache.org/core/old_versioned_docs/versions/3_0_2/api/all/org/apache/lucene/search/Similarity.html
     *
     * For more information on the round-trip of normalized term weight.
     */
    final long compactWeight = new DefaultSimilarity()
            .encodeNormValue((float) weight);

    final IV documentId = (IV) entry.getDocId();

    final IKeyBuilder keyBuilder = getKeyBuilder();
    keyBuilder.reset();

    // The unicode sort key for the token text heads the key.
    keyBuilder.appendText(token, true/* unicode */, false/* successor */);

    keyBuilder.append(compactWeight);

    // The docId is an IV and uses the IV-specific key encoding.
    IVUtility.encode(keyBuilder, documentId);

    final byte[] key = keyBuilder.getKey();

    if (log.isDebugEnabled()) {
        log.debug("{" + token + "," + documentId + "}, key="
                + BytesUtil.toString(key));
    }

    return key;

}