Java Code Examples for org.apache.lucene.index.TermsEnum#term()

The following examples show how to use org.apache.lucene.index.TermsEnum#term(). Each example is taken from an open source project; the project and source file are noted above each example.
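
Before the project examples, a minimal sketch of the basic call pattern may help: term() returns the BytesRef the enum is currently positioned on, so it is only meaningful after a successful next() or seek. The sketch assumes a recent Lucene release and a hypothetical LeafReader named reader with a field called "body".

Terms terms = reader.terms("body");                  // may be null if the field has no postings
if (terms != null) {
    TermsEnum termsEnum = terms.iterator();
    while (termsEnum.next() != null) {               // next() advances and returns the new term, or null at the end
        BytesRef current = termsEnum.term();         // the term the enum is positioned on (the bytes may be reused)
        System.out.println(current.utf8ToString() + " docFreq=" + termsEnum.docFreq());
    }
}
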
Example 1
Source File: AbstractFeatureBuilder.java    From jate with GNU Lesser General Public License v3.0
protected Set<String> getUniqueWords() throws JATEException, IOException {
    Terms ngramInfo = SolrUtil.getTermVector(properties.getSolrFieldNameJATENGramInfo(), solrIndexSearcher);

    TermsEnum termsEnum = ngramInfo.iterator();
    Set<String> allWords = new HashSet<>();

    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0)
            continue;
        String termStr = t.utf8ToString();
        if (!termStr.contains(" "))
            allWords.add(termStr);
    }
    if (allWords.isEmpty())
        throw new JATEException("MWEMetadata are required on 'Words', however there are no single-token lexical units in the " +
                properties.getSolrFieldNameJATENGramInfo() + " field. Check to see if your analyzer pipeline outputs uni-grams");
    return allWords;
}
 
Example 2
Source File: LuceneIndexCorpus.java    From word2vec-lucene with Apache License 2.0
@Override
public void learnVocab() throws IOException {
  super.learnVocab();

  final String field = ((LuceneIndexConfig)config).getField();
  final Terms terms = MultiFields.getTerms(reader, field);
  final BytesRef maxTerm = terms.getMax();
  final BytesRef minTerm = terms.getMin();
  Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
  IndexSearcher searcher = new IndexSearcher(reader);
  topDocs = searcher.search(q, Integer.MAX_VALUE);

  TermsEnum termsEnum = terms.iterator(null);   // reuse-accepting iterator from the pre-5.0 Lucene API

  termsEnum.seekCeil(new BytesRef());           // positions the enum on the first term
  BytesRef term = termsEnum.term();
  while (term != null) {
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int) termsEnum.totalTermFreq());
    term = termsEnum.next();
  }
}
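
Example 2 targets an older Lucene release: MultiFields.getTerms and the reuse-accepting Terms.iterator(TermsEnum) are gone in later versions, replaced by MultiTerms.getTerms and the no-argument iterator() (as Example 4 shows). A rough equivalent of the same vocabulary scan against the newer API might look like the sketch below; addWordToVocab and vocab are the members from the example above.

final Terms terms = MultiTerms.getTerms(reader, field);   // newer replacement for MultiFields.getTerms
if (terms != null) {
  TermsEnum termsEnum = terms.iterator();                  // no reuse argument in the newer API
  for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int) termsEnum.totalTermFreq());
  }
}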
 
Example 3
Source File: TermGroupFacetCollector.java    From lucene-solr with Apache License 2.0
SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException {
  super(counts, total - counts[missingCountIndex], counts[missingCountIndex],
      endFacetOrd == missingCountIndex + 1 ?  missingCountIndex : endFacetOrd);
  this.tenum = tenum;
  this.mergePos = startFacetOrd;
  if (tenum != null) {
    tenum.seekExact(mergePos);
    mergeTerm = tenum.term();
  }
}
 
Example 4
Source File: SrndTruncQuery.java    From lucene-solr with Apache License 2.0
@Override
public void visitMatchingTerms(
  IndexReader reader,
  String fieldName,
  MatchingTermVisitor mtv) throws IOException
{
  int prefixLength = prefix.length();
  Terms terms = MultiTerms.getTerms(reader, fieldName);
  if (terms != null) {
    Matcher matcher = pattern.matcher("");
    try {
      TermsEnum termsEnum = terms.iterator();

      TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
      BytesRef text;
      if (status == TermsEnum.SeekStatus.FOUND) {
        // the prefix itself exists as a term in the dictionary
        text = prefixRef;
      } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
        // the enum is positioned on the first term after the prefix
        text = termsEnum.term();
      } else {
        // SeekStatus.END: there is no term at or after the prefix
        text = null;
      }

      while (text != null) {
        if (StringHelper.startsWith(text, prefixRef)) {
          String textString = text.utf8ToString();
          matcher.reset(textString.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(new Term(fieldName, textString));
          }
        } else {
          break;
        }
        text = termsEnum.next();
      }
    } finally {
      matcher.reset();
    }
  }
}
 
Example 5
Source File: TestFSTs.java    From lucene-solr with Apache License 2.0
private void assertSame(TermsEnum termsEnum, BytesRefFSTEnum<?> fstEnum, boolean storeOrd) throws Exception {
  if (termsEnum.term() == null) {
    assertNull(fstEnum.current());
  } else {
    assertNotNull(fstEnum.current());
    assertEquals(termsEnum.term().utf8ToString() + " != " + fstEnum.current().input.utf8ToString(), termsEnum.term(), fstEnum.current().input);
    if (storeOrd) {
      // fst stored the ord
      assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.ord(), ((Long) fstEnum.current().output).longValue());
    } else {
      // fst stored the docFreq
      assertEquals("term=" + termsEnum.term().utf8ToString() + " " + termsEnum.term(), termsEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue()));
    }
  }
}
 
Example 6
Source File: UnInvertedField.java    From lucene-solr with Apache License 2.0
/**
 * Called for each term in the field being uninverted.
 * Collects {@link #maxTermCounts} for all bigTerms as well as storing them in {@link #bigTerms}.
 * @param te positioned at the current term.
 * @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between calls.
 */
@Override
protected void visitTerm(TermsEnum te, int termNum) throws IOException {

  if (termNum >= maxTermCounts.length) {
    // resize by doubling - for very large number of unique terms, expanding
    // by 4K and resultant GC will dominate uninvert times.  Resize at end if material
    int[] newMaxTermCounts = new int[ Math.min(Integer.MAX_VALUE-16, maxTermCounts.length*2) ];
    System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
    maxTermCounts = newMaxTermCounts;
  }

  final BytesRef term = te.term();

  if (te.docFreq() > maxTermDocFreq) {
    Term t = new Term(field, term);  // this makes a deep copy of the term bytes
    TopTerm topTerm = new TopTerm();
    topTerm.term = t.bytes();
    topTerm.termNum = termNum;
    topTerm.termQuery = new TermQuery(t);

    bigTerms.put(topTerm.termNum, topTerm);

    if (deState == null) {
      deState = new SolrIndexSearcher.DocsEnumState();
      deState.fieldName = field;
      deState.liveDocs = searcher.getLiveDocsBits();
      deState.termsEnum = te;  // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
      deState.postingsEnum = postingsEnum;
      deState.minSetSizeCached = maxTermDocFreq;
    }

    postingsEnum = deState.postingsEnum;
    DocSet set = searcher.getDocSet(deState);
    maxTermCounts[termNum] = set.size();
  }
}
 
Example 7
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
  Log.debug( "buildFieldMap" );
  SolrIndexSearcher searcher = rb.req.getSearcher();
  // build a synonym map from the SortedDocValues -
  // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue
  SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
  SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );
    
  ArrayList<String> searchFields = getStringFields( searcher );

  for (String searchField : searchFields ) {
    Log.debug( "adding searchField " + searchField );
    CharsRef fieldChars = new CharsRef( searchField );
    SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField );
    if (sdv == null) continue;
    Log.debug( "got SortedSetDocValues for " + searchField );
    TermsEnum te = sdv.termsEnum();
    while (te.next() != null) {
      BytesRef term = te.term();
      String fieldValue = term.utf8ToString( );
      addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder );
    }
  }
    
  addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields );
    
  fieldMap = fieldBuilder.build( );
  termMap = termBuilder.build( );
}
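
Example 7 relies on the Lucene 4.x FieldCache/AtomicReader API. On newer Lucene versions the same per-field value scan is usually written against doc values directly; a hedged sketch follows, assuming the field carries SortedSetDocValues and iterating per leaf (addTerm, fieldChars, fieldBuilder and termBuilder are the names from the example above):

for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
  SortedSetDocValues sdv = DocValues.getSortedSet(leaf.reader(), searchField);  // empty instance if the field is absent
  TermsEnum te = sdv.termsEnum();
  while (te.next() != null) {
    String fieldValue = te.term().utf8ToString();
    addTerm(fieldChars, fieldValue, fieldBuilder, termBuilder);
  }
}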
 
Example 8
Source File: AbstractFeatureBuilder.java    From jate with GNU Lesser General Public License v3.0
/**
 * Retrieve term candidates from the Solr field,
 *      see {@code uk.ac.shef.dcs.jate.JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_TERMS}
 *
 * The method assumes that the term candidates are extracted at index time and stored in a pre-configured field
 *
 * @return Set, a set of term candidate surface forms
 * @throws JATEException
 * @throws IOException
 */
protected Set<String> getUniqueTerms() throws JATEException, IOException {
    Terms terms = SolrUtil.getTermVector(properties.getSolrFieldNameJATECTerms(), solrIndexSearcher);

    TermsEnum termsEnum = terms.iterator();
    Set<String> allTermCandidates = new HashSet<>();

    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0)
            continue;
        allTermCandidates.add(t.utf8ToString());
    }
    return allTermCandidates;
}
 
Example 9
Source File: MultiPhrasePrefixQuery.java    From crate with Apache License 2.0
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }

        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }

        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }

            terms.add(new Term(field, BytesRef.deepCopyOf(term))); // term() may return reused bytes, so store a deep copy
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
 
Example 10
Source File: TermIntervalsSource.java    From lucene-solr with Apache License 2.0
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
  TermQuery query = new TermQuery(new Term(field, te.term()));
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  if (pe.advance(doc) != doc) {
    return null;
  }
  return new IntervalMatchesIterator() {

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    int upto = pe.freq();
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      if (upto <= 0) {
        pos = IntervalIterator.NO_MORE_INTERVALS;
        return false;
      }
      upto--;
      pos = pe.nextPosition();
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      return query;
    }
  };
}
 
Example 11
Source File: DocTermOrds.java    From lucene-solr with Apache License 2.0
/** Returns the term ({@link BytesRef}) corresponding to
 *  the provided ordinal. */
public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
  termsEnum.seekExact(ord);
  return termsEnum.term();
}
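
Example 11 maps an ordinal back to its term; TermsEnum.ord() gives the opposite direction when the postings format supports ordinals (not every codec does, in which case ord() throws UnsupportedOperationException). A small round-trip sketch, assuming an ord-capable enum and a hypothetical term "lucene":

if (termsEnum.seekExact(new BytesRef("lucene"))) {   // position by term bytes
  long ord = termsEnum.ord();                        // ordinal of the current term
  termsEnum.seekExact(ord);                          // position by ordinal
  BytesRef sameTerm = termsEnum.term();              // same bytes as the original term
}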