Java Code Examples for org.apache.lucene.index.TermsEnum#docFreq()

The following examples show how to use org.apache.lucene.index.TermsEnum#docFreq(). The original project, source file, and license are noted above each example.
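Before the project examples, here is a minimal, self-contained sketch of the call pattern they all share: open a reader, obtain a TermsEnum for a field, walk its terms, and read docFreq() for the term the enum is currently positioned on. The index path and the "body" field name are placeholders, not taken from any project below.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class DocFreqDemo {
  public static void main(String[] args) throws IOException {
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
      Terms terms = MultiTerms.getTerms(reader, "body"); // null if the field does not exist
      if (terms == null) {
        return;
      }
      TermsEnum te = terms.iterator();
      BytesRef term;
      while ((term = te.next()) != null) {
        // docFreq() = number of documents containing the term the enum is positioned on
        // (deleted documents are not excluded from this count)
        System.out.println(term.utf8ToString() + " -> docFreq=" + te.docFreq());
      }
    }
  }
}

Note that docFreq() is only meaningful once the enum is positioned on a term, either via next() or a successful seek.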
Example 1
Source File: QueryAutoStopWordAnalyzer.java    From lucene-solr with Apache License 2.0
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Terms with a document frequency above this value are treated as stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;
  
  for (String field : fields) {
    Set<String> stopWords = new HashSet<>();
    Terms terms = MultiTerms.getTerms(indexReader, field);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (terms != null) {
      TermsEnum te = terms.iterator();
      BytesRef text;
      while ((text = te.next()) != null) {
        if (te.docFreq() > maxDocFreq) {
          spare.copyUTF8Bytes(text);
          stopWords.add(spare.toString());
        }
      }
    }
    stopWordsPerField.put(field, stopWords);
  }
}
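A hedged usage sketch (not taken from the lucene-solr sources; the base analyzer, field name, and threshold are assumptions): wrap an analyzer so that terms whose docFreq() in the "body" field exceeds 200 are filtered out as stopwords. Imports are omitted, as in the examples on this page.

Analyzer base = new StandardAnalyzer();
try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
  Analyzer stopWordAware = new QueryAutoStopWordAnalyzer(
      base, reader, Collections.singleton("body"), 200);   // maxDocFreq = 200 (assumed)
  // Build queries with stopWordAware: tokens whose docFreq exceeds 200 in "body" are dropped.
}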
 
Example 2
Source File: SolrRangeQuery.java    From lucene-solr with Apache License 2.0
/** Try to collect terms from the given terms enum and return count=sum(df) for the terms visited so far,
 *  or (-count - 1) if the enum was exhausted and this query should be rewritten into a boolean query.
 *  If not exhausted, the termsEnum will already be positioned on the next term.
 */
private long collectTerms(LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms) throws IOException {
  long count = 0;
  final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, IndexSearcher.getMaxClauseCount());
  for (int i = 0; i < threshold; ++i) {
    final BytesRef term = termsEnum.next();
    if (term == null) {
      return -count - 1;
    }
    TermState state = termsEnum.termState();
    int df = termsEnum.docFreq();
    count += df;
    terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, df, termsEnum.totalTermFreq()));
  }
  return termsEnum.next() == null ? (-count - 1) : count;
}
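The sign convention above is easy to misread, so here is a hedged sketch of how a caller might interpret the result (hypothetical code, not the actual SolrRangeQuery rewrite logic; variable names are illustrative):

long result = collectTerms(context, termsEnum, collectedTerms);
if (result < 0) {
  long sumDocFreq = -result - 1;     // undo the (-count - 1) encoding
  // The enum was exhausted below the threshold: rewrite into a BooleanQuery over
  // collectedTerms; sumDocFreq is the summed docFreq of those terms.
} else {
  // More terms remain than the threshold allows: keep iterating termsEnum
  // (already positioned on the next term) and build a bitset/filter instead.
}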
 
Example 3
Source File: QERetrievalApp.java    From lucene4ir with Apache License 2.0
/**
 * Combines the individual term vectors of each document into a single list.
 * @param terms the per-document term vectors to combine
 * @return a map from term string to its combined QETerm statistics
 */
public HashMap<String, QETerm> combineTerms(Vector<Terms> terms){
    HashMap<String, QETerm> combinedTerms = new HashMap<String, QETerm>();
    int numDocs = terms.size();
    for(Terms ts : terms){
        try {
            TermsEnum te = ts.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                String tString = term.utf8ToString();
                QETerm qet = new QETerm(tString, te.totalTermFreq(),te.docFreq(),numDocs);
                if (combinedTerms.containsKey(tString)){
                    QETerm mergedTerm = qet.combine(combinedTerms.get(tString));
                    combinedTerms.replace(tString,mergedTerm);
                }
                else
                    combinedTerms.put(tString,qet);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return combinedTerms;
}
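A hedged sketch of how such a method might be fed (the field name and document ids are illustrative; lucene4ir's real call sites may differ). It assumes an open IndexReader named reader, a field indexed with term vectors, and omits exception handling and imports:

Vector<Terms> docTerms = new Vector<>();
for (int docId : new int[]{3, 7, 42}) {               // e.g. top-ranked feedback documents
  Terms tv = reader.getTermVector(docId, "content");  // null if no term vector was stored
  if (tv != null) {
    docTerms.add(tv);
  }
}
HashMap<String, QETerm> combined = combineTerms(docTerms);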
 
Example 4
Source File: QualityQueriesFinder.java    From lucene-solr with Apache License 2.0
private String [] bestTerms(String field,int numTerms) throws IOException {
  PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
  IndexReader ir = DirectoryReader.open(dir);
  try {
    int threshold = ir.maxDoc() / 10; // ignore words too common.
    Terms terms = MultiTerms.getTerms(ir, field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      while (termsEnum.next() != null) {
        int df = termsEnum.docFreq();
        if (df<threshold) {
          String ttxt = termsEnum.term().utf8ToString();
          pq.insertWithOverflow(new TermDf(ttxt,df));
        }
      }
    }
  } finally {
    ir.close();
  }
  String res[] = new String[pq.size()];
  int i = 0;
  while (pq.size()>0) {
    TermDf tdf = pq.pop(); 
    res[i++] = tdf.word;
    System.out.println(i+".   word:  "+tdf.df+"   "+tdf.word);
  }
  return res;
}
 
Example 5
Source File: JoinDocFreqValueSource.java    From lucene-solr with Apache License 2.0
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException
{
  final BinaryDocValues terms = DocValues.getBinary(readerContext.reader(), field);
  final IndexReader top = ReaderUtil.getTopLevelContext(readerContext).reader();
  Terms t = MultiTerms.getTerms(top, qfield);
  final TermsEnum termsEnum = t == null ? TermsEnum.EMPTY : t.iterator();
  
  return new IntDocValues(this) {

    int lastDocID = -1;

    @Override
    public int intVal(int doc) throws IOException {
      if (doc < lastDocID) {
        throw new IllegalArgumentException("docs were sent out-of-order: lastDocID=" + lastDocID + " vs docID=" + doc);
      }
      lastDocID = doc;
      int curDocID = terms.docID();
      if (doc > curDocID) {
        curDocID = terms.advance(doc);
      }
      if (doc == curDocID) {
        BytesRef term = terms.binaryValue();
        if (termsEnum.seekExact(term)) {
          return termsEnum.docFreq();
        }
      }
      return 0;
    }
  };
}
 
Example 6
Source File: TermFilteredPresearcher.java    From lucene-solr with Apache License 2.0
private Query buildFilterClause(LeafReader reader, String field) throws IOException {

    Terms terms = reader.terms(field);
    if (terms == null)
      return null;

    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    int docsInBatch = reader.maxDoc();

    BytesRef term;
    TermsEnum te = terms.iterator();
    while ((term = te.next()) != null) {
      // we need to check that every document in the batch has the same field values, otherwise
      // this filtering will not work
      if (te.docFreq() != docsInBatch)
        throw new IllegalArgumentException("Some documents in this batch do not have a term value of "
            + field + ":" + Term.toString(term));
      bq.add(new TermQuery(new Term(field, BytesRef.deepCopyOf(term))), BooleanClause.Occur.SHOULD);
    }

    BooleanQuery built = bq.build();

    if (built.clauses().size() == 0)
      return null;

    return built;
  }
 
Example 7
Source File: TermInSetQuery.java    From lucene-solr with Apache License 2.0
TermAndState(String field, TermsEnum termsEnum) throws IOException {
  this.field = field;
  this.termsEnum = termsEnum;
  this.term = BytesRef.deepCopyOf(termsEnum.term());
  this.state = termsEnum.termState();
  this.docFreq = termsEnum.docFreq();
  this.totalTermFreq = termsEnum.totalTermFreq();
}
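The point of caching the TermState alongside docFreq and totalTermFreq is that the enum can later be re-positioned on the term without a second dictionary lookup. A minimal hedged sketch (illustrative names, not the actual TermInSetQuery rewrite code):

List<TermAndState> matched = new ArrayList<>();
BytesRef t;
while ((t = termsEnum.next()) != null) {
  if (isInteresting(t)) {                                // hypothetical predicate
    matched.add(new TermAndState("body", termsEnum));    // copies term, state, docFreq, totalTermFreq
  }
}
// Later: jump straight back to a saved term using its cached state.
for (TermAndState tas : matched) {
  termsEnum.seekExact(tas.term, tas.state);
  // postings for tas.term can now be pulled without re-walking the term dictionary
}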
 
Example 8
Source File: GraphTermsQParserPlugin.java    From lucene-solr with Apache License 2.0
private void collectTermStates(IndexReader reader,
                               List<LeafReaderContext> leaves,
                               TermStates[] contextArray,
                               Term[] queryTerms) throws IOException {
  TermsEnum termsEnum = null;
  for (LeafReaderContext context : leaves) {

    Terms terms = context.reader().terms(this.field);
    if (terms == null) {
      // field does not exist
      continue;
    }

    termsEnum = terms.iterator();

    if (termsEnum == TermsEnum.EMPTY) continue;

    for (int i = 0; i < queryTerms.length; i++) {
      Term term = queryTerms[i];
      TermStates termStates = contextArray[i];

      if (termsEnum.seekExact(term.bytes())) {
        if (termStates == null) {
          contextArray[i] = new TermStates(reader.getContext(),
              termsEnum.termState(), context.ord, termsEnum.docFreq(),
              termsEnum.totalTermFreq());
        } else {
          termStates.register(termsEnum.termState(), context.ord,
              termsEnum.docFreq(), termsEnum.totalTermFreq());
        }
      }
    }
  }
}
 
Example 9
Source File: UnInvertedField.java    From lucene-solr with Apache License 2.0
/**
 * Called for each term in the field being uninverted.
 * Collects {@link #maxTermCounts} for all bigTerms as well as storing them in {@link #bigTerms}.
 * @param te positioned at the current term.
 * @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between calls.
 */
@Override
protected void visitTerm(TermsEnum te, int termNum) throws IOException {

  if (termNum >= maxTermCounts.length) {
    // resize by doubling - for very large number of unique terms, expanding
    // by 4K and resultant GC will dominate uninvert times.  Resize at end if material
    int[] newMaxTermCounts = new int[ Math.min(Integer.MAX_VALUE-16, maxTermCounts.length*2) ];
    System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
    maxTermCounts = newMaxTermCounts;
  }

  final BytesRef term = te.term();

  if (te.docFreq() > maxTermDocFreq) {
    Term t = new Term(field, term);  // this makes a deep copy of the term bytes
    TopTerm topTerm = new TopTerm();
    topTerm.term = t.bytes();
    topTerm.termNum = termNum;
    topTerm.termQuery = new TermQuery(t);

    bigTerms.put(topTerm.termNum, topTerm);

    if (deState == null) {
      deState = new SolrIndexSearcher.DocsEnumState();
      deState.fieldName = field;
      deState.liveDocs = searcher.getLiveDocsBits();
      deState.termsEnum = te;  // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
      deState.postingsEnum = postingsEnum;
      deState.minSetSizeCached = maxTermDocFreq;
    }

    postingsEnum = deState.postingsEnum;
    DocSet set = searcher.getDocSet(deState);
    maxTermCounts[termNum] = set.size();
  }
}
 
Example 10
Source File: CodecCollector.java    From mtas with Apache License 2.0
/**
 * Compute termvector number basic.
 *
 * @param termsEnum
 *          the terms enum
 * @param r
 *          the leaf reader for the segment
 * @return the termvector number basic
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    TermsEnum termsEnum, LeafReader r) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if (!hasDeletedDocuments) {
    result.valueSum[0] = termsEnum.totalTermFreq();
    result.docNumber = termsEnum.docFreq();
    if (result.valueSum[0] > -1) {
      return result;
    }
  }
  throw new IOException("should not call this");
}
 
Example 11
Source File: AlfrescoLukeRequestHandler.java    From SearchServices with GNU Lesser General Public License v3.0
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	SolrParams params = req.getParams();
	final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

	TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to
														// collect the top N
														// terms in.

	final CharsRefBuilder spare = new CharsRefBuilder();

	Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(),
			field);
	if (terms == null) { // field does not exist
		return;
	}
	TermsEnum termsEnum = terms.iterator();
	BytesRef text;
	int[] buckets = new int[HIST_ARRAY_SIZE];
	while ((text = termsEnum.next()) != null) {
		++tiq.distinctTerms;
		int freq = termsEnum.docFreq(); // This calculation seems odd, but
										// it gives the same results as it
										// used to.
		int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
		buckets[slot] = buckets[slot] + 1;
		if (numTerms > 0 && freq > tiq.minFreq) {
			spare.copyUTF8Bytes(text);
			String t = spare.toString();

			tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum
					.docFreq()));
			if (tiq.size() > numTerms) { // if tiq full
				tiq.pop(); // remove lowest in tiq
				tiq.minFreq = tiq.getTopTermInfo().docFreq;
			}
		}
	}
	tiq.histogram.add(buckets);
	fieldMap.add("distinct", tiq.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", tiq.histogram.toNamedList());
}
 
Example 12
Source File: SplitOp.java    From lucene-solr with Apache License 2.0
static Collection<RangeCount> getHashHistogram(SolrIndexSearcher searcher, String prefixField, DocRouter router, DocCollection collection) throws IOException {
  RTimer timer = new RTimer();
  TreeMap<DocRouter.Range,RangeCount> counts = new TreeMap<>();

  Terms terms = MultiTerms.getTerms(searcher.getIndexReader(), prefixField);
  if (terms == null) {
    return counts.values();
  }

  int numPrefixes = 0;
  int numTriLevel = 0;
  int numCollisions = 0;
  long sumBuckets = 0;

  TermsEnum termsEnum = terms.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    numPrefixes++;

    String termStr = term.utf8ToString();
    int firstSep = termStr.indexOf(CompositeIdRouter.SEPARATOR);
    // truncate to first separator since we don't support multiple levels currently
    // NOTE: this does not currently work for tri-level composite ids since the number of bits allocated to the first ID is 16 for a 2 part id
    // and 8 for a 3 part id!
    if (firstSep != termStr.length()-1 && firstSep > 0) {
      numTriLevel++;
      termStr = termStr.substring(0, firstSep+1);
    }

    DocRouter.Range range = router.getSearchRangeSingle(termStr, null, collection);
    int numDocs = termsEnum.docFreq();
    sumBuckets += numDocs;

    RangeCount rangeCount = new RangeCount(range, numDocs);

    RangeCount prev = counts.put(rangeCount.range, rangeCount);
    if (prev != null) {
      // we hit a hash collision or truncated a prefix to first level, so add the buckets together.
      rangeCount.count += prev.count;
      numCollisions++;
    }
  }

  if (log.isInfoEnabled()) {
    log.info("Split histogram: ms={}, numBuckets={} sumBuckets={} numPrefixes={} numTriLevel={} numCollisions={}"
        , timer.getTime(), counts.size(), sumBuckets, numPrefixes, numTriLevel, numCollisions);
  }

  return counts.values();
}
 
Example 13
Source File: LukeRequestHandler.java    From lucene-solr with Apache License 2.0
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
    throws IOException {

  SolrParams params = req.getParams();
  final int numTerms = params.getInt( NUMTERMS, DEFAULT_COUNT );

  TopTermQueue tiq = new TopTermQueue(numTerms + 1);  // Something to collect the top N terms in.

  final CharsRefBuilder spare = new CharsRefBuilder();

  Terms terms = MultiTerms.getTerms(req.getSearcher().getIndexReader(), field);
  if (terms == null) {  // field does not exist
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  int[] buckets = new int[HIST_ARRAY_SIZE];
  while ((text = termsEnum.next()) != null) {
    ++tiq.distinctTerms;
    int freq = termsEnum.docFreq();  // This calculation seems odd, but it gives the same results as it used to.
    int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
    buckets[slot] = buckets[slot] + 1;
    if (numTerms > 0 && freq > tiq.minFreq) {
      spare.copyUTF8Bytes(text);
      String t = spare.toString();

      tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
      if (tiq.size() > numTerms) { // if tiq full
        tiq.pop(); // remove lowest in tiq
        tiq.minFreq = tiq.getTopTermInfo().docFreq;
      }
    }
  }
  tiq.histogram.add(buckets);
  fieldMap.add("distinct", tiq.distinctTerms);

  // Include top terms
  fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

  // Add a histogram
  fieldMap.add("histogram", tiq.histogram.toNamedList());
}
 
Example 14
Source File: TermFreqAnalyser.java    From Siamese with GNU General Public License v3.0
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
        String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
                + indexName + "/0/index";
        DecimalFormat df = new DecimalFormat("#.00");
        int printEvery = 100000;
        File outputFile = new File(outputFileName);
        if (outputFile.exists()) {
            if (!outputFile.delete()) {
                System.out.println("ERROR: cannot delete the output file.");
                System.exit(0);
            }
        }
        /* adapted from
        https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index
         */
        int count = 0;
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile)));
            Fields fields = MultiFields.getFields(reader);
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            int size = 0;
            // TODO: is there a better solution?
            // iterate to get the size
            while (termsEnum.next() != null) {
                size++;
            }
//            String[] termArr = new String[size];
            long[] freqArr = new long[size];
            // do the real work
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
//                String term = termsEnum.term().utf8ToString();
                long tfreq = 0;
                if (freqType.equals("tf"))
                    tfreq = termsEnum.totalTermFreq();
                else if (freqType.equals("df"))
                    tfreq = termsEnum.docFreq();
                else {
                    System.out.println("Wrong frequency. Quit!");
                    System.exit(0);
                }
//                termArr[count] = term;
                freqArr[count] = tfreq;
                if (count % printEvery == 0) {
                    System.out.println("processed: " + count + " terms "
                            + " [" + df.format(((long)count * 100)/size) + "%]");
                }
                count++;
            }
            System.out.println(field + ": total = " + count);
            double[] data = new double[size];
            String output = "freq\n";
            for (int i = 0; i < freqArr.length; i++) {
                data[i] = freqArr[i];
                output += freqArr[i] + "\n";
                if (i > 0 && i % printEvery == 0) {
                    MyUtils.writeToFile("./", outputFileName, output, true);
                    System.out.println("written: " + i + " terms "
                            + " [" + df.format(((long)i * 100)/size) + "%]");
                    output = "";
                }
            }
            // write the rest to the file
            MyUtils.writeToFile("./",outputFileName, output, true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
 
Example 15
Source File: DocSetInfoCommand.java    From clue with Apache License 2.0
@Override
public void execute(Namespace args, PrintStream out) throws Exception {
  String field = args.getString("field");
  String termVal = null;
  int bucketSize = args.getInt("size");

  if (field != null){
    String[] parts = field.split(":");
    if (parts.length > 1){
      field = parts[0];
      termVal = parts[1];
    }
  }
  
  IndexReader reader = ctx.getIndexReader();
  List<LeafReaderContext> leaves = reader.leaves();
  

  PostingsEnum postingsEnum = null;
  for (LeafReaderContext leaf : leaves) {
    LeafReader atomicReader = leaf.reader();
    Terms terms = atomicReader.terms(field);
    if (terms == null){
      continue;
    }
    if (terms != null && termVal != null){        
      TermsEnum te = terms.iterator();
      
      if (te.seekExact(new BytesRef(termVal))){
        postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
        
        int docFreq = te.docFreq();
        
        int minDocId = -1, maxDocId = -1;
        int doc, count = 0;
        
        int[] percentDocs = new int[PERCENTILES.length];
        
        int percentileIdx = 0;
        
        while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          maxDocId = doc;
          if (minDocId == -1) {
            minDocId = doc;
          }
          count ++;
          
          double perDocs = (double) count / (double) docFreq * 100.0;
          while (percentileIdx < percentDocs.length) {
            if (perDocs > PERCENTILES[percentileIdx]) {
              percentDocs[percentileIdx] = doc;
              percentileIdx++;
            } else {
              break;
            }
          }
        }
        
        // calculate histogram          
        int[] buckets = null;
        if (maxDocId > 0) {
          buckets = new int[maxDocId / bucketSize + 1];
          
          postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
          while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            int bucketIdx = doc / bucketSize;
            buckets[bucketIdx]++;
          }
        }
        
        double density = (double) docFreq / (double) (maxDocId - minDocId) ; 
        out.println(String.format("min: %d, max: %d, count: %d, density: %.2f", minDocId, maxDocId, docFreq, density));
        out.println("percentiles: " + Arrays.toString(PERCENTILES) + " => " + Arrays.toString(percentDocs));
        out.println("histogram: (bucketsize=" + bucketSize+")");
        out.println(Arrays.toString(buckets));
      }
    }
  }
}