Java Code Examples for org.apache.lucene.index.TermsEnum#totalTermFreq()

The following examples show how to use org.apache.lucene.index.TermsEnum#totalTermFreq(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: ReadmeSimilarityCalculator.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Builds one term-frequency vector per document id in the index, reading the term
 * vector stored for {@code FIELD_CONTENT}, and normalizes each vector that has terms.
 * The index reader is closed once all documents have been processed.
 *
 * @return an array indexed by document id; a document without a stored term vector
 *         gets a vector with no entries set
 * @throws IOException if reading from the index fails
 */
private DocVector[] getDocumentVectors() throws IOException {
	// The document count is loop-invariant, so ask for it once.
	final int totalDocs = getTotalDocumentInIndex();
	DocVector[] docVector = new DocVector[totalDocs];
	for (int docId = 0; docId < totalDocs; docId++) {
		Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
		docVector[docId] = new DocVector(getAllTerms());
		if (vector == null) {
			// No term vector is stored for this document/field: there is nothing to
			// iterate or to normalize, so keep the vector without entries instead of
			// failing with a NullPointerException.
			continue;
		}
		TermsEnum termsEnum = vector.iterator();
		BytesRef text;
		while ((text = termsEnum.next()) != null) {
			String term = text.utf8ToString();
			// Within a single-document term vector this is the in-document frequency.
			int freq = (int) termsEnum.totalTermFreq();
			docVector[docId].setEntry(term, freq);
		}
		docVector[docId].normalize();
	}
	getIndexReader().close();
	return docVector;
}
 
Example 2
Source File: TermVectorEntry.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a term vector entry for the term the given iterator is currently positioned on.
 * Positions are collected when they are available for that term.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);

  final String decodedTerm = BytesRefUtils.decode(te.term());

  final List<TermVectorEntry.TermVectorPosition> collected = new ArrayList<>();
  final PostingsEnum postings = te.postings(null, PostingsEnum.OFFSETS);
  postings.nextDoc();

  int remaining = postings.freq();
  while (remaining-- > 0) {
    final int position = postings.nextPosition();
    // a negative position means no position information is available; skip it
    if (position >= 0) {
      collected.add(TermVectorPosition.of(position, postings));
    }
  }

  return new TermVectorEntry(decodedTerm, te.totalTermFreq(), collected);
}
 
Example 3
Source File: MoreLikeThis.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Accumulates the term frequencies of one term vector into the per-field frequency map.
 * Terms rejected by {@code isNoiseWord} are skipped.
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName the field whose inner map receives the frequencies; the inner map is
 *                  created on first use
 */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
  final Map<String, Int> freqsForField = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
  final TermsEnum iterator = vector.iterator();
  final CharsRefBuilder scratch = new CharsRefBuilder();
  for (BytesRef bytes = iterator.next(); bytes != null; bytes = iterator.next()) {
    scratch.copyUTF8Bytes(bytes);
    final String word = scratch.toString();
    if (!isNoiseWord(word)) {
      final int occurrences = (int) iterator.totalTermFreq();

      // add to the running count, or start a new counter for a word seen for the first time
      final Int counter = freqsForField.get(word);
      if (counter != null) {
        counter.x += occurrences;
      } else {
        final Int fresh = new Int();
        freqsForField.put(word, fresh);
        fresh.x = occurrences;
      }
    }
  }
}
 
Example 4
Source File: DocToDoubleVectorUtils.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a sparse <code>Double</code> vector with one slot per term of the field, holding the
 * local frequency of that term in the document, or zero when the document lacks the term.
 *
 * @param docTerms   term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of <code>Double</code>s as an array, or <code>null</code> when
 *         <code>docTerms</code> is <code>null</code> or the field's term count is not above -1
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  final TermsEnum fieldIterator = fieldTerms.iterator();
  if (docTerms == null || fieldTerms.size() <= -1) {
    return null;
  }

  final Double[] frequencies = new Double[(int) fieldTerms.size()];
  TermsEnum docIterator = docTerms.iterator();
  int slot = 0;
  for (BytesRef fieldTerm = fieldIterator.next(); fieldTerm != null; fieldTerm = fieldIterator.next()) {
    final TermsEnum.SeekStatus status = docIterator.seekCeil(fieldTerm);
    if (status == TermsEnum.SeekStatus.END) {
      // the seek ran off the end of the document's terms; start over with a fresh iterator
      docIterator = docTerms.iterator();
    }
    // on an exact match, take the number of occurrences of this term in the given document
    frequencies[slot++] = (status == TermsEnum.SeekStatus.FOUND)
        ? (double) docIterator.totalTermFreq()
        : 0d;
  }
  return frequencies;
}
 
Example 5
Source File: DocToDoubleVectorUtils.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a dense <code>Double</code> vector holding, in iteration order, the local frequency of
 * every term of the given document term vector.
 *
 * @param docTerms term vectors for a given document
 * @return a dense vector of <code>Double</code>s as an array, or <code>null</code> when
 *         <code>docTerms</code> is <code>null</code>
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException {
  if (docTerms == null) {
    return null;
  }

  final Double[] frequencies = new Double[(int) docTerms.size()];
  final TermsEnum iterator = docTerms.iterator();
  for (int slot = 0; iterator.next() != null; slot++) {
    // the total number of occurrences of the current term in the given document
    frequencies[slot] = (double) iterator.totalTermFreq();
  }
  return frequencies;
}
 
Example 6
Source File: QERetrievalApp.java    From lucene4ir with Apache License 2.0 6 votes vote down vote up
/**
 * Combines the individual term vectors of each document into a single map keyed by term text.
 * A term that occurs in several vectors is merged through {@code QETerm.combine}.
 * An {@link IOException} raised while reading one vector is printed and processing moves on
 * to the next vector.
 *
 * @param terms the term vectors to merge, one per document
 * @return a map from term text to its combined {@code QETerm}
 */
public HashMap<String, QETerm> combineTerms(Vector<Terms> terms){
    final HashMap<String, QETerm> merged = new HashMap<String, QETerm>();
    final int vectorCount = terms.size();
    for (Terms docTerms : terms) {
        try {
            final TermsEnum iterator = docTerms.iterator();
            for (BytesRef bytes = iterator.next(); bytes != null; bytes = iterator.next()) {
                final String text = bytes.utf8ToString();
                final QETerm current = new QETerm(text, iterator.totalTermFreq(), iterator.docFreq(), vectorCount);
                if (!merged.containsKey(text)) {
                    merged.put(text, current);
                    continue;
                }
                // already seen: fold the stored entry into the new one and store the result
                final QETerm folded = current.combine(merged.get(text));
                merged.replace(text, folded);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return merged;
}
 
Example 7
Source File: SimpleTextTermVectorsReader.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Sums {@code totalTermFreq()} over every term produced by {@code iterator()}.
 */
@Override
public long getSumTotalTermFreq() throws IOException {
  // TODO: make it constant-time
  final TermsEnum termsEnum = iterator();
  long sum = 0;
  while (termsEnum.next() != null) {
    sum += termsEnum.totalTermFreq();
  }
  return sum;
}
 
Example 8
Source File: BooleanPerceptronClassifier.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Recomputes the weight of every term in the document's term vector and stores it in
 * {@code weights}; optionally rebuilds the FST from the updated weights afterwards.
 *
 * @param indexReader   reader used to fetch the document's term vector
 * @param docId         id of the document whose terms are processed
 * @param assignedClass when {@code null}, no weight is changed
 * @param weights       map receiving the new weight per term text
 * @param modifier      factor multiplied with the term's frequency in the document
 * @param updateFST     whether {@code updateFST(weights)} is called at the end
 * @throws IOException if the document has no term vector for the text field, or on a
 *                     low-level read error
 */
private void updateWeights(IndexReader indexReader,
                           int docId, Boolean assignedClass, SortedMap<String, Double> weights,
                           double modifier, boolean updateFST) throws IOException {
  final TermsEnum fieldTermsEnum = textTerms.iterator();

  // get the doc term vectors
  final Terms docVector = indexReader.getTermVector(docId, textFieldName);
  if (docVector == null) {
    throw new IOException("term vectors must be stored for field " + textFieldName);
  }

  final TermsEnum docTermsEnum = docVector.iterator();
  for (BytesRef term = docTermsEnum.next(); term != null; term = docTermsEnum.next()) {
    // the result of this seek is not used here
    fieldTermsEnum.seekExact(term);
    if (assignedClass == null) {
      continue;
    }

    final long localFreq = docTermsEnum.totalTermFreq();
    // update weights
    final Long storedWeight = Util.get(fst, term);
    final String key = term.utf8ToString();
    final double newWeight;
    if (storedWeight == null) {
      newWeight = 0;
    } else {
      // weights never drop below zero
      newWeight = Math.max(0, storedWeight + modifier * localFreq);
    }
    weights.put(key, newWeight);
  }

  if (updateFST) {
    updateFST(weights);
  }
}
 
Example 9
Source File: TermInSetQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
TermAndState(String field, TermsEnum termsEnum) throws IOException {
  this.field = field;
  this.termsEnum = termsEnum;
  this.term = BytesRef.deepCopyOf(termsEnum.term());
  this.state = termsEnum.termState();
  this.docFreq = termsEnum.docFreq();
  this.totalTermFreq = termsEnum.totalTermFreq();
}
 
Example 10
Source File: GraphTermsQParserPlugin.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up every query term in every leaf and records its term state and statistics in the
 * slot of {@code contextArray} that has the same index as the term in {@code queryTerms}.
 *
 * @param reader       top-level reader whose context is used when a new {@code TermStates} is created
 * @param leaves       leaf contexts to search
 * @param contextArray per-term results; a slot is created on the first hit and extended on later hits
 * @param queryTerms   terms to look up
 * @throws IOException if reading from a leaf fails
 */
private void collectTermStates(IndexReader reader,
                               List<LeafReaderContext> leaves,
                               TermStates[] contextArray,
                               Term[] queryTerms) throws IOException {
  for (LeafReaderContext leaf : leaves) {
    final Terms fieldTerms = leaf.reader().terms(this.field);
    if (fieldTerms == null) {
      // field does not exist
      continue;
    }

    final TermsEnum leafEnum = fieldTerms.iterator();
    if (leafEnum == TermsEnum.EMPTY) {
      continue;
    }

    for (int idx = 0; idx < queryTerms.length; idx++) {
      final Term wanted = queryTerms[idx];
      final TermStates existing = contextArray[idx];
      if (!leafEnum.seekExact(wanted.bytes())) {
        // term is absent from this leaf
        continue;
      }

      if (existing != null) {
        existing.register(leafEnum.termState(), leaf.ord,
            leafEnum.docFreq(), leafEnum.totalTermFreq());
      } else {
        contextArray[idx] = new TermStates(reader.getContext(),
            leafEnum.termState(), leaf.ord, leafEnum.docFreq(),
            leafEnum.totalTermFreq());
      }
    }
  }
}
 
Example 11
Source File: CodecCollector.java    From mtas with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the total term frequency and document frequency of the current term straight from
 * the terms enum. This only succeeds when the reader reports no live-docs bitset and the
 * total term frequency is greater than -1.
 *
 * @param termsEnum
 *          the terms enum
 * @param r
 *          the leaf reader, consulted for its live docs
 * @return the termvector number basic
 * @throws IOException
 *           when the reader has live docs or the total term frequency is not above -1
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    TermsEnum termsEnum, LeafReader r) throws IOException {
  final TermvectorNumberBasic stats = new TermvectorNumberBasic();
  if (r.getLiveDocs() != null) {
    throw new IOException("should not call this");
  }

  stats.valueSum[0] = termsEnum.totalTermFreq();
  stats.docNumber = termsEnum.docFreq();
  if (stats.valueSum[0] <= -1) {
    throw new IOException("should not call this");
  }
  return stats;
}
 
Example 12
Source File: TermVectorComponent.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Writes the contents of one field's term vector into {@code docNL}.
 * A new list is added under {@code field}; for every term of {@code termsEnum} it receives
 * an entry that, depending on {@code fieldOptions}, holds "tf", "positions", "offsets",
 * "payloads", "df" and "tf-idf".
 *
 * @param docNL        per-document list that receives the new per-field list
 * @param fieldOptions flags selecting which pieces of information are emitted
 * @param reader       reader used to look up the document frequency of each term
 * @param docID        not used in this method body
 * @param termsEnum    iterator over the terms of the term vector
 * @param field        field name, used as the key in {@code docNL} and for the df lookup
 * @throws IOException if reading terms, postings or document frequencies fails
 */
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  // passed back into postings() on every iteration
  PostingsEnum dpEnum = null;
  while((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    // also bounds the position loop below, so it is read even when "tf" is not emitted
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq == true) {
      termInfo.add("tf", freq);
    }

    // request only the postings features that were asked for
    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    // advance to the first document before positions are read
    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      // the sub-lists are created lazily so that empty ones are never added to termInfo
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        // nextPosition() is called on every pass, even when positions are not emitted;
        // the offset and payload reads below follow it
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        // -1 stands for "offsets not requested"; negative values are not emitted
        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          // payload bytes are emitted Base64-encoded
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }
    
    // the document frequency is looked up once and shared by "df" and "tf-idf"
    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    // NOTE(review): plain freq / df as a double; a df of 0 yields a non-finite value
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
 
Example 13
Source File: TermFreqAnalyser.java    From Siamese with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Dumps one frequency value per term of a field to a text file.
 * The output starts with a {@code freq} header line, followed by one number per line.
 * Any existing output file is deleted first. Progress is printed to standard output.
 *
 * @param indexName      name of the index directory below the Elasticsearch data location
 * @param field          field whose terms are analysed
 * @param freqType       {@code "tf"} for total term frequency or {@code "df"} for document
 *                       frequency; any other value prints an error and exits the JVM once
 *                       the first term is reached
 * @param outputFileName name of the file the frequencies are appended to
 */
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
        String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
                + indexName + "/0/index";
        DecimalFormat df = new DecimalFormat("#.00");
        int printEvery = 100000;
        File outputFile = new File(outputFileName);
        if (outputFile.exists() && !outputFile.delete()) {
            System.out.println("ERROR: cannot delete the output file.");
            System.exit(0);
        }
        /* adapted from
        https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index
         */
        // try-with-resources: the directory and the reader were previously never closed
        try (FSDirectory directory = FSDirectory.open(Paths.get(indexFile));
             IndexReader reader = DirectoryReader.open(directory)) {
            Fields fields = MultiFields.getFields(reader);
            Terms terms = (fields == null) ? null : fields.terms(field);
            if (terms == null) {
                // a field that is absent from the index used to end in a NullPointerException
                System.out.println("ERROR: no terms found for field '" + field + "'.");
                return;
            }

            // TODO: is there a better solution?
            // first pass: iterate only to learn how many terms there are
            TermsEnum termsEnum = terms.iterator();
            int size = 0;
            while (termsEnum.next() != null) {
                size++;
            }

            // second pass: collect the requested frequency of every term
            long[] freqArr = new long[size];
            int count = 0;
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                long tfreq = 0;
                // constant-first equals() so a null freqType is reported as wrong, not an NPE
                if ("tf".equals(freqType)) {
                    tfreq = termsEnum.totalTermFreq();
                } else if ("df".equals(freqType)) {
                    tfreq = termsEnum.docFreq();
                } else {
                    System.out.println("Wrong frequency. Quit!");
                    System.exit(0);
                }
                freqArr[count] = tfreq;
                if (count % printEvery == 0) {
                    // floating-point division keeps the fraction the "#.00" format displays
                    System.out.println("processed: " + count + " terms "
                            + " [" + df.format((count * 100.0) / size) + "%]");
                }
                count++;
            }
            System.out.println(field + ": total = " + count);

            // write in chunks of printEvery lines so the buffer stays small
            StringBuilder output = new StringBuilder("freq\n");
            for (int i = 0; i < freqArr.length; i++) {
                output.append(freqArr[i]).append('\n');
                if (i > 0 && i % printEvery == 0) {
                    MyUtils.writeToFile("./", outputFileName, output.toString(), true);
                    System.out.println("written: " + i + " terms "
                            + " [" + df.format((i * 100.0) / size) + "%]");
                    output.setLength(0);
                }
            }
            // write the rest to the file
            MyUtils.writeToFile("./", outputFileName, output.toString(), true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }