Java Code Examples for org.apache.lucene.index.Terms#iterator()

The following examples show how to use org.apache.lucene.index.Terms#iterator(). They are drawn from a range of open-source projects; the originating project and source file are noted above each example.
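All of the examples share one core pattern: obtain a Terms instance for a field, call iterator() to get a TermsEnum, and step through the terms with next() until it returns null. Below is a minimal sketch of that pattern; it assumes a recent Lucene release (MultiTerms.getTerms, as used in several of the examples) and an already-open IndexReader, with the field name left as a parameter. A few of the examples further down come from Lucene 4.x-era projects, where Terms#iterator took a reuse argument and was typically called as terms.iterator(null).

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermsIteratorSketch {

  /** Prints every term of the given field together with its document frequency. */
  static void dumpTerms(IndexReader reader, String field) throws IOException {
    Terms terms = MultiTerms.getTerms(reader, field); // merged view across all segments
    if (terms == null) {
      return; // field does not exist or has no indexed terms
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
    }
  }
}

Per-segment code (as in Examples 1 and 8) gets the Terms from a LeafReaderContext via context.reader().terms(field) instead of the merged view.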
Example 1
Source File: DisjunctionMatchesIterator.java    From lucene-solr with Apache License 2.0
/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}
 *
 * Only terms that have at least one match in the given document will be included
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  Objects.requireNonNull(field);
  Terms t = context.reader().terms(field);
  if (t == null)
    return null;
  TermsEnum te = t.iterator();
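  // reuse a single PostingsEnum across terms to cut down on per-term allocations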
  PostingsEnum reuse = null;
  for (BytesRef term = terms.next(); term != null; term = terms.next()) {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
      }
      else {
        reuse = pe;
      }
    }
  }
  return null;
}
 
Example 2
Source File: BlockTermsWriter.java    From lucene-solr with Apache License 2.0
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {

  for(String field : fields) {

    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }

    TermsEnum termsEnum = terms.iterator();

    TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));

    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }

      termsWriter.write(term, termsEnum, norms);
    }

    termsWriter.finish();
  }
}
 
Example 3
Source File: BM25NBClassifier.java    From lucene-solr with Apache License 2.0
/**
 * Calculate probabilities for all classes for a given input text
 *
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
  List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

  Terms classes = MultiTerms.getTerms(indexReader, classFieldName);
  TermsEnum classesEnum = classes.iterator();
  BytesRef next;
  String[] tokenizedText = tokenize(inputDocument);
  while ((next = classesEnum.next()) != null) {
    if (next.length > 0) {
      Term term = new Term(this.classFieldName, next);
      assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));
    }
  }

  return normClassificationResults(assignedClasses);
}
 
Example 4
Source File: SolrRangeQuery.java    From lucene-solr with Apache License 2.0
public RangeTermsEnum(Terms terms) throws IOException {
  if (terms == null) {
    positioned = true;
  } else {
    te = terms.iterator();
    if (lower != null) {
      TermsEnum.SeekStatus status = te.seekCeil(lower);
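      // seekCeil leaves the enum on the smallest term >= lower:
      // FOUND means an exact match, NOT_FOUND the next greater term, END no such term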
      if (status == TermsEnum.SeekStatus.END) {
        positioned = true;
        curr = null;
      } else if (status == SeekStatus.FOUND) {
        positioned = includeLower();
        curr = te.term();
      } else {
        // lower bound not found, so includeLower is irrelevant
        positioned = true;
        curr = te.term();
      }
    }
  }
}
 
Example 5
Source File: LindenFieldCacheImpl.java    From linden with Apache License 2.0
@Override
protected Accountable createValue(final AtomicReader reader, CacheKey key, boolean setDocsWithField)
    throws IOException {
  final Map<String, Integer> uidMap = new HashMap<>();

  Uninvert u = new Uninvert() {
    private String currentValue;

    @Override
    public void visitTerm(BytesRef term) {
      currentValue = term.utf8ToString();
    }

    @Override
    public void visitDoc(int docID) {
      uidMap.put(currentValue, docID);
    }

    @Override
    protected TermsEnum termsEnum(Terms terms) throws IOException {
      return terms.iterator(null);
    }
  };
  u.uninvert(reader, key.field, setDocsWithField);
  return new PerReaderUIDMaps(reader.getContext().ord, uidMap);
}
 
Example 6
Source File: AbstractFeatureBuilder.java    From jate with GNU Lesser General Public License v3.0
/**
 * Retrieve term candidates from the Solr field
 * (see {@code uk.ac.shef.dcs.jate.JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_TERMS}).
 *
 * The method assumes that the term candidates are extracted at index time and stored in a pre-configured field.
 *
 * @return Set, a set of term candidate surface forms
 * @throws JATEException
 * @throws IOException
 */
protected Set<String> getUniqueTerms() throws JATEException, IOException {
    Terms terms = SolrUtil.getTermVector(properties.getSolrFieldNameJATECTerms(), solrIndexSearcher);

    TermsEnum termsEnum = terms.iterator();
    Set<String> allTermCandidates = new HashSet<>();

    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t.length == 0)
            continue;
        allTermCandidates.add(t.utf8ToString());
    }
    return allTermCandidates;
}
 
Example 7
Source File: FrequencyCtxSentenceBasedFBWorker.java    From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);

        int doc = postingsEnum.nextDoc(); // a term vector covers a single document, so there is at most one doc here
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
 
Example 8
Source File: FeatureSortField.java    From lucene-solr with Apache License 2.0
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  Terms terms = context.reader().terms(field);
  if (terms == null) {
    currentReaderPostingsValues = null;
  } else {
    TermsEnum termsEnum = terms.iterator();
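    // seekExact returns true only if this segment actually contains the feature term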
    if (termsEnum.seekExact(featureName) == false) {
      currentReaderPostingsValues = null;
    } else {
      currentReaderPostingsValues = termsEnum.postings(currentReaderPostingsValues, PostingsEnum.FREQS);
    }
  }
}
 
Example 9
Source File: SecureAtomicReaderTestBase.java    From incubator-retired-blur with Apache License 2.0
private int getTermWithSeekCount(Fields fields, String field) throws IOException {
  Terms terms = fields.terms(field);
  TermsEnum termsEnum = terms.iterator(null);
  SeekStatus seekStatus = termsEnum.seekCeil(new BytesRef(""));
  if (seekStatus == SeekStatus.END) {
    return 0;
  }
  System.out.println(termsEnum.term().utf8ToString());
  int count = 1;
  while (termsEnum.next() != null) {
    count++;
  }
  return count;
}
 
Example 10
Source File: InfiniteLoopCommand.java    From incubator-retired-blur with Apache License 2.0
@Override
public Boolean execute(IndexContext context) throws IOException, InterruptedException {
  try {
    IndexReader indexReader = context.getIndexReader();
    while (true) {
      long hash = 0;
      for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) {
        AtomicReader reader = atomicReaderContext.reader();
        for (String field : reader.fields()) {
          Terms terms = reader.terms(field);
          BytesRef bytesRef;
          TermsEnum iterator = terms.iterator(null);
          while ((bytesRef = iterator.next()) != null) {
            hash += bytesRef.hashCode();
          }
        }
      }
      System.out.println("hashcode = " + hash);
    }
  } catch (IOException e) {
    e.printStackTrace();
    throw e;
  } catch (Throwable t) {
    t.printStackTrace();
    if (t instanceof InterruptedException) {
      throw t;
    } else if (t instanceof RuntimeException) {
      throw (RuntimeException) t;
    }
    throw new RuntimeException(t);
  }
}
 
Example 11
Source File: TestTeeSinkTokenFilter.java    From lucene-solr with Apache License 2.0
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);
  doc.add(f1);
  doc.add(f2);
  w.addDocument(doc);
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();
  termsEnum.next();
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  positions.nextPosition();
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  positions.nextPosition();
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
  r.close();
  dir.close();
  analyzer.close();
}
 
Example 12
Source File: TermVectorsResponse.java    From Elasticsearch with Apache License 2.0
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
 
Example 13
Source File: BlendedInfixSuggester.java    From lucene-solr with Apache License 2.0
/**
 * Create the coefficient to transform the weight.
 *
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {

  Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
  TermsEnum it = tv.iterator();

  Integer position = Integer.MAX_VALUE;
  BytesRef term;
  // find the closest token position
  while ((term = it.next()) != null) {

    String docTerm = term.utf8ToString();

    if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
 
      PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
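      // the term-vector enum covers exactly one document, so the first nextDoc() lands on it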
      docPosEnum.nextDoc();

      // use the first occurrence of the term
      int p = docPosEnum.nextPosition();
      if (p < position) {
        position = p;
      }
    }
  }

  // create corresponding coefficient based on position
  return calculateCoefficient(position);
}
 
Example 14
Source File: KNearestFuzzyClassifierTest.java    From lucene-solr with Apache License 2.0
@Test
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  int numDocs = atLeast(10);
  LeafReader leafReader = getRandomIndex(analyzer, numDocs);
  try {
    Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);

    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
        classifier, categoryFieldName, textFieldName, -1);
    assertNotNull(confusionMatrix);

    double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
    assertTrue(avgClassificationTime >= 0);

    double accuracy = confusionMatrix.getAccuracy();
    assertTrue(accuracy >= 0d);
    assertTrue(accuracy <= 1d);

    double recall = confusionMatrix.getRecall();
    assertTrue(recall >= 0d);
    assertTrue(recall <= 1d);

    double precision = confusionMatrix.getPrecision();
    assertTrue(precision >= 0d);
    assertTrue(precision <= 1d);

    Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = iterator.next()) != null) {
      String s = term.utf8ToString();
      recall = confusionMatrix.getRecall(s);
      assertTrue(recall >= 0d);
      assertTrue(recall <= 1d);
      precision = confusionMatrix.getPrecision(s);
      assertTrue(precision >= 0d);
      assertTrue(precision <= 1d);
      double f1Measure = confusionMatrix.getF1Measure(s);
      assertTrue(f1Measure >= 0d);
      assertTrue(f1Measure <= 1d);
    }
  } finally {
    leafReader.close();
  }
}
 
Example 15
Source File: TestPrefixRandom.java    From lucene-solr with Apache License 2.0
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  return new SimplePrefixTermsEnum(terms.iterator(), prefix);
}
 
Example 16
Source File: IGainTermsQParserPlugin.java    From lucene-solr with Apache License 2.0
@Override
public void finish() throws IOException {
  NamedList<Double> analytics = new NamedList<Double>();
  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> topFreq = new NamedList();

  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> allFreq = new NamedList();

  rb.rsp.add("featuredTerms", analytics);
  rb.rsp.add("docFreq", topFreq);
  rb.rsp.add("numDocs", count);

  TreeSet<TermWithScore> topTerms = new TreeSet<>();

  double numDocs = count;
  double pc = numPositiveDocs / numDocs;
  double entropyC = binaryEntropy(pc);

  Terms terms = ((SolrIndexSearcher)searcher).getSlowAtomicReader().terms(field);
  TermsEnum termsEnum = terms == null ? TermsEnum.EMPTY : terms.iterator();
  BytesRef term;
  PostingsEnum postingsEnum = null;
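  // for each term, count how many of its documents fall in the positive (xc) and negative (nc) sets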
  while ((term = termsEnum.next()) != null) {
    postingsEnum = termsEnum.postings(postingsEnum);
    int xc = 0;
    int nc = 0;
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (positiveSet.get(postingsEnum.docID())) {
        xc++;
      } else if (negativeSet.get(postingsEnum.docID())) {
        nc++;
      }
    }

    int docFreq = xc+nc;

    double entropyContainsTerm = binaryEntropy( (double) xc / docFreq );
    double entropyNotContainsTerm = binaryEntropy( (double) (numPositiveDocs - xc) / (numDocs - docFreq + 1) );
    double score = entropyC - ( (docFreq / numDocs) * entropyContainsTerm + (1.0 - docFreq / numDocs) * entropyNotContainsTerm);

    topFreq.add(term.utf8ToString(), docFreq);
    if (topTerms.size() < numTerms) {
      topTerms.add(new TermWithScore(term.utf8ToString(), score));
    } else  {
      if (topTerms.first().score < score) {
        topTerms.pollFirst();
        topTerms.add(new TermWithScore(term.utf8ToString(), score));
      }
    }
  }

  for (TermWithScore topTerm : topTerms) {
    analytics.add(topTerm.term, topTerm.score);
    topFreq.add(topTerm.term, allFreq.get(topTerm.term));
  }

  if (this.delegate instanceof DelegatingCollector) {
    ((DelegatingCollector) this.delegate).finish();
  }
}
 
Example 17
Source File: AlfrescoLukeRequestHandler.java    From SearchServices with GNU Lesser General Public License v3.0
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req,
		String field, SimpleOrderedMap<Object> fieldMap) throws IOException {

	SolrParams params = req.getParams();
	final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

	TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to
														// collect the top N
														// terms in.

	final CharsRefBuilder spare = new CharsRefBuilder();

	Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(),
			field);
	if (terms == null) { // field does not exist
		return;
	}
	TermsEnum termsEnum = terms.iterator();
	BytesRef text;
	int[] buckets = new int[HIST_ARRAY_SIZE];
	while ((text = termsEnum.next()) != null) {
		++tiq.distinctTerms;
		int freq = termsEnum.docFreq(); // This calculation seems odd, but
										// it gives the same results as it
										// used to.
		int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
		buckets[slot] = buckets[slot] + 1;
		if (numTerms > 0 && freq > tiq.minFreq) {
			spare.copyUTF8Bytes(text);
			String t = spare.toString();

			tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum
					.docFreq()));
			if (tiq.size() > numTerms) { // if tiq full
				tiq.pop(); // remove lowest in tiq
				tiq.minFreq = tiq.getTopTermInfo().docFreq;
			}
		}
	}
	tiq.histogram.add(buckets);
	fieldMap.add("distinct", tiq.distinctTerms);

	// Include top terms
	fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

	// Add a histogram
	fieldMap.add("histogram", tiq.histogram.toNamedList());
}
 
Example 18
Source File: ClusteringKeyQuery.java    From stratio-cassandra with Apache License 2.0
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    return new FullKeyDataRangeFilteredTermsEnum(terms.iterator(null));
}
 
Example 19
Source File: CachingNaiveBayesClassifierTest.java    From lucene-solr with Apache License 2.0
@Test
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  int numDocs = atLeast(10);
    LeafReader leafReader = getRandomIndex(analyzer, numDocs);
  try {
    CachingNaiveBayesClassifier simpleNaiveBayesClassifier = new CachingNaiveBayesClassifier(leafReader,
        analyzer, null, categoryFieldName, textFieldName);

    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
        simpleNaiveBayesClassifier, categoryFieldName, textFieldName, -1);
    assertNotNull(confusionMatrix);

    double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
    assertTrue(avgClassificationTime >= 0);
    double accuracy = confusionMatrix.getAccuracy();
    assertTrue(accuracy >= 0d);
    assertTrue(accuracy <= 1d);

    double recall = confusionMatrix.getRecall();
    assertTrue(recall >= 0d);
    assertTrue(recall <= 1d);

    double precision = confusionMatrix.getPrecision();
    assertTrue(precision >= 0d);
    assertTrue(precision <= 1d);

    Terms terms = MultiTerms.getTerms(leafReader, categoryFieldName);
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = iterator.next()) != null) {
      String s = term.utf8ToString();
      recall = confusionMatrix.getRecall(s);
      assertTrue(recall >= 0d);
      assertTrue(recall <= 1d);
      precision = confusionMatrix.getPrecision(s);
      assertTrue(precision >= 0d);
      assertTrue(precision <= 1d);
      double f1Measure = confusionMatrix.getF1Measure(s);
      assertTrue(f1Measure >= 0d);
      assertTrue(f1Measure <= 1d);
    }
  } finally {
    leafReader.close();
  }

}
 
Example 20
Source File: TermFreqAnalyser.java    From Siamese with GNU General Public License v3.0
private static void analyseTermFreq(String indexName, String field, String freqType, String outputFileName) {
        String indexFile = elasticsearchLoc + "/data/stackoverflow/nodes/0/indices/"
                + indexName + "/0/index";
        DecimalFormat df = new DecimalFormat("#.00");
        int printEvery = 100000;
        File outputFile = new File(outputFileName);
        if (outputFile.exists()) {
            if (!outputFile.delete()) {
                System.out.println("ERROR: cannot delete the output file.");
                System.exit(0);
            }
        }
        /* adapted from
        https://stackoverflow.com/questions/28244961/lucene-4-10-2-calculate-tf-idf-for-all-terms-in-index
         */
        int count = 0;
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexFile)));
            Fields fields = MultiFields.getFields(reader);
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            int size = 0;
            // TODO: is there a better solution?
            // iterate to get the size
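            // (Terms#size() may legally return -1 when the count is unknown, hence the manual pass)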
            while (termsEnum.next() != null) {
                size++;
            }
            long[] freqArr = new long[size];
            // do the real work
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                long tfreq = 0;
                if (freqType.equals("tf"))
                    tfreq = termsEnum.totalTermFreq();
                else if (freqType.equals("df"))
                    tfreq = termsEnum.docFreq();
                else {
                    System.out.println("Wrong frequency. Quit!");
                    System.exit(0);
                }
                freqArr[count] = tfreq;
                if (count % printEvery == 0) {
                    System.out.println("processed: " + count + " terms "
                            + " [" + df.format(((long)count * 100)/size) + "%]");
                }
                count++;
            }
            System.out.println(field + ": total = " + count);
            double[] data = new double[size];
            String output = "freq\n";
            for (int i = 0; i < freqArr.length; i++) {
                data[i] = freqArr[i];
                output += freqArr[i] + "\n";
                if (i > 0 && i % printEvery == 0) {
                    MyUtils.writeToFile("./", outputFileName, output, true);
                    System.out.println("written: " + i + " terms "
                            + " [" + df.format(((long)i * 100)/size) + "%]");
                    output = "";
                }
            }
            // write the rest to the file
            MyUtils.writeToFile("./", outputFileName, output, true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }