package lucene4ir.BiGramGenerator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

import javax.xml.bind.JAXB;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by leif on 21/08/2016.
 */
public class DumpTermsApp {

    public String indexName;
    public IndexReader reader;

    public DumpTermsApp() {
        System.out.println("DumpTerms");
        /*
        Shows a number of routines to access various term, document and
        collection statistics.

        Assumes the index has a docnum (i.e. TREC doc id), title and
        content field.
        */
        indexName = "";
        reader = null;
    }

    public void readParamsFromFile(String indexParamFile) {
        try {
            DumpTermsParams p = JAXB.unmarshal(new File(indexParamFile), DumpTermsParams.class);
            indexName = p.indexName;
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
            System.exit(1);
        }
    }

    public void openReader() {
        try {
            reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexName)));
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

    public void numSegments() {
        // How do we get the number of segments?
        int segments = reader.leaves().size();
        System.out.println("Number of Segments in Index: " + segments);
        // You can use a writer to force merge, and then you will only
        // have one segment.
        // The maximum number of documents in a Lucene index is approx. 2 billion;
        // you need to go to Solr or Elasticsearch for bigger collections -
        // Solr/ES use sharding.
    }
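    /*
     * A minimal sketch (not wired into this app) of the force merge mentioned
     * above, assuming the index directory is writable. IndexWriter.forceMerge(1)
     * merges all segments down to a single segment:
     *
     *   IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
     *   try (IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get(indexName)), cfg)) {
     *       writer.forceMerge(1); // expensive on large indexes
     *   }
     */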
    public void termsList(String field) throws IOException {
        // Again, we'll just look at the first segment. Terms dictionaries
        // for different segments may well be different, as they depend on
        // the individual documents that have been added.
        LeafReader leafReader = reader.leaves().get(0).reader();
        System.out.println(reader.leaves().size());

        Terms terms = leafReader.terms(field);
        // The Terms object gives us some segment-level stats for this field.
        System.out.println("Number of docs with this field: " + terms.getDocCount());

        TermsEnum te = terms.iterator();
        BytesRef term;
        while ((term = te.next()) != null) {
            System.out.println(term.utf8ToString() + " DF: " + te.docFreq() + " CF: " + te.totalTermFreq());
        }
    }

    public void getGramList(String field,
                            HashMap<String, BiGramGenerator.queryInfo> biGramMap,
                            int biCutoff,
                            HashMap<String, BiGramGenerator.queryInfo> uniGramMap) throws IOException {
        LeafReader leafReader = reader.leaves().get(0).reader();
        String currentTerm;
        long currentFreq;
        BiGramGenerator.queryInfo currentQryInfo;

        Terms terms = leafReader.terms(field);
        TermsEnum te = terms.iterator();
        BytesRef term;
        while ((term = te.next()) != null) {
            currentTerm = term.utf8ToString().trim();
            if (!currentTerm.contains("_")) {
                // queryInfo is an inner class of BiGramGenerator, so it needs
                // an enclosing instance.
                currentQryInfo = new BiGramGenerator("").new queryInfo();
                currentFreq = te.totalTermFreq();
                currentQryInfo.collFreq = currentFreq;
                // Terms containing a space are bigrams; keep only those at or
                // above the cutoff. Everything else is a unigram.
                if (currentTerm.contains(" ") && currentFreq >= biCutoff)
                    biGramMap.put(currentTerm, currentQryInfo);
                else if (!currentTerm.contains(" "))
                    uniGramMap.put(currentTerm, currentQryInfo);
            }
        }
    }

    public void termStats(String termText) throws IOException {
        /*
        How to get the collection-wide term frequency and document frequency
        of a term.
        */
        Term termInstance = new Term("content", termText);
        long termFreq = reader.totalTermFreq(termInstance);
        long docFreq = reader.docFreq(termInstance);
        System.out.println("Term: " + termText + ", Term Freq. = " + termFreq + ", Doc Freq. = " + docFreq);
    }
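    /*
     * A minimal sketch (not called anywhere in this app) of going one level
     * deeper than termStats(): walking the postings of a term to get its
     * within-document frequencies. Assumes the "content" field is indexed
     * with frequencies, and looks only at the first segment as above:
     *
     *   TermsEnum it = reader.leaves().get(0).reader().terms("content").iterator();
     *   if (it.seekExact(new BytesRef(termText))) {
     *       PostingsEnum postings = it.postings(null);
     *       while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
     *           System.out.println("doc " + postings.docID() + " tf " + postings.freq());
     *       }
     *   }
     */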
    public void reportCollectionStatistics() throws IOException {
        IndexSearcher searcher = new IndexSearcher(reader);
        printFieldStatistics(searcher, lucene4ir.Lucene4IRConstants.FIELD_ALL, "ALL");
        printFieldStatistics(searcher, lucene4ir.Lucene4IRConstants.FIELD_TITLE, "TITLE");
        printFieldStatistics(searcher, lucene4ir.Lucene4IRConstants.FIELD_CONTENT, "CONTENT");
    }

    private void printFieldStatistics(IndexSearcher searcher, String field, String label) throws IOException {
        CollectionStatistics collectionStats = searcher.collectionStatistics(field);
        long token_count = collectionStats.sumTotalTermFreq();
        long doc_count = collectionStats.docCount();
        long sum_doc_count = collectionStats.sumDocFreq();
        long avg_doc_length = token_count / doc_count; // integer division truncates
        System.out.println(label + ": Token count: " + token_count + " Doc Count: " + doc_count
                + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
    }

    public void extractBigramsFromStoredText() throws IOException {
        HashMap<String, Integer> hmap = new HashMap<String, Integer>();
        Analyzer a = new StandardAnalyzer();
        int n = reader.maxDoc();
        for (int i = 0; i < n; i++) {
            Document doc = reader.document(i);
            String all = doc.get(lucene4ir.Lucene4IRConstants.FIELD_ALL);
            TokenStream ts = a.tokenStream(null, all);
            ts.reset();
            String w1 = "";
            String w2 = "";
            while (ts.incrementToken()) {
                // Slide the bigram window: the previous token becomes w1,
                // the current token becomes w2.
                w1 = w2;
                w2 = ts.getAttribute(CharTermAttribute.class).toString();
                if (!w1.isEmpty()) {
                    String key = w1 + " " + w2;
                    if (hmap.containsKey(key)) {
                        hmap.put(key, hmap.get(key) + 1);
                    } else {
                        hmap.put(key, 1);
                    }
                }
            }
            ts.end();
            ts.close();
        }
        // Print every bigram that occurs more than twice.
        for (Map.Entry<String, Integer> me : hmap.entrySet()) {
            if (me.getValue() > 2) {
                System.out.println(me.getKey() + ": " + me.getValue());
            }
        }
    }

    public static void main(String[] args) throws IOException {
        String statsParamFile = "";
        try {
            statsParamFile = args[0];
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
            System.exit(1);
        }

        DumpTermsApp dtApp = new DumpTermsApp();
        dtApp.readParamsFromFile(statsParamFile);
        dtApp.openReader();
        dtApp.reportCollectionStatistics();
        dtApp.termsList(lucene4ir.Lucene4IRConstants.FIELD_ALL);
    }
}

class DumpTermsParams {
    public String indexName;
}
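/*
 * A hypothetical example of the parameter file consumed by readParamsFromFile(),
 * assuming the structure implied by DumpTermsParams (a single indexName element
 * mapped onto the public field of the same name):
 *
 *   <dumpTermsParams>
 *       <indexName>/path/to/lucene/index</indexName>
 *   </dumpTermsParams>
 */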