// Java source code of LuceneLinkTokenizer

package com.github.kno10.wikipediaentities;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

import com.github.kno10.wikipediaentities.util.FastStringReader;
import com.github.kno10.wikipediaentities.util.Util;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectIterator;

/**
 * Tokenize link texts seen in Wikipedia, to build a list of common link texts.
 * Counts how often each normalized link text (label) occurs, and on
 * {@link #close()} writes all labels that reached {@link #MINSUPP} occurrences
 * to the output, sorted and one per line.
 *
 * Each worker thread obtains its own handler via {@link #makeThreadHandler()};
 * the per-handler counts are merged into this object when the handler is
 * closed.
 *
 * @author Erich Schubert
 */
public class LuceneLinkTokenizer {
  /**
   * Merged link text counts of all closed handlers. Guarded by the monitor of
   * this object.
   */
  Object2IntOpenHashMap<String> links = new Object2IntOpenHashMap<>();

  /** Output file name */
  private final String out;

  /** Minimum support to report */
  static final int MINSUPP = 3;

  /**
   * Constructor
   *
   * @param out Output file name
   */
  public LuceneLinkTokenizer(String out) {
    this.out = out;
  }

  /**
   * Make handler for a single thread.
   *
   * @return New handler with its own counters; it must only be used by one
   *         thread, and merges its counts into this object when closed.
   */
  public Handler makeThreadHandler() {
    return new LinkHandler();
  }

  /**
   * Handler that normalizes and counts the link texts it is given. The counts
   * are kept locally and only merged into the enclosing object in
   * {@link #close()}.
   */
  class LinkHandler extends AbstractHandler {
    /** Link texts counted by this handler; {@code null} once closed. */
    Object2IntOpenHashMap<String> links = new Object2IntOpenHashMap<>();

    /** Lucene Wikipedia tokenizer */
    WikipediaTokenizer tokenizer;

    /** Filtered token stream */
    TokenStream stream;

    /** Lucene character term attribute */
    CharTermAttribute termAtt;

    /** Buffer for tokenization */
    StringBuilder buf = new StringBuilder();

    /** String reader */
    FastStringReader reader = new FastStringReader("");

    /**
     * Constructor, setting up the filter chain: Wikipedia tokenizer, classic
     * filter, lower case filter.
     */
    public LinkHandler() {
      tokenizer = new WikipediaTokenizer();
      stream = tokenizer;
      // stream = new PorterStemFilter(stream);
      stream = new ClassicFilter(stream);
      stream = new LowerCaseFilter(stream);
      termAtt = stream.addAttribute(CharTermAttribute.class);
    }

    /**
     * Normalize the link label by tokenizing it and joining the non-empty
     * tokens with single spaces, then count the normalized label. Labels that
     * yield no tokens are ignored. Only {@code label} is used.
     *
     * @param prefix Not used here
     * @param title Not used here
     * @param label Link text to normalize and count
     * @param target Not used here
     */
    @Override
    public void linkDetected(String prefix, String title, String label, String target) {
      // Normalize the link text.
      try {
        buf.setLength(0);
        // NOTE(review): the tokenizer is reset before setReader() rather than
        // finishing the previous run with end()/close(); presumably this is
        // what allows setReader() to be called again - confirm against the
        // Lucene version in use before changing the order of these calls.
        tokenizer.reset();
        tokenizer.setReader(reader.reset(label));
        stream.reset();
        while(stream.incrementToken()) {
          if(termAtt.length() <= 0)
            continue;
          if(buf.length() > 0)
            buf.append(' ');
          buf.append(termAtt.buffer(), 0, termAtt.length());
        }
        if(buf.length() == 0)
          return;
        label = buf.toString();
        links.addTo(label, 1);
      }
      catch(IOException e) { // Should never happen in FastStringReader
        e.printStackTrace();
      }
    }

    /**
     * Merge the counts of this handler into the enclosing object. Calling this
     * more than once is harmless; only the first call merges.
     */
    @Override
    public void close() {
      synchronized(LuceneLinkTokenizer.this) {
        if(links == null) {
          // Already merged. Without this guard a second call would either fail
          // on the null map, or replace the shared map by null.
          return;
        }
        Object2IntOpenHashMap<String> plinks = LuceneLinkTokenizer.this.links;
        if(plinks.size() == 0) {
          // Nothing merged yet: hand over our map instead of copying it.
          LuceneLinkTokenizer.this.links = links;
        }
        else {
          for(ObjectIterator<Object2IntOpenHashMap.Entry<String>> it = links.object2IntEntrySet().fastIterator(); it.hasNext();) {
            Object2IntOpenHashMap.Entry<String> ent = it.next();
            plinks.addTo(ent.getKey(), ent.getIntValue());
          }
        }
        links = null;
      }
    }
  }

  /**
   * Write all link texts with at least {@link #MINSUPP} occurrences to the
   * output, sorted, one per line.
   *
   * @throws IOException if the output cannot be opened or written
   */
  public void close() throws IOException {
    System.err.format("Closing %s output.\n", getClass().getSimpleName());
    PrintStream writer = Util.openOutput(out);
    try {
      // We sort everything here. This is expensive, but makes the output
      // files nicer to use in the future.
      ArrayList<String> keys;
      // Handlers merge into the map while holding this monitor, so read it
      // under the same monitor.
      synchronized(this) {
        keys = new ArrayList<>(links.size());
        for(ObjectIterator<Object2IntOpenHashMap.Entry<String>> it = links.object2IntEntrySet().fastIterator(); it.hasNext();) {
          Object2IntOpenHashMap.Entry<String> ent = it.next();
          if(ent.getIntValue() >= MINSUPP) {
            keys.add(ent.getKey());
          }
        }
      }
      Collections.sort(keys);
      for(String key : keys) {
        writer.append(key);
        writer.append('\n');
      }
      // PrintStream never throws on write errors, it only records them.
      if(writer.checkError()) {
        throw new IOException("Error writing link texts to " + out);
      }
    }
    finally {
      // Release the output even if collecting or writing failed.
      if(writer != System.out)
        writer.close();
    }
  }
}