// Java source code of ConcatenateFilter

/*
  This software was produced for the U. S. Government
  under Contract No. W15P7T-11-C-F600, and is
  subject to the Rights in Noncommercial Computer Software
  and Noncommercial Computer Software Documentation
  Clause 252.227-7014 (JUN 1995)

  Copyright 2013 The MITRE Corporation. All Rights Reserved.

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 */

package org.opensextant.solrtexttagger;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.*;

import java.io.IOException;

/**
 * Concatenates all tokens of the wrapped stream into a single token, joining the terms with a
 * separator character that defaults to a single space. It always produces exactly one token
 * (an empty term if the input yields no tokens), and it's designed to be the last token filter
 * in an analysis chain.
 *
 * <p>The emitted token spans offsets {@code 0} to the input's end-of-stream end offset, has a
 * position increment and position length of 1, and carries the type
 * {@link ShingleFilter#DEFAULT_TOKEN_TYPE}. Position gaps in the input (e.g. removed stop words)
 * are ignored.
 */
public class ConcatenateFilter extends TokenFilter {

  /*
  For a very different approach that could accept synonyms or anything except position gaps (e.g.
  not stopwords),
  consider using o.a.l.analysis.TokenStreamToAutomaton
  with o.a.l.util.automaton.SpecialOperations.getFiniteStrings().
  For gaps (stopwords), we could perhaps index a special token at those gaps and then have the
  tagger deal with them -- also doable.
   */

  /** Character inserted between consecutive input terms. */
  private char separator = ' ';

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /** Reusable buffer in which the concatenated term is assembled. */
  private final StringBuilder buf = new StringBuilder(128);

  /** True once the single concatenated token has been emitted for the current stream. */
  private boolean done;

  /**
   * Attribute state captured immediately after {@code input.end()} was called from
   * {@link #incrementToken()}; {@code null} if the input has not been ended yet.
   */
  private State endState;

  /**
   * Construct a token stream filtering the given input.
   *
   * @param input the stream whose tokens are concatenated
   */
  protected ConcatenateFilter(TokenStream input) {
    super(input);
  }

  /**
   * Sets the character placed between consecutive terms in the concatenated output.
   *
   * @param separator the separator character
   */
  public void setTokenSeparator(char separator) {
    this.separator = separator;
  }

  /**
   * Resets the wrapped stream and re-arms this filter so that it emits its single token again.
   *
   * @throws IOException if resetting the wrapped stream fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    done = false;
    endState = null;
  }

  /**
   * Consumes the entire wrapped stream on the first call and emits one token holding all input
   * terms joined by the separator. Every subsequent call (until {@link #reset()}) returns
   * {@code false}.
   *
   * @return {@code true} for the single concatenated token, {@code false} afterwards
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    done = true;

    buf.setLength(0);
    boolean firstTerm = true;
    while (input.incrementToken()) {
      if (!firstTerm) {
        buf.append(separator);
      }
      //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13
      buf.append(termAtt);
      firstTerm = false;
    }
    input.end();//call here so we can see end of stream offsets
    // Remember the end-of-stream state; end() hands it back to the consumer later, because the
    // attributes are about to be overwritten with the concatenated token.
    endState = captureState();
    final int endOffset = offsetAtt.endOffset();

    // We are producing a brand new token, so start from a clean slate: attributes we do not set
    // below must not leak from the input.
    clearAttributes();
    termAtt.setEmpty().append(buf);
    //Setting the other attributes ultimately won't have much effect but lets be thorough
    offsetAtt.setOffset(0, endOffset);
    posIncrAtt.setPositionIncrement(1);
    posLenAtt.setPositionLength(1);//or do we add up the positions?  Probably not used any way.
    typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle"

    return true;
  }

  /**
   * Exposes the wrapped stream's end-of-stream attribute state to the consumer.
   *
   * <p>{@code input.end()} is normally invoked from {@link #incrementToken()}, so it is not
   * called a second time here; instead the state captured at that point is restored. If the
   * input was never consumed, the call is delegated to the superclass.
   *
   * @throws IOException if ending the wrapped stream fails
   */
  @Override
  public void end() throws IOException {
    if (endState != null) {
      //we already called input.end() in incrementToken
      restoreState(endState);
    } else {
      super.end();
    }
  }
}