java source code of Tagger

/*
  This software was produced for the U. S. Government
  under Contract No. W15P7T-11-C-F600, and is
  subject to the Rights in Noncommercial Computer Software
  and Noncommercial Computer Software Documentation
  Clause 252.227-7014 (JUN 1995)

  Copyright 2013 The MITRE Corporation. All Rights Reserved.

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 */

package org.opensextant.solrtexttagger;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Tags maximum string of words in a corpus.  This is a callback-style API
 * in which you implement {@link #tagCallback(int, int, Object)}.
 *
 * This class should be independently usable outside Solr.
 */
public abstract class Tagger {
  private final Logger log = LoggerFactory.getLogger(Tagger.class);

  private final TokenStream tokenStream;
  private final TermToBytesRefAttribute byteRefAtt;
  private final PositionIncrementAttribute posIncAtt;
  private final OffsetAttribute offsetAtt;
  private final TaggingAttribute taggingAtt;

  private final TagClusterReducer tagClusterReducer;
  private final Terms terms;
  private final Bits liveDocs;
  private final boolean skipAltTokens;
  private final boolean ignoreStopWords;

  private Map<BytesRef, IntsRef> docIdsCache;

  /** Whether the WARNING about skipped tokens was already logged. */
  private boolean loggedSkippedAltTokenWarning = false;

  public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
                TagClusterReducer tagClusterReducer, boolean skipAltTokens,
                boolean ignoreStopWords) throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();

    this.tagClusterReducer = tagClusterReducer;
  }

  public void enableDocIdsCache(int initSize) {
    if (initSize > 0)
      docIdsCache = new HashMap<>(initSize);
  }

  public void process() throws IOException {
    if (terms == null)
      return;

    //a shared pointer to the head used by this method and each Tag instance.
    final TagLL[] head = new TagLL[1];

    TermPrefixCursor cursor = null;//re-used

    //boolean switch used to log warnings in case tokens where skipped during tagging.
    boolean skippedTokens = false;

    while (tokenStream.incrementToken()) {
      if (log.isTraceEnabled()) {
        log.trace("Token: {}, posInc: {},  offset: [{},{}]",
                byteRefAtt, posIncAtt.getPositionIncrement(),
                offsetAtt.startOffset(), offsetAtt.endOffset());
      }
      //check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
      if (posIncAtt.getPositionIncrement() < 1) {
        //(a) Deal with this as a configuration issue and throw an exception
        if (!skipAltTokens) {
          //TODO throw UnsupportedTokenException when PhraseBuilder is ported
          throw new IllegalStateException("Query Analyzer generates alternate "
              + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
              + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
              + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
              + "' might result in wrong tagging results if the index time analyzer "
              + "is not configured accordingly. For detailed information see "
              + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
        } else {
          //(b) In case the index time analyser had indexed all variants (users
          //    need to ensure that) processing of alternate tokens can be skipped
          //    as anyways all alternatives will be contained in the FST.
          skippedTokens = true;
          log.trace("  ... ignored token");
          continue;
        }
      }
      //-- If PositionIncrement > 1 (stopwords)
      if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
        log.trace("   - posInc > 1 ... mark cluster as done");
        advanceTagsAndProcessClusterIfDone(head, null);
      }

      final BytesRef term;
      //NOTE: we need to lookup tokens if
      // * the LookupAtt is true OR
      // * there are still advancing tags (to find the longest possible match)
      if(taggingAtt.isTaggable() || head[0] != null){
        //-- Lookup the term id from the next token
        term = byteRefAtt.getBytesRef();
        if (term.length == 0) {
          throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token");
        }
      } else { //no current cluster AND lookup == false ...
        term = null; //skip this token
      }

      //-- Process tag
      advanceTagsAndProcessClusterIfDone(head, term);

      //-- only create new Tags for Tokens we need to lookup
      if (taggingAtt.isTaggable() && term != null) {

        //determine if the terms index has a term starting with the provided term
        // TODO create a pool of these cursors to reuse them more?  could be trivial impl
        if (cursor == null)// (else the existing cursor will be re-used)
          cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
        if (cursor.advance(term)) {
          TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
          cursor = null;//because the new tag now "owns" this instance
          //and add it to the end
          if (head[0] == null) {
            head[0] = newTail;
          } else {
            for (TagLL t = head[0]; true; t = t.nextTag) {
              if (t.nextTag == null) {
                t.addAfterLL(newTail);
                break;
              }
            }
          }
        }
      }//if termId >= 0
    }//end while(incrementToken())

    //-- Finish all tags
    advanceTagsAndProcessClusterIfDone(head, null);
    assert head[0] == null;

    if(!loggedSkippedAltTokenWarning && skippedTokens){
      loggedSkippedAltTokenWarning = true; //only log once
      log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) "
          + "while processing text. This may cause problems with some Analyzer "
          + "configurations (e.g. query time synonym expansion). For details see "
          + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
    }

    tokenStream.end();
    //tokenStream.close(); caller closes because caller acquired it
  }

  private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
    //-- Advance tags
    final int endOffset = term != null ? offsetAtt.endOffset() : -1;
    boolean anyAdvance = false;
    for (TagLL t = head[0]; t != null; t = t.nextTag) {
      anyAdvance |= t.advance(term, endOffset);
    }

    //-- Process cluster if done
    if (!anyAdvance && head[0] != null) {
      tagClusterReducer.reduce(head);
      for (TagLL t = head[0]; t != null; t = t.nextTag) {
        assert t.value != null;
        tagCallback(t.startOffset, t.endOffset, t.value);
      }
      head[0] = null;
    }
  }

  /**
   * Invoked by {@link #process()} for each tag found.  endOffset is always &gt;= the endOffset
   * given in the previous call.
   *
   * @param startOffset The character offset of the original stream where the tag starts.
   * @param endOffset One more than the character offset of the original stream where the tag ends.
   * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}.
   */
  protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);

  /**
   * Returns a sorted array of integer docIds given the corresponding key.
   * @param docIdsKey The lookup key.
   * @return Not null
   */
  protected IntsRef lookupDocIds(Object docIdsKey) {
    return (IntsRef) docIdsKey;
  }
}