/* * This software was produced for the U. S. Government * under Contract No. W15P7T-11-C-F600, and is * subject to the Rights in Noncommercial Computer Software * and Noncommercial Computer Software Documentation * Clause 252.227-7014 (JUN 1995) * * Copyright 2013 The MITRE Corporation. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.handler.tagger; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.HashMap; import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.index.Terms; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Tags maximum string of words in a corpus. This is a callback-style API * in which you implement {@link #tagCallback(int, int, Object)}. * * This class should be independently usable outside Solr. */ public abstract class Tagger { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private final TokenStream tokenStream; private final TermToBytesRefAttribute byteRefAtt; private final PositionIncrementAttribute posIncAtt; private final OffsetAttribute offsetAtt; private final TaggingAttribute taggingAtt; private final TagClusterReducer tagClusterReducer; private final Terms terms; private final Bits liveDocs; private final boolean skipAltTokens; private final boolean ignoreStopWords; private Map<BytesRef, IntsRef> docIdsCache; /** Whether the WARNING about skipped tokens was already logged. */ private boolean loggedSkippedAltTokenWarning = false; public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream, TagClusterReducer tagClusterReducer, boolean skipAltTokens, boolean ignoreStopWords) throws IOException { this.terms = terms; this.liveDocs = liveDocs; this.tokenStream = tokenStream; this.skipAltTokens = skipAltTokens; this.ignoreStopWords = ignoreStopWords; byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class); posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); taggingAtt = tokenStream.addAttribute(TaggingAttribute.class); tokenStream.reset(); this.tagClusterReducer = tagClusterReducer; } public void enableDocIdsCache(int initSize) { if (initSize > 0) docIdsCache = new HashMap<>(initSize); } public void process() throws IOException { if (terms == null) return; //a shared pointer to the head used by this method and each Tag instance. final TagLL[] head = new TagLL[1]; TermPrefixCursor cursor = null;//re-used //boolean switch used to log warnings in case tokens where skipped during tagging. boolean skippedTokens = false; while (tokenStream.incrementToken()) { if (log.isTraceEnabled()) { log.trace("Token: {}, posInc: {}, offset: [{},{}]", byteRefAtt, posIncAtt.getPositionIncrement(), offsetAtt.startOffset(), offsetAtt.endOffset()); } //check for posInc < 1 (alternate Tokens, such as expanded Synonyms) if (posIncAtt.getPositionIncrement() < 1) { //(a) Deal with this as a configuration issue and throw an exception if (!skipAltTokens) { //TODO throw UnsupportedTokenException when PhraseBuilder is ported throw new IllegalStateException("Query Analyzer generates alternate " + "Tokens (posInc == 0). Please adapt your Analyzer configuration or " + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such " + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' might result in wrong tagging results if the index time analyzer " + "is not configured accordingly. For detailed information see " + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225"); } else { //(b) In case the index time analyser had indexed all variants (users // need to ensure that) processing of alternate tokens can be skipped // as anyways all alternatives will be contained in the FST. skippedTokens = true; log.trace(" ... ignored token"); continue; } } //-- If PositionIncrement > 1 (stopwords) if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) { log.trace(" - posInc > 1 ... mark cluster as done"); advanceTagsAndProcessClusterIfDone(head, null); } final BytesRef term; //NOTE: we need to lookup tokens if // * the LookupAtt is true OR // * there are still advancing tags (to find the longest possible match) if(taggingAtt.isTaggable() || head[0] != null){ //-- Lookup the term id from the next token term = byteRefAtt.getBytesRef(); if (term.length == 0) { throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token"); } } else { //no current cluster AND lookup == false ... term = null; //skip this token } //-- Process tag advanceTagsAndProcessClusterIfDone(head, term); //-- only create new Tags for Tokens we need to lookup if (taggingAtt.isTaggable() && term != null) { //determine if the terms index has a term starting with the provided term // TODO create a pool of these cursors to reuse them more? could be trivial impl if (cursor == null)// (else the existing cursor will be re-used) cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache); if (cursor.advance(term)) { TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null); cursor = null;//because the new tag now "owns" this instance //and add it to the end if (head[0] == null) { head[0] = newTail; } else { for (TagLL t = head[0]; true; t = t.nextTag) { if (t.nextTag == null) { t.addAfterLL(newTail); break; } } } } }//if termId >= 0 }//end while(incrementToken()) //-- Finish all tags advanceTagsAndProcessClusterIfDone(head, null); assert head[0] == null; if(!loggedSkippedAltTokenWarning && skippedTokens){ loggedSkippedAltTokenWarning = true; //only log once log.warn("{}{}{}{}" , "The Tagger skipped some alternate tokens (tokens with posInc == 0) " , "while processing text. This may cause problems with some Analyzer " , "configurations (e.g. query time synonym expansion). For details see " , "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225"); } tokenStream.end(); //tokenStream.close(); caller closes because caller acquired it } private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException { //-- Advance tags final int endOffset = term != null ? offsetAtt.endOffset() : -1; boolean anyAdvance = false; for (TagLL t = head[0]; t != null; t = t.nextTag) { anyAdvance |= t.advance(term, endOffset); } //-- Process cluster if done if (!anyAdvance && head[0] != null) { tagClusterReducer.reduce(head); for (TagLL t = head[0]; t != null; t = t.nextTag) { assert t.value != null; tagCallback(t.startOffset, t.endOffset, t.value); } head[0] = null; } } /** * Invoked by {@link #process()} for each tag found. endOffset is always >= the endOffset * given in the previous call. * * @param startOffset The character offset of the original stream where the tag starts. * @param endOffset One more than the character offset of the original stream where the tag ends. * @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}. */ protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey); /** * Returns a sorted array of integer docIds given the corresponding key. * @param docIdsKey The lookup key. * @return Not null */ protected IntsRef lookupDocIds(Object docIdsKey) { return (IntsRef) docIdsKey; } }