/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Deque; import java.util.List; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.util.AttributeSource; /** * An abstract TokenFilter that exposes its input stream as a graph * * Call {@link #incrementBaseToken()} to move the root of the graph to the next * position in the TokenStream, {@link #incrementGraphToken()} to move along * the current graph, and {@link #incrementGraph()} to reset to the next graph * based at the current root. * * For example, given the stream 'a b/c:2 d e`, then with the base token at * 'a', incrementGraphToken() will produce the stream 'a b d e', and then * after calling incrementGraph() will produce the stream 'a c e'. */ public abstract class GraphTokenFilter extends TokenFilter { private final Deque<Token> tokenPool = new ArrayDeque<>(); private final List<Token> currentGraph = new ArrayList<>(); /** * The maximum permitted number of routes through a graph */ public static final int MAX_GRAPH_STACK_SIZE = 1000; /** * The maximum permitted read-ahead in the token stream */ public static final int MAX_TOKEN_CACHE_SIZE = 100; private Token baseToken; private int graphDepth; private int graphPos; private int trailingPositions = -1; private int finalOffsets = -1; private int stackSize; private int cacheSize; private final PositionIncrementAttribute posIncAtt; private final OffsetAttribute offsetAtt; /** * Create a new GraphTokenFilter */ public GraphTokenFilter(TokenStream input) { super(input); this.posIncAtt = input.addAttribute(PositionIncrementAttribute.class); this.offsetAtt = input.addAttribute(OffsetAttribute.class); } /** * Move the root of the graph to the next token in the wrapped TokenStream * * @return {@code false} if the underlying stream is exhausted */ protected final boolean incrementBaseToken() throws IOException { stackSize = 0; graphDepth = 0; graphPos = 0; Token oldBase = baseToken; baseToken = nextTokenInStream(baseToken); if (baseToken == null) { return false; } currentGraph.clear(); currentGraph.add(baseToken); baseToken.attSource.copyTo(this); recycleToken(oldBase); return true; } /** * Move to the next token in the current route through the graph * * @return {@code false} if there are not more tokens in the current graph */ protected final boolean incrementGraphToken() throws IOException { if (graphPos < graphDepth) { graphPos++; currentGraph.get(graphPos).attSource.copyTo(this); return true; } Token token = nextTokenInGraph(currentGraph.get(graphDepth)); if (token == null) { return false; } graphDepth++; graphPos++; currentGraph.add(graphDepth, token); token.attSource.copyTo(this); return true; } /** * Reset to the root token again, and move down the next route through the graph * * @return false if there are no more routes through the graph */ protected final boolean incrementGraph() throws IOException { if (baseToken == null) { return false; } graphPos = 0; for (int i = graphDepth; i >= 1; i--) { if (lastInStack(currentGraph.get(i)) == false) { currentGraph.set(i, nextTokenInStream(currentGraph.get(i))); for (int j = i + 1; j < graphDepth; j++) { currentGraph.set(j, nextTokenInGraph(currentGraph.get(j))); } if (stackSize++ > MAX_GRAPH_STACK_SIZE) { throw new IllegalStateException("Too many graph paths (> " + MAX_GRAPH_STACK_SIZE + ")"); } currentGraph.get(0).attSource.copyTo(this); graphDepth = i; return true; } } return false; } /** * Return the number of trailing positions at the end of the graph * * NB this should only be called after {@link #incrementGraphToken()} has returned {@code false} */ public int getTrailingPositions() { return trailingPositions; } @Override public void end() throws IOException { if (trailingPositions == -1) { input.end(); trailingPositions = posIncAtt.getPositionIncrement(); finalOffsets = offsetAtt.endOffset(); } else { endAttributes(); this.posIncAtt.setPositionIncrement(trailingPositions); this.offsetAtt.setOffset(finalOffsets, finalOffsets); } } @Override public void reset() throws IOException { input.reset(); // new attributes can be added between reset() calls, so we can't reuse // token objects from a previous run tokenPool.clear(); cacheSize = 0; graphDepth = 0; trailingPositions = -1; finalOffsets = -1; baseToken = null; } int cachedTokenCount() { return cacheSize; } private Token newToken() { if (tokenPool.size() == 0) { cacheSize++; if (cacheSize > MAX_TOKEN_CACHE_SIZE) { throw new IllegalStateException("Too many cached tokens (> " + MAX_TOKEN_CACHE_SIZE + ")"); } return new Token(this.cloneAttributes()); } Token token = tokenPool.removeFirst(); token.reset(input); return token; } private void recycleToken(Token token) { if (token == null) return; token.nextToken = null; tokenPool.add(token); } private Token nextTokenInGraph(Token token) throws IOException { int remaining = token.length(); do { token = nextTokenInStream(token); if (token == null) { return null; } remaining -= token.posInc(); } while (remaining > 0); return token; } // check if the next token in the tokenstream is at the same position as this one private boolean lastInStack(Token token) throws IOException { Token next = nextTokenInStream(token); return next == null || next.posInc() != 0; } private Token nextTokenInStream(Token token) throws IOException { if (token != null && token.nextToken != null) { return token.nextToken; } if (this.trailingPositions != -1) { // already hit the end return null; } if (input.incrementToken() == false) { input.end(); trailingPositions = posIncAtt.getPositionIncrement(); finalOffsets = offsetAtt.endOffset(); return null; } if (token == null) { return newToken(); } token.nextToken = newToken(); return token.nextToken; } private static class Token { final AttributeSource attSource; final PositionIncrementAttribute posIncAtt; final PositionLengthAttribute lengthAtt; Token nextToken; Token(AttributeSource attSource) { this.attSource = attSource; this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class); boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class); this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null; } int posInc() { return this.posIncAtt.getPositionIncrement(); } int length() { if (this.lengthAtt == null) { return 1; } return this.lengthAtt.getPositionLength(); } void reset(AttributeSource attSource) { attSource.copyTo(this.attSource); this.nextToken = null; } @Override public String toString() { return attSource.toString(); } } }