Java Code Examples for org.apache.lucene.analysis.tokenattributes.OffsetAttribute#endOffset()

The following examples show how to use org.apache.lucene.analysis.tokenattributes.OffsetAttribute#endOffset(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
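Before the project examples, here is a minimal, self-contained sketch of the canonical consume loop (the EndOffsetDemo class name and the sample text are illustrative, not taken from any project below). endOffset() returns the character position one past the end of the token in the original input, and it only reports the final offset of the whole stream after end() has been called:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class EndOffsetDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("field", "quick brown fox")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                 // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                // "quick" -> [0,5), "brown" -> [6,11), "fox" -> [12,15)
                System.out.println(term + " ends at " + offset.endOffset());
            }
            stream.end();                   // offsets are only final after end()
            System.out.println("final offset: " + offset.endOffset());  // 15
        }
    }
}

Several of the examples below (5, 6, 9, and 11) depend on exactly this detail: reading endOffset() after end() to obtain the stream's final offset.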
Example 1
Source File: PlainHighlighter.java    From Elasticsearch with Apache License 2.0. Finds the last token boundary at or before noMatchSize by tracking each token's endOffset(), so a no-highlight excerpt can be cut cleanly on a term boundary.
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
 
Example 2
Source File: ConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0. When a source stream is exhausted, its final endOffset() is folded into a running offsetIncrement so that tokens from the next source report offsets relative to the whole concatenated input.
@Override
public boolean incrementToken() throws IOException {
  boolean newSource = false;
  while (sources[currentSource].incrementToken() == false) {
    if (currentSource >= sources.length - 1)
      return false;
    sources[currentSource].end();
    initialPositionIncrement = sourceIncrements[currentSource].getPositionIncrement();
    OffsetAttribute att = sourceOffsets[currentSource];
    if (att != null)
      offsetIncrement += att.endOffset();
    currentSource++;
    newSource = true;
  }

  clearAttributes();
  sources[currentSource].copyTo(this);
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement, offsetAtt.endOffset() + offsetIncrement);
  if (newSource) {
    int posInc = posIncAtt.getPositionIncrement();
    posIncAtt.setPositionIncrement(posInc + initialPositionIncrement);
  }

  return true;
}
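For context on the arithmetic above, a minimal usage sketch, assuming Lucene's varargs ConcatenatingTokenStream constructor (the ConcatOffsetsDemo class and sample strings are illustrative): each exhausted source contributes its final endOffset() to offsetIncrement, which then shifts the offsets of every later token.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ConcatenatingTokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ConcatOffsetsDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer first = new WhitespaceTokenizer();
        first.setReader(new StringReader("foo bar"));   // final offset: 7
        WhitespaceTokenizer second = new WhitespaceTokenizer();
        second.setReader(new StringReader("baz"));      // own offsets: [0,3)

        try (TokenStream ts = new ConcatenatingTokenStream(first, second)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // prints: foo [0,3)  bar [4,7)  baz [7,10)
                // "baz" is shifted by the first source's final end offset (7)
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            ts.end();
        }
    }
}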
 
Example 3
Source File: LuceneUtil.java    From jasperreports with GNU Lesser General Public License v3.0. Prints every term produced by the analyzer together with its (startOffset, endOffset) pair.
protected String displayTokens(String text, String elementId) throws IOException {
	Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);
	StringBuilder sb = new StringBuilder();
	sb.append(elementId).append(": ").append(text).append(": ");

	TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
	CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

	tokenStream.reset();
	while (tokenStream.incrementToken()) {
		int startOffset = offsetAttribute.startOffset();
		int endOffset = offsetAttribute.endOffset();
		String term = charTermAttribute.toString();
		sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
	}

	return sb.toString();
}
 
Example 4
Source File: AutoPhrasingTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0. Back-computes the start offset of an emitted phrase token as endOffset() minus the token length, guarding against a negative start.
private void emit(char[] tokenChars) {
    char[] token = tokenChars;
    if (replaceWhitespaceWith != null) {
        token = replaceWhiteSpace(token);
    }
    CharTermAttribute termAttr = getTermAttribute();
    if (termAttr != null) {
        termAttr.setEmpty();
        termAttr.append(new StringBuilder().append(token));
    }
    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
        int start = offAttr.endOffset() - token.length;
        offAttr.setOffset(start, offAttr.endOffset());
    }
    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
        pia.setPositionIncrement(++positionIncr);
    }
    lastEmitted = token;
}
 
Example 5
Source File: TransportAnalyzeAction.java    From Elasticsearch with Apache License 2.0. Analyzes each text in the request and, after end(), adds the stream's final endOffset() to lastOffset so offsets across multiple texts stay absolute.
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
    int lastPosition = -1;
    int lastOffset = 0;
    for (String text : request.text()) {
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            stream.reset();
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            while (stream.incrementToken()) {
                int increment = posIncr.getPositionIncrement();
                if (increment > 0) {
                    lastPosition = lastPosition + increment;
                }
                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
                    lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(), null));
            }
            stream.end();
            lastOffset += offset.endOffset();
            lastPosition += posIncr.getPositionIncrement();

            lastPosition += analyzer.getPositionIncrementGap(field);
            lastOffset += analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return tokens;
}
 
Example 6
Source File: TransportAnalyzeAction.java    From Elasticsearch with Apache License 2.0. The same accumulation pattern as Example 5, but additionally extracting extended token attributes for the response.
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);

    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
 
Example 7
Source File: XmlInterpolationTest.java    From lucene-solr with Apache License 2.0. Uses startOffset() and endOffset() of the matching terms to locate a start and end tag in HTML-stripped text.
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] result = {-1, -1};

  Reader filter = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      final String termString = termAttribute.toString();
      if (termString.equals(start))
        result[0] = offsetAttribute.startOffset();
      if (termString.equals(end)) {
        result[1] = offsetAttribute.endOffset();
        return result;
      }
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result;
}
 
Example 8
Source File: DecompoundTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0. Chooses the decompounded token's offsets: when the parent token's offsets exactly span the term, the end offset is recomputed as startOffset() plus the term length; otherwise the original startOffset()/endOffset() pair is kept.
DecompoundToken(CharSequence value, CharTermAttribute termAttribute, OffsetAttribute offsetAttribute) {
    this.value = value;
    if (offsetAttribute.endOffset() - offsetAttribute.startOffset() != termAttribute.length()) {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.endOffset();
    } else {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.startOffset() + termAttribute.length();
    }
}
 
Example 9
Source File: TransportExtendedAnalyzeAction.java    From elasticsearch-extended-analyze with Apache License 2.0. A close variant of Example 6 from the extended-analyze plugin, with optional short attribute names.
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes, boolean shortAttrName) {
    try {
        stream.reset();
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);

        while (stream.incrementToken()) {
            int increment = posIncr.getPositionIncrement();
            if (increment > 0) {
                lastPosition = lastPosition + increment;
            }
            tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
                lastOffset + offset.endOffset(), type.type(), extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
        }
        stream.end();
        lastOffset += offset.endOffset();
        lastPosition += posIncr.getPositionIncrement();

        lastPosition += analyzer.getPositionIncrementGap(field);
        lastOffset += analyzer.getOffsetGap(field);

    } catch (IOException e) {
        throw new ElasticsearchException("failed to analyze", e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
 
Example 10
Source File: XmlInterpolationTest.java    From SolrTextTagger with Apache License 2.0. Identical logic to Example 7, here from the standalone SolrTextTagger project.
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] result = {-1, -1};

  Reader filter = new HTMLStripCharFilter(new StringReader(docText));

  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      final String termString = termAttribute.toString();
      if (termString.equals(start))
        result[0] = offsetAttribute.startOffset();
      if (termString.equals(end)) {
        result[1] = offsetAttribute.endOffset();
        return result;
      }
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result;
}
 
Example 11
Source File: MemoryIndex.java    From lucene-solr with Apache License 2.0. When offsets are stored, each token's startOffset() and endOffset() are written into the in-memory postings shifted by the accumulated offset gap, and the final endOffset() is recorded as info.lastOffset.
private void storeTerms(Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {

    int pos = -1;
    int offset = 0;
    if (info.numTokens > 0) {
      pos = info.lastPosition + positionIncrementGap;
      offset = info.lastOffset + offsetGap;
    }

    try (TokenStream stream = tokenStream) {
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
      stream.reset();

      while (stream.incrementToken()) {
//        if (DEBUG) System.err.println("token='" + term + "'");
        info.numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0) {
          info.numOverlapTokens++;
        }
        pos += posIncr;
        int ord = info.terms.add(termAtt.getBytesRef());
        if (ord < 0) {
          ord = (-ord) - 1;
          postingsWriter.reset(info.sliceArray.end[ord]);
        } else {
          info.sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        info.sliceArray.freq[ord]++;
        info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
        info.sumTotalTermFreq++;
        postingsWriter.writeInt(pos);
        if (storeOffsets) {
          postingsWriter.writeInt(offsetAtt.startOffset() + offset);
          postingsWriter.writeInt(offsetAtt.endOffset() + offset);
        }
        if (storePayloads) {
          final BytesRef payload = payloadAtt.getPayload();
          final int pIndex;
          if (payload == null || payload.length == 0) {
            pIndex = -1;
          } else {
            pIndex = payloadsBytesRefs.append(payload);
          }
          postingsWriter.writeInt(pIndex);
        }
        info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();
      if (info.numTokens > 0) {
        info.lastPosition = pos;
        info.lastOffset = offsetAtt.endOffset() + offset;
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
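storeTerms() above is private; it runs when a field is added to the index. A brief usage sketch showing how it is reached (the field name and query text are illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class MemoryIndexDemo {
    public static void main(String[] args) {
        MemoryIndex index = new MemoryIndex(true);  // storeOffsets = true
        // addField() tokenizes the text and invokes storeTerms(), which records
        // each token's startOffset()/endOffset() in the in-memory postings.
        index.addField("body", "the quick brown fox", new StandardAnalyzer());
        float score = index.search(new TermQuery(new Term("body", "fox")));
        System.out.println("score = " + score);  // non-zero: "fox" was found
    }
}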