org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute Java Examples

The following examples show how to use org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TokenStreamToDot.java — from lucene-solr (Apache License 2.0)
/** If inputText is non-null, and the TokenStream has
 *  offsets, we include the surface form in each arc's
 *  label. */
/**
 * Creates a converter over the given {@link TokenStream}.
 *
 * <p>If {@code inputText} is non-null and the stream exposes offsets, the
 * surface form is included in each arc's label.
 *
 * @param inputText original text the stream was produced from; may be null
 * @param in        token stream to render
 * @param out       destination for the dot output
 */
public TokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
  this.inputText = inputText;
  this.in = in;
  this.out = out;
  termAtt = in.addAttribute(CharTermAttribute.class);
  posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  // Offsets are optional: only bind the attribute when the stream already has it,
  // so we don't force an OffsetAttribute onto streams that never set offsets.
  offsetAtt = in.hasAttribute(OffsetAttribute.class) ? in.addAttribute(OffsetAttribute.class) : null;
}
 
Example #2
Source File: MeCabKoTokenizer.java — from mecab-ko-lucene-analyzer (Apache License 2.0)
/**
 * Binds all token attributes this tokenizer populates.
 *
 * <p>Registration order is preserved as written; AttributeSource keeps
 * attributes in insertion order, so reordering these calls could change
 * attribute reflection order — NOTE(review): assumed intentional, confirm
 * before reordering.
 */
private void setAttributes() {
  charTermAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  posLenAtt = addAttribute(PositionLengthAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  // Korean-morphology-specific attributes (part of speech and semantic class).
  posAtt = addAttribute(PartOfSpeechAttribute.class);
  semanticClassAtt = addAttribute(SemanticClassAttribute.class);
}
 
Example #3
Source File: NGramTokenizerTest.java — from lucene-solr (Apache License 2.0)
/**
 * Checks NGramTokenizer output against a straightforward reference enumeration
 * of the expected grams over {@code s}.
 *
 * @param minGram       smallest gram length to expect
 * @param maxGram       largest gram length to expect
 * @param s             input text
 * @param nonTokenChars characters treated as non-token separators
 * @param edgesOnly     when true, only grams starting at a token edge are expected
 */
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
  // offsets[i] = char offset of the i-th code point in s (surrogate-aware).
  final int[] codePoints = toCodePoints(s);
  final int[] offsets = new int[codePoints.length + 1];
  for (int i = 0; i < codePoints.length; ++i) {
    offsets[i + 1] = offsets[i] + Character.charCount(codePoints[i]);
  }
  final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {
    @Override
    protected boolean isTokenChar(int chr) {
      return nonTokenChars.indexOf(chr) < 0;
    }
  };
  grams.setReader(new StringReader(s));
  final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
  final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
  final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
  grams.reset();
  for (int start = 0; start < codePoints.length; ++start) {
    // In edgesOnly mode a gram may only begin right after a non-token char
    // (or at the very beginning). This check is independent of the gram
    // length, so it is done once per start position.
    if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
      continue;
    }
    for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
      // A gram is only emitted if every code point in it is a token char.
      boolean allTokenChars = true;
      for (int j = start; j < end && allTokenChars; ++j) {
        allTokenChars = isTokenChar(nonTokenChars, codePoints[j]);
      }
      if (!allTokenChars) {
        continue;
      }
      assertTrue(grams.incrementToken());
      assertArrayEquals(ArrayUtil.copyOfSubArray(codePoints, start, end), toCodePoints(termAtt));
      // Every gram advances by exactly one position and spans one position.
      assertEquals(1, posIncAtt.getPositionIncrement());
      assertEquals(1, posLenAtt.getPositionLength());
      assertEquals(offsets[start], offsetAtt.startOffset());
      assertEquals(offsets[end], offsetAtt.endOffset());
    }
  }
  // Stream must be exhausted, and end() must set final offsets to s.length().
  assertFalse(grams.incrementToken());
  grams.end();
  assertEquals(s.length(), offsetAtt.startOffset());
  assertEquals(s.length(), offsetAtt.endOffset());
}
 
Example #4
Source File: GraphTokenFilter.java — from lucene-solr (Apache License 2.0)
Token(AttributeSource attSource) {
  this.attSource = attSource;
  this.posIncAtt = attSource.addAttribute(PositionIncrementAttribute.class);
  boolean hasLengthAtt = attSource.hasAttribute(PositionLengthAttribute.class);
  this.lengthAtt = hasLengthAtt ? attSource.addAttribute(PositionLengthAttribute.class) : null;
}
 
Example #5
Source File: GraphTokenStreamFiniteStrings.java — from lucene-solr (Apache License 2.0)
/**
 * Build an automaton from the provided {@link TokenStream}.
 *
 * <p>Each token becomes a transition labeled with its index into
 * {@code tokens}; states correspond to token-graph positions. Tokens with
 * increment 0 are stacked on the previous position; increments greater than 1
 * create a "gap" of skipped positions that is bridged by widening the
 * transition's end state.
 *
 * @param in token stream to consume; reset/end are handled here
 * @return the finished automaton, with the last-created state accepting
 * @throws IOException if the stream fails
 * @throws IllegalStateException if the first token has increment &lt; 1
 */
private Automaton build(final TokenStream in) throws IOException {
  Automaton.Builder builder = new Automaton.Builder();

  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);

  in.reset();

  // pos: current graph position (-1 before the first token).
  // prevIncr: last non-zero increment seen, reapplied to stacked tokens below.
  // state: highest automaton state created so far.
  // id: index of the current token in the tokens array.
  // gap: positions skipped by the previous token's increment (incr - 1).
  int pos = -1;
  int prevIncr = 1;
  int state = -1;
  int id = -1;
  int gap = 0;
  while (in.incrementToken()) {
    int currentIncr = posIncAtt.getPositionIncrement();
    if (pos == -1 && currentIncr < 1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }

    if (currentIncr == 0) {
      if (gap > 0) {
        // Stacked token: undo the gap advance applied after the previous
        // token so this token shares the previous token's start position.
        pos -= gap;
      }
    }
    else {
      pos++;
      gap = currentIncr - 1;
    }

    // End position covers the token's length plus any gap it has to bridge;
    // make sure states exist up to and including endPos.
    int endPos = pos + posLengthAtt.getPositionLength() + gap;
    while (state < endPos) {
      state = builder.createState();
    }

    id++;
    if (tokens.length < id + 1) {
      tokens = ArrayUtil.grow(tokens, id + 1);
    }

    // Snapshot the token's attributes; the transition label is its index.
    tokens[id] = in.cloneAttributes();
    builder.addTransition(pos, endPos, id);
    pos += gap;

    // we always produce linear token graphs from getFiniteStrings(), so we need to adjust
    // posLength and posIncrement accordingly
    tokens[id].addAttribute(PositionLengthAttribute.class).setPositionLength(1);
    if (currentIncr == 0) {
      // stacked token should have the same increment as original token at this position
      tokens[id].addAttribute(PositionIncrementAttribute.class).setPositionIncrement(prevIncr);
    }

    // only save last increment on non-zero increment in case we have multiple stacked tokens
    if (currentIncr > 0) {
      prevIncr = currentIncr;
    }
  }

  in.end();
  if (state != -1) {
    // The last state created (the furthest end position) is the accept state.
    builder.setAccept(state, true);
  }
  return builder.finish();
}