Java Code Examples for org.apache.lucene.util.AttributeSource#State

The following examples show how to use org.apache.lucene.util.AttributeSource#State. All examples are taken from open-source projects; the source file, project, and license are listed above each example.
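Before the individual examples, here is a minimal sketch of the pattern most of them share: captureState() takes a snapshot of every attribute's current value, and restoreState() replays that snapshot later, typically to emit a copy of a token stacked on the same position. The filter below is hypothetical (it does not appear in any of the projects cited) and is only meant to make the capture/restore flow explicit.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

// Hypothetical filter: emits each input token, followed by an exact copy of it
// at the same position (position increment 0).
public final class DuplicateTokenFilter extends TokenFilter {
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private AttributeSource.State pending; // snapshot of the last token, replayed once

  public DuplicateTokenFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending);             // replay the saved attribute values
      posIncAtt.setPositionIncrement(0); // stack the copy on the same position
      pending = null;
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    pending = captureState();            // snapshot all attributes of this token
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    pending = null;
  }
}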
Example 1
Source File: TestTermAutomatonQuery.java (from lucene-solr, Apache License 2.0)
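A test filter that randomly emits a synonym for the current token: it captures the token's state, restores it at position increment 0, and appends a random letter to the term.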
@Override
public boolean incrementToken() throws IOException {
  if (synNext) {
    // snapshot the current token and replay it as a synonym at the same position
    AttributeSource.State state = captureState();
    clearAttributes();
    restoreState(state);
    posIncAtt.setPositionIncrement(0);
    termAtt.append("" + ((char) (97 + random().nextInt(3)))); // append a random letter: 'a', 'b', or 'c'
    synNext = false;
    return true;
  }

  if (input.incrementToken()) {
    if (random().nextInt(10) == 8) {
      synNext = true;
    }
    return true;
  } else {
    return false;
  }
}
 
Example 2
Source File: TeeSinkTokenFilter.java (from lucene-solr, Apache License 2.0)
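The sink side of TeeSinkTokenFilter: each call restores the next attribute state that the tee filter cached, until the cache is exhausted.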
@Override
public final boolean incrementToken() {
  if (!it.hasNext()) {
    return false;
  }

  AttributeSource.State state = it.next();
  restoreState(state);
  return true;
}
 
Example 3
Source File: WikipediaTokenizer.java (from lucene-solr, Apache License 2.0)
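WikipediaTokenizer first drains any token states saved by an earlier collapse step; only then does it scan the next token and dispatch on the configured output mode.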
@Override
public final boolean incrementToken() throws IOException {
  if (tokens != null && tokens.hasNext()){
    AttributeSource.State state = tokens.next();
    restoreState(state);
    return true;
  }
  clearAttributes();
  int tokenType = scanner.getNextToken();

  if (tokenType == WikipediaTokenizerImpl.YYEOF) {
    return false;
  }
  String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
  if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false) {
    setupToken();
  } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true) {
    collapseTokens(tokenType);
  } else if (tokenOutput == BOTH) {
    // collapse into a single token, add it to tokens AND output the individual tokens
    // output the untokenized Token first
    collapseAndSaveTokens(tokenType, type);
  }
  int posinc = scanner.getPositionIncrement();
  if (first && posinc == 0) {
    posinc = 1; // don't emit posinc=0 for the first token!
  }
  posIncrAtt.setPositionIncrement(posinc);
  typeAtt.setType(type);
  first = false;
  return true;
}
 
Example 4
Source File: WikipediaTokenizer.java (from lucene-solr, Apache License 2.0)
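Helper for the BOTH output mode: it collapses a run of same-typed tokens into a single untokenized token while capturing each individual token's state, so Example 3 can replay them afterwards.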
private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
  //collapse
  StringBuilder buffer = new StringBuilder(32);
  int numAdded = scanner.setText(buffer);
  //TODO: how to know how much whitespace to add
  int theStart = scanner.yychar();
  int lastPos = theStart + numAdded;
  int tmpTokType;
  int numSeen = 0;
  List<AttributeSource.State> tmp = new ArrayList<>();
  setupSavedToken(0, type);
  tmp.add(captureState());
  //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
  while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
    int currPos = scanner.yychar();
    //append whitespace
    for (int i = 0; i < (currPos - lastPos); i++){
      buffer.append(' ');
    }
    numAdded = scanner.setText(buffer);
    setupSavedToken(scanner.getPositionIncrement(), type);
    tmp.add(captureState());
    numSeen++;
    lastPos = currPos + numAdded;
  }
  //trim the buffer
  // TODO: this is inefficient
  String s = buffer.toString().trim();
  termAtt.setEmpty().append(s);
  offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
  flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
  //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
  if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
    scanner.yypushback(scanner.yylength());
  }
  tokens = tmp.iterator();
}
 
Example 5
Source File: WordDelimiterFilter.java (from lucene-solr, Apache License 2.0)
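Swap step of the sorter that reorders buffered tokens: the captured states are swapped together with their parallel start-offset and position-increment arrays.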
@Override
protected void swap(int i, int j) {
  AttributeSource.State tmp = buffered[i];
  buffered[i] = buffered[j];
  buffered[j] = tmp;
  
  int tmp2 = startOff[i];
  startOff[i] = startOff[j];
  startOff[j] = tmp2;
  
  tmp2 = posInc[i];
  posInc[i] = posInc[j];
  posInc[j] = tmp2;
}
 
Example 6
Source File: PreAnalyzedField.java (from lucene-solr, Apache License 2.0)
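Replays pre-analyzed token states; each cached state is cloned before restoring, and the end offset of the restored token is recorded.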
@Override
public final boolean incrementToken() {
  if (!it.hasNext()) {
    return false;
  }
  
  AttributeSource.State state = it.next();
  restoreState(state.clone());
  // TODO: why can't I lookup the OffsetAttribute up in ctor instead?
  lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
  return true;
}
 
Example 7
Source File: Zemberek2DeASCIIfyFilterFactory.java (from lucene-solr-analysis-turkish, Apache License 2.0)
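Emits a de-ASCIIfied synonym: the original token's state is restored, the term is replaced with the synonym, and the position increment is set to 0 so both tokens share a position.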
private boolean createToken(String synonym, AttributeSource.State current) {
    restoreState(current);
    termAttribute.setEmpty().append(synonym);
    typeAtt.setType(DEASCII_TOKEN_TYPE);
    posIncrAtt.setPositionIncrement(0);
    return true;
}
 
Example 8
Source File: LookaheadTokenFilter.java (from lucene-solr, Apache License 2.0)
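Appends a captured state to the filter's queue of buffered input tokens.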
public void add(AttributeSource.State state) {
  inputTokens.add(state);
}
 
Example 9
Source File: LookaheadTokenFilter.java (from lucene-solr, Apache License 2.0)
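Returns the next buffered state in capture order, advancing the read pointer.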
public AttributeSource.State nextState() {
  assert nextRead < inputTokens.size();
  return inputTokens.get(nextRead++);
}
 
Example 10
Source File: TestDocumentWriter.java (from lucene-solr, Apache License 2.0)
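A test whose inline filter captures every token's state and replays it as a synonym token "b" at the same position (with the payload cleared); the assertions then check the frequency, positions, and payloads of the indexed terms.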
public void testTokenReuse() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {
        boolean first = true;
        AttributeSource.State state;

        @Override
        public boolean incrementToken() throws IOException {
          if (state != null) {
            restoreState(state);
            payloadAtt.setPayload(null);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setEmpty().append("b");
            state = null;
            return true;
          }

          boolean hasNext = input.incrementToken();
          if (!hasNext) return false;
          if (Character.isDigit(termAtt.buffer()[0])) {
            posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
          }
          if (first) {
            // set payload on first position only
            payloadAtt.setPayload(new BytesRef(new byte[]{100}));
            first = false;
          }

          // index a "synonym" for every token
          state = captureState();
          return true;

        }

        @Override
        public void reset() throws IOException {
          super.reset();
          first = true;
          state = null;
        }

        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      });
    }
  };

  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));

  Document doc = new Document();
  doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));

  writer.addDocument(doc);
  writer.commit();
  SegmentCommitInfo info = writer.newestSegment();
  writer.close();
  SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));

  PostingsEnum termPositions = MultiTerms.getTermPostingsEnum(reader, "f1", new BytesRef("a"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  int freq = termPositions.freq();
  assertEquals(3, freq);
  assertEquals(0, termPositions.nextPosition());
  assertNotNull(termPositions.getPayload());
  assertEquals(6, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  assertEquals(7, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  reader.close();
}
 
Example 11
Source File: SimplePreAnalyzedParser.java (from lucene-solr, Apache License 2.0)
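Builds a State from a parsed pre-analyzed token: the term is copied in, the remaining attributes (position increment, offsets, type, flags, payload) are populated from the key/value map, and the fully populated AttributeSource is captured.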
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
  a.clearAttributes();
  CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
  char[] tokChars = state.token.toString().toCharArray();
  termAtt.copyBuffer(tokChars, 0, tokChars.length);
  int tokenStart = tokenEnd - state.token.length();
  for (Entry<String, String> e : state.attr.entrySet()) {
    String k = e.getKey();
    if (k.equals("i")) {
      // position increment
      int incr = Integer.parseInt(e.getValue());
      PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
      posIncr.setPositionIncrement(incr);
    } else if (k.equals("s")) {
      tokenStart = Integer.parseInt(e.getValue());
    } else if (k.equals("e")) {
      tokenEnd = Integer.parseInt(e.getValue());
    } else if (k.equals("y")) {
      TypeAttribute type = a.addAttribute(TypeAttribute.class);
      type.setType(e.getValue());
    } else if (k.equals("f")) {
      FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
      int f = Integer.parseInt(e.getValue(), 16);
      flags.setFlags(f);
    } else if (k.equals("p")) {
      PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
      byte[] data = hexToBytes(e.getValue());
      if (data != null && data.length > 0) {
        p.setPayload(new BytesRef(data));
      }
    } else {
      // unknown attribute
    }
  }
  // handle offset attr
  OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
  offset.setOffset(tokenStart, tokenEnd);
  State resState = a.captureState();
  a.clearAttributes();
  return resState;
}