Java Code Examples for org.apache.lucene.util.AttributeSource

The following examples show how to use org.apache.lucene.util.AttributeSource. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: lucene-solr   Source File: AnalysisImpl.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Consumes every token from {@code tokenStream}, cloning each token's full
 * attribute state and also appending a simplified {@code Token} view to
 * {@code result}.
 *
 * @param tokenStream stream to consume; always closed before returning
 * @param result receives one Token (term text + copied attributes) per token
 * @return clones of the stream's attribute state, one per token
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
  final List<AttributeSource> collected = new ArrayList<>();
  try {
    tokenStream.reset();
    CharTermAttribute termAttr = tokenStream.getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      // snapshot the complete attribute state for this token
      collected.add(tokenStream.cloneAttributes());
      List<TokenAttribute> copied = copyAttributes(tokenStream, termAttr);
      result.add(new Token(termAttr.toString(), copied));
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
  } finally {
    // close even when reset/incrementToken failed, without masking the cause
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return collected;
}
 
Example 2
Source Project: lucene-solr   Source File: TestTermAutomatonQuery.java    License: Apache License 2.0 6 votes vote down vote up
@Override
public boolean incrementToken() throws IOException {
  if (synNext) {
    // Emit a synonym: duplicate the previous token's state at the same
    // position (posInc=0) and append a random letter to its term.
    AttributeSource.State state = captureState();
    clearAttributes();
    restoreState(state);
    posIncAtt.setPositionIncrement(0);
    // BUGFIX: parenthesize before casting. Previously the cast applied only
    // to 97, so 'a' + nextInt(3) was evaluated as int arithmetic and the
    // appended text was "97".."99" instead of a random letter 'a'..'c'.
    termAtt.append("" + ((char) (97 + random().nextInt(3))));
    synNext = false;
    return true;
  }

  if (input.incrementToken()) {
    // Occasionally schedule a synonym to be emitted on the next call.
    if (random().nextInt(10) == 8) {
      synNext = true;
    }
    return true;
  } else {
    return false;
  }
}
 
Example 3
Source Project: lucene-solr   Source File: ConcatenatingTokenStream.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a single AttributeSource holding the union of all attributes from
 * {@code sources}, seeded with a clone of the first stream's attributes.
 *
 * @throws IllegalArgumentException if the streams' attribute types are incompatible
 */
private static AttributeSource combineSources(TokenStream... sources) {
  AttributeSource merged = sources[0].cloneAttributes();
  try {
    for (int idx = 1; idx < sources.length; idx++) {
      // union in every attribute class from this stream
      sources[idx].getAttributeClassesIterator().forEachRemaining(merged::addAttribute);
      // verify the attributes can actually be captured/copied
      sources[idx].copyTo(merged);
    }
    return merged;
  }
  catch (IllegalArgumentException e) {
    throw new IllegalArgumentException("Attempted to concatenate TokenStreams with different attribute types", e);
  }
}
 
Example 4
Source Project: lucene-solr   Source File: FuzzyTermsEnum.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates the enum over {@code terms}, wiring the shared attribute source.
 *
 * Registers the boost/automaton attributes on {@code atts} so callers (e.g.
 * the rewrite method) can communicate competitive-boost updates back to this
 * enum, then builds the Levenshtein automata lazily via {@code automatonBuilder}.
 */
private FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, Supplier<FuzzyAutomatonBuilder> automatonBuilder) throws IOException {

    this.terms = terms;
    this.atts = atts;
    this.term = term;

    // addAttribute returns an existing instance if one is already registered
    this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    this.boostAtt = atts.addAttribute(BoostAttribute.class);

    // register our concrete impl BEFORE asking for the interface, so the
    // returned AutomatonAttribute is backed by AutomatonAttributeImpl
    atts.addAttributeImpl(new AutomatonAttributeImpl());
    AutomatonAttribute aa = atts.addAttribute(AutomatonAttribute.class);
    aa.init(automatonBuilder);

    this.automata = aa.getAutomata();
    this.termLength = aa.getTermLength();
    // automata[k] accepts terms within k edits, so the last index is maxEdits
    this.maxEdits = this.automata.length - 1;

    bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
    bottomTerm = maxBoostAtt.getCompetitiveTerm();
    // seed the enum with the initial (empty) competitive bottom
    bottomChanged(null);
  }
 
Example 5
Source Project: lucene-solr   Source File: AnalysisRequestHandlerBase.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Consumes every token from {@code tokenStream}, tracking the absolute token
 * position and returning a clone of the attribute state for each token.
 *
 * @param tokenStream stream to consume; always closed before returning
 * @return one cloned AttributeSource per token produced
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> collected = new ArrayList<>();
  final PositionIncrementAttribute posIncAttr = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackingAttr = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, make sure the "common" attributes exist
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      // accumulate increments to get the absolute position of this token
      position += posIncAttr.getPositionIncrement();
      trackingAttr.setActPosition(position);
      collected.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO: should end-state attributes be captured too?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    // close even on failure, without masking the original exception
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return collected;
}
 
Example 6
/**
 * Consumes all remaining tokens from {@code input}, returning each token's
 * text and cloning each token's attribute state into {@code tokenAttrs}.
 *
 * @return the term text of every token, in stream order
 * @throws IOException if the underlying stream fails
 */
protected String[] walkTokens() throws IOException {
    List<String> wordList = new ArrayList<>();
    // Attribute instances are per-stream singletons, so look them up once
    // instead of on every iteration.
    // NOTE(review): assumes CharTermAttribute is present on the stream —
    // getAttribute throws IllegalArgumentException otherwise; confirm callers.
    CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
    while (input.incrementToken()) {
        // BUGFIX: use the term's own length rather than the offset span.
        // Offsets refer to the original input and can differ from the term
        // length after filtering (stemming, synonyms), which could read past
        // the valid portion of the buffer or truncate the term.
        String word = new String(textAtt.buffer(), 0, textAtt.length());
        wordList.add(word);

        tokenAttrs.add(input.cloneAttributes());
    }
    return wordList.toArray(new String[0]);
}
 
Example 7
Source Project: Elasticsearch   Source File: NumericTokenizer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds an AttributeFactory whose attribute instances are resolved from (and
 * registered on) the given delegate {@code source}, so this tokenizer shares
 * attributes with the delegate token stream.
 */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
    return new AttributeFactory() {
        @Override
        public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
            // addAttribute returns the existing instance when already present
            AttributeImpl instance = (AttributeImpl) source.addAttribute(attClass);
            return instance;
        }
    };
}
 
Example 8
Source Project: lucene-solr   Source File: AnalysisImpl.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new ListBasedTokenStream which replays the given pre-analyzed
 * tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls;
 *        all of its attributes are mirrored onto this stream
 * @param tokens source of tokens to be replayed
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  super(attributeSource.getAttributeFactory());
  this.tokens = tokens;
  // Make sure all the attributes of the source are here too
  addAttributes(attributeSource);
}
 
Example 9
Source Project: lucene-solr   Source File: AnalysisImpl.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public boolean incrementToken() {
  // Replay the next pre-analyzed token, if any remain.
  if (!tokenIterator.hasNext()) {
    return false;
  }
  clearAttributes();
  AttributeSource source = tokenIterator.next();
  // register any attribute classes this token carries that we lack yet
  addAttributes(source);
  source.copyTo(this);
  return true;
}
 
Example 10
Source Project: lucene-solr   Source File: TermsQuery.java    License: Apache License 2.0 5 votes vote down vote up
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  // With no query terms nothing can match, so short-circuit to the empty enum.
  return this.terms.isEmpty()
      ? TermsEnum.EMPTY
      : new SeekingTermSetTermsEnum(terms.iterator(), this.terms, ords);
}
 
Example 11
Source Project: lucene-solr   Source File: TeeSinkTokenFilter.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() {
  // Replay the next cached attribute state, if any remain.
  if (it.hasNext()) {
    restoreState(it.next());
    return true;
  }
  return false;
}
 
Example 12
Source Project: lucene-solr   Source File: WikipediaTokenizer.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() throws IOException {
  // First drain any tokens buffered by a previous collapseAndSaveTokens call.
  if (tokens != null && tokens.hasNext()){
    AttributeSource.State state = tokens.next();
    restoreState(state);
    return true;
  }
  clearAttributes();
  int tokenType = scanner.getNextToken();

  if (tokenType == WikipediaTokenizerImpl.YYEOF) {
    return false;
  }
  String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
  // Dispatch on the configured output mode vs. whether this type is one the
  // caller asked to keep untokenized.
  if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
    setupToken();
  } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
    // collapse the run of same-typed tokens into one and emit only that
    collapseTokens(tokenType);

  }
  else if (tokenOutput == BOTH){
    //collapse into a single token, add it to tokens AND output the individual tokens
    //output the untokenized Token first
    collapseAndSaveTokens(tokenType, type);
  }
  int posinc = scanner.getPositionIncrement();
  if (first && posinc == 0) {
    posinc = 1; // don't emit posinc=0 for the first token!
  }
  posIncrAtt.setPositionIncrement(posinc);
  typeAtt.setType(type);
  first = false;
  return true;
}
 
Example 13
Source Project: lucene-solr   Source File: WikipediaTokenizer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Collapses a run of same-typed wiki tokens into one untokenized token (left
 * in this stream's attributes) while also capturing each individual token's
 * state into {@code tokens} so they can be replayed afterwards (BOTH mode).
 */
private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
  //collapse
  StringBuilder buffer = new StringBuilder(32);
  int numAdded = scanner.setText(buffer);
  //TODO: how to know how much whitespace to add
  int theStart = scanner.yychar();
  int lastPos = theStart + numAdded;
  int tmpTokType;
  int numSeen = 0;
  List<AttributeSource.State> tmp = new ArrayList<>();
  // capture the first token of the run (posInc 0 relative to the collapsed token)
  setupSavedToken(0, type);
  tmp.add(captureState());
  //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
  while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
    int currPos = scanner.yychar();
    //append whitespace to mirror the gap between the original tokens
    for (int i = 0; i < (currPos - lastPos); i++){
      buffer.append(' ');
    }
    numAdded = scanner.setText(buffer);
    setupSavedToken(scanner.getPositionIncrement(), type);
    tmp.add(captureState());
    numSeen++;
    lastPos = currPos + numAdded;
  }
  //trim the buffer
  // TODO: this is inefficient
  String s = buffer.toString().trim();
  termAtt.setEmpty().append(s);
  offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
  flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
  //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
  if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
    scanner.yypushback(scanner.yylength());
  }
  // saved individual tokens will be replayed by incrementToken()
  tokens = tmp.iterator();
}
 
Example 14
Source Project: lucene-solr   Source File: WordDelimiterFilter.java    License: Apache License 2.0 5 votes vote down vote up
@Override
protected void swap(int i, int j) {
  // Exchange entries i and j in all three parallel arrays so the buffered
  // token states stay aligned with their start offsets and position increments.
  final AttributeSource.State stateTmp = buffered[i];
  buffered[i] = buffered[j];
  buffered[j] = stateTmp;

  final int offTmp = startOff[i];
  startOff[i] = startOff[j];
  startOff[j] = offTmp;

  final int incTmp = posInc[i];
  posInc[i] = posInc[j];
  posInc[j] = incTmp;
}
 
Example 15
Source Project: lucene-solr   Source File: TestRandomChains.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a random argument list matching {@code paramTypes} for a tokenizer
 * constructor. AttributeSource parameters are left null on purpose so the
 * constructor fails with an IllegalArgumentException.
 */
static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) {
  final Object[] args = new Object[paramTypes.length];
  for (int idx = 0; idx < args.length; idx++) {
    if (paramTypes[idx] == AttributeSource.class) {
      // TODO: args[idx] = new AttributeSource();
      // this is currently too scary to deal with!
      args[idx] = null; // force IAE
    } else {
      args[idx] = newRandomArg(random, paramTypes[idx]);
    }
  }
  return args;
}
 
Example 16
Source Project: lucene-solr   Source File: MockSynonymFilter.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() throws IOException {
  // First drain any queued synonym/original tokens from a previous call.
  if (tokenQueue.size() > 0) {
    tokenQueue.remove(0).copyTo(this);
    return true;
  }
  if (endOfInput == false && input.incrementToken()) {
    if (termAtt.toString().equals("dogs")) {
      // single-token synonym: emit "dog" now, queue the original "dogs"
      addSynonymAndRestoreOrigToken("dog", 1, offsetAtt.endOffset());
    } else if (termAtt.toString().equals("guinea")) {
      // possible start of the two-token phrase "guinea pig"
      AttributeSource firstSavedToken = cloneAttributes();
      if (input.incrementToken()) {
        if (termAtt.toString().equals("pig")) {
          // "guinea pig" -> emit "cavy" spanning both tokens, queue "pig"
          AttributeSource secondSavedToken = cloneAttributes();
          int secondEndOffset = offsetAtt.endOffset();
          firstSavedToken.copyTo(this);
          addSynonym("cavy", 2, secondEndOffset);
          tokenQueue.add(secondSavedToken);
        } else if (termAtt.toString().equals("dogs")) {
          // second token is itself a synonym trigger: queue it plus "dog"
          tokenQueue.add(cloneAttributes());
          addSynonym("dog", 1, offsetAtt.endOffset());
        }
      } else {
        endOfInput = true;
      }
      // restore "guinea" as the token emitted by this call
      firstSavedToken.copyTo(this);
    }
    return true;
  } else {
    endOfInput = true;
    return false;
  }
}
 
Example 17
Source Project: lucene-solr   Source File: FuzzyQuery.java    License: Apache License 2.0 5 votes vote down vote up
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  // Zero allowed edits means only the exact term can match.
  return maxEdits == 0
      ? new SingleTermsEnum(terms.iterator(), term.bytes())
      : new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
 
Example 18
Source Project: lucene-solr   Source File: FieldInvertState.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Points this state at a new attribute source, re-resolving the cached
 * attribute references. A no-op when the same instance (by identity) is
 * supplied, since the cached references are then still valid.
 */
void setAttributeSource(AttributeSource attributeSource) {
  if (this.attributeSource == attributeSource) {
    return;
  }
  this.attributeSource = attributeSource;
  // getAttribute: must already exist; addAttribute: create if missing
  termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
  termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
  posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
  offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
  payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
}
 
Example 19
Source Project: lucene-solr   Source File: GraphTokenStreamFiniteStrings.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the tokens attached to every transition leaving the given
 * automaton state.
 */
public List<AttributeSource> getTerms(int state) {
  List<AttributeSource> result = new ArrayList<>();
  int transitionCount = det.initTransition(state, transition);
  for (int t = 0; t < transitionCount; t++) {
    det.getNextTransition(transition);
    // each transition labels a contiguous range [min, max] of token indices
    result.addAll(Arrays.asList(this.tokens).subList(transition.min, transition.max + 1));
  }
  return result;
}
 
Example 20
Source Project: lucene-solr   Source File: LegacyNumericRangeQuery.java    License: Apache License 2.0 5 votes vote down vote up
@Override @SuppressWarnings("unchecked")
protected TermsEnum getTermsEnum(final Terms terms, AttributeSource atts) throws IOException {
  // An inverted range (min > max) can never match anything.
  // Oddly, java.lang.Number itself is not Comparable, but every subclass
  // used here is, hence the unchecked cast.
  boolean emptyRange = min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0;
  return emptyRange ? TermsEnum.EMPTY : new NumericRangeTermsEnum(terms.iterator());
}
 
Example 21
Source Project: lucene-solr   Source File: AnalysisRequestHandlerBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new ListBasedTokenStream which replays the given pre-analyzed
 * tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls;
 *        all of its attributes are mirrored onto this stream
 * @param tokens source of tokens to be replayed
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  super(attributeSource.getAttributeFactory());
  this.tokens = tokens;
  // Make sure all the attributes of the source are here too
  addAttributes(attributeSource);
}
 
Example 22
Source Project: lucene-solr   Source File: AnalysisRequestHandlerBase.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public boolean incrementToken() {
  // Replay the next pre-analyzed token, if any remain.
  if (!tokenIterator.hasNext()) {
    return false;
  }
  clearAttributes();
  AttributeSource source = tokenIterator.next();

  // in case there were delayed attribute additions on this token
  addAttributes(source);

  source.copyTo(this);
  return true;
}
 
Example 23
Source Project: lucene-solr   Source File: AnalysisRequestHandlerBase.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Mirrors every attribute implementation (and its interfaces) from
 * {@code attributeSource} onto this stream.
 */
protected void addAttributes(AttributeSource attributeSource) {
  // note: ideally we wouldn't call addAttributeImpl which is marked internal.
  // But it's possible this method is used by some custom attributes, especially
  // since Solr doesn't provide a way to customize the AttributeFactory, which
  // is the recommended way to choose which classes implement which attributes.
  attributeSource.getAttributeImplsIterator().forEachRemaining(this::addAttributeImpl);
}
 
Example 24
Source Project: lucene-solr   Source File: PreAnalyzedField.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() {
  // Replay the next stored token state, if any remain.
  if (it.hasNext()) {
    // restore from a clone so later attribute mutation can't corrupt the
    // stored state
    restoreState(it.next().clone());
    // TODO: why can't the OffsetAttribute be looked up in the ctor instead?
    lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
    return true;
  }
  return false;
}
 
Example 25
/**
 * Rewrites the current attributes into a de-ASCII synonym token: restores the
 * saved state, then overrides term text, type, and position increment (0 so
 * the synonym stacks on the original token's position).
 */
private boolean createToken(String synonym, AttributeSource.State current) {
    restoreState(current);
    posIncrAtt.setPositionIncrement(0);
    typeAtt.setType(DEASCII_TOKEN_TYPE);
    termAttribute.setEmpty().append(synonym);
    return true;
}
 
Example 26
@Override
public boolean incrementToken() throws IOException {
    //clearAttributes();
    if (first) {
        //gather all tokens from doc up front: the tagger needs the full
        //sentence before any token can be emitted
        String[] words = walkTokens();
        if (words.length == 0) {
            return false;
        }
        //tagging
        posTags = createTags(words);
        first = false;
        tokenIdx = 0;
    }

    // all buffered tokens have been replayed
    if (tokenIdx == tokenAttrs.size()) {
        resetParams();
        return false;
    }

    // replay the buffered token, making sure every attribute class it
    // carries is registered on this stream before copying
    AttributeSource as = tokenAttrs.get(tokenIdx);
    Iterator<? extends Class<? extends Attribute>> it = as.getAttributeClassesIterator();
    while (it.hasNext()) {
        Class<? extends Attribute> attrClass = it.next();
        if (!hasAttribute(attrClass)) {
            addAttribute(attrClass);
        }
    }
    as.copyTo(this);
    // merge the POS tag for this token into the (possibly pre-existing)
    // serialized metadata payload
    MWEMetadata metadata = exitingPayload.getPayload() == null ? new MWEMetadata() :
            MWEMetadata.deserialize(exitingPayload.getPayload().utf8ToString());
    metadata.addMetaData(MWEMetadataType.POS, posTags[tokenIdx]);
    exitingPayload.setPayload(new BytesRef(MWEMetadata.serialize(metadata)));
    tokenIdx++;
    return true;
}
 
Example 27
/** Delegates attribute access to the wrapped enum's attribute source. */
@Override
public AttributeSource attributes() {
    return delegate.attributes();
}
 
Example 28
/**
 * Returns the attribute source of the wrapped delegate enum.
 *
 * @return the delegate's AttributeSource
 * @see org.apache.lucene.index.DocsEnum#attributes()
 */
public AttributeSource attributes()
{
    return delegate.attributes();
}
 
Example 29
Source Project: lucene-solr   Source File: AnalysisImpl.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Mirrors every attribute implementation (and its interfaces) from
 * {@code attributeSource} onto this stream.
 */
void addAttributes(AttributeSource attributeSource) {
  attributeSource.getAttributeImplsIterator().forEachRemaining(this::addAttributeImpl);
}
 
Example 30
Source Project: lucene-solr   Source File: STMergingTermsEnum.java    License: Apache License 2.0 4 votes vote down vote up
@Override
public AttributeSource attributes() {
  // This merging view does not expose per-enum attributes.
  throw new UnsupportedOperationException();
}