org.apache.lucene.util.AttributeSource Java Examples

The following examples show how to use org.apache.lucene.util.AttributeSource. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: OpenNLPPOSTaggerFilter.java    From jate with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Drains the upstream token stream, collecting each token's text and a full
 * clone of its attribute state (stored in {@code tokenAttrs} for later replay).
 *
 * @return the token texts in emission order
 * @throws IOException if the underlying stream fails
 */
protected String[] walkTokens() throws IOException {
    List<String> wordList = new ArrayList<>();
    while (input.incrementToken()) {
        CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
        // BUG FIX: the term's length must come from the CharTermAttribute itself,
        // not from endOffset() - startOffset(); offsets track positions in the
        // *original* input and can differ from the term length (e.g. after char
        // filters), which would over- or under-read the term buffer.
        String word = new String(textAtt.buffer(), 0, textAtt.length());
        wordList.add(word);

        // Snapshot the complete attribute state so it can be replayed in order.
        tokenAttrs.add(input.cloneAttributes());
    }
    return wordList.toArray(new String[0]);
}
 
Example #2
Source File: AnalysisImpl.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the given TokenStream to exhaustion, cloning the attribute state of
 * every token it produces and recording a {@code Token} per emission.
 *
 * @param tokenStream stream to iterate; always closed before returning
 * @param result receives one Token (term text plus copied attributes) per emitted token
 * @return the cloned per-token attribute sources, in emission order
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
  final List<AttributeSource> collected = new ArrayList<>();
  try {
    tokenStream.reset();
    final CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      collected.add(tokenStream.cloneAttributes());
      final List<TokenAttribute> attrCopies = copyAttributes(tokenStream, termAtt);
      result.add(new Token(termAtt.toString(), attrCopies));
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return collected;
}
 
Example #3
Source File: TestTermAutomatonQuery.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
@Override
public boolean incrementToken() throws IOException {
  if (synNext) {
    // Re-emit the previous token's state as a synonym at the same position.
    AttributeSource.State state = captureState();
    clearAttributes();
    restoreState(state);
    posIncAtt.setPositionIncrement(0);
    // BUG FIX: the cast must cover the whole sum. The original
    // "" + ((char) 97 + random().nextInt(3)) casts only 97, so the char is
    // promoted back to int and the appended text was "97".."99" instead of
    // a random letter 'a'..'c'.
    termAtt.append((char) (97 + random().nextInt(3)));
    synNext = false;
    return true;
  }

  if (input.incrementToken()) {
    // Randomly (1 in 10) schedule a synonym for the token just produced.
    if (random().nextInt(10) == 8) {
      synNext = true;
    }
    return true;
  } else {
    return false;
  }
}
 
Example #4
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Drives the given TokenStream and captures a cloned attribute state for each
 * token, tagging every token with its absolute position via TokenTrackingAttribute.
 *
 * @param tokenStream stream to iterate; always closed before returning
 * @return the cloned per-token attribute sources, in emission order
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> collected = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int absolutePosition = 0;
    while (tokenStream.incrementToken()) {
      // Accumulate increments into an absolute position before cloning, so the
      // clone carries the tracker value.
      absolutePosition += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(absolutePosition);
      collected.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }

  return collected;
}
 
Example #5
Source File: FuzzyTermsEnum.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a fuzzy terms enumeration over {@code terms} for {@code term}.
 * Attribute-based state (boost, automaton) is wired through the shared
 * {@code atts} AttributeSource so callers can observe/influence enumeration.
 */
private FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, Supplier<FuzzyAutomatonBuilder> automatonBuilder) throws IOException {

    this.terms = terms;
    this.atts = atts;
    this.term = term;

    // Boost attributes are shared with the caller via the AttributeSource.
    this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    this.boostAtt = atts.addAttribute(BoostAttribute.class);

    // Register the concrete impl first so addAttribute below binds to it,
    // then lazily initialize it with the automaton builder.
    atts.addAttributeImpl(new AutomatonAttributeImpl());
    AutomatonAttribute aa = atts.addAttribute(AutomatonAttribute.class);
    aa.init(automatonBuilder);

    this.automata = aa.getAutomata();
    this.termLength = aa.getTermLength();
    // One automaton per edit distance 0..maxEdits, hence length - 1.
    this.maxEdits = this.automata.length - 1;

    // Seed the competitive-boost floor from the shared attribute and notify.
    bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
    bottomTerm = maxBoostAtt.getCompetitiveTerm();
    bottomChanged(null);
  }
 
Example #6
Source File: ConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Merges the attribute sets of all given streams into one AttributeSource,
 * seeded from a clone of the first stream's attributes.
 *
 * @throws IllegalArgumentException if the streams declare incompatible attribute types
 */
private static AttributeSource combineSources(TokenStream... sources) {
  final AttributeSource combined = sources[0].cloneAttributes();
  try {
    for (int i = 1; i < sources.length; i++) {
      // Register every attribute class the i-th stream uses.
      final Iterator<Class<? extends Attribute>> classes = sources[i].getAttributeClassesIterator();
      while (classes.hasNext()) {
        combined.addAttribute(classes.next());
      }
      // check attributes can be captured
      sources[i].copyTo(combined);
    }
  } catch (IllegalArgumentException e) {
    throw new IllegalArgumentException("Attempted to concatenate TokenStreams with different attribute types", e);
  }
  return combined;
}
 
Example #7
Source File: FuzzyQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  // Zero allowed edits degenerates the fuzzy query into an exact-term match.
  if (maxEdits == 0) {
    return new SingleTermsEnum(terms.iterator(), term.bytes());
  }
  return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
}
 
Example #8
Source File: WordDelimiterFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
protected void swap(int i, int j) {
  // Keep the three parallel arrays (attribute states, start offsets,
  // position increments) in sync by swapping the same pair in each.
  final AttributeSource.State stateTmp = buffered[i];
  buffered[i] = buffered[j];
  buffered[j] = stateTmp;

  final int offTmp = startOff[i];
  startOff[i] = startOff[j];
  startOff[j] = offTmp;

  final int incTmp = posInc[i];
  posInc[i] = posInc[j];
  posInc[j] = incTmp;
}
 
Example #9
Source File: AnalysisImpl.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public boolean incrementToken() {
  if (!tokenIterator.hasNext()) {
    return false;
  }
  clearAttributes();
  final AttributeSource source = tokenIterator.next();
  // Mirror any attributes the stored token carries before copying its state.
  addAttributes(source);
  source.copyTo(this);
  return true;
}
 
Example #10
Source File: AnalysisImpl.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls
 * @param tokens Source of tokens to be used
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  // Share the factory so this stream creates attribute impls of the same classes.
  super(attributeSource.getAttributeFactory());
  this.tokens = tokens;
  // Make sure all the attributes of the source are here too
  addAttributes(attributeSource);
}
 
Example #11
Source File: TermsQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
  // An empty term set can never match anything.
  return this.terms.size() == 0
      ? TermsEnum.EMPTY
      : new SeekingTermSetTermsEnum(terms.iterator(), this.terms, ords);
}
 
Example #12
Source File: WikipediaTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Collapses a run of same-typed wiki tokens into a single untokenized token
 * (emitted by the current attributes) while also capturing each individual
 * token's state into {@code tokens} for later replay.
 */
private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
  //collapse
  StringBuilder buffer = new StringBuilder(32);
  // setText appends the scanner's current token text into buffer;
  // NOTE(review): return value presumably is the number of chars added — confirm.
  int numAdded = scanner.setText(buffer);
  //TODO: how to know how much whitespace to add
  int theStart = scanner.yychar();
  int lastPos = theStart + numAdded;
  int tmpTokType;
  int numSeen = 0;
  List<AttributeSource.State> tmp = new ArrayList<>();
  // Capture the first token's state with position increment 0.
  setupSavedToken(0, type);
  tmp.add(captureState());
  //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
  while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
    int currPos = scanner.yychar();
    //append whitespace
    for (int i = 0; i < (currPos - lastPos); i++){
      buffer.append(' ');
    }
    numAdded = scanner.setText(buffer);
    setupSavedToken(scanner.getPositionIncrement(), type);
    tmp.add(captureState());
    numSeen++;
    lastPos = currPos + numAdded;
  }
  //trim the buffer
  // TODO: this is inefficient
  String s = buffer.toString().trim();
  // Emit the collapsed text as one untokenized token spanning the whole run.
  termAtt.setEmpty().append(s);
  offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
  flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
  //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
  if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
    scanner.yypushback(scanner.yylength());
  }
  // Saved states are replayed by incrementToken() before scanning continues.
  tokens = tmp.iterator();
}
 
Example #13
Source File: WikipediaTokenizer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() throws IOException {
  // First drain any token states saved by a previous collapse-and-save pass.
  if (tokens != null && tokens.hasNext()) {
    restoreState(tokens.next());
    return true;
  }
  clearAttributes();
  final int tokenType = scanner.getNextToken();
  if (tokenType == WikipediaTokenizerImpl.YYEOF) {
    return false;
  }
  final String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
  final boolean untokenized = untokenizedTypes.contains(type);
  if (tokenOutput == TOKENS_ONLY || !untokenized) {
    setupToken();
  } else if (tokenOutput == UNTOKENIZED_ONLY && untokenized) {
    collapseTokens(tokenType);
  } else if (tokenOutput == BOTH) {
    // Collapse into a single token, add it to tokens AND output the
    // individual tokens — the untokenized token is emitted first.
    collapseAndSaveTokens(tokenType, type);
  }
  int posinc = scanner.getPositionIncrement();
  if (first && posinc == 0) {
    posinc = 1; // don't emit posinc=0 for the first token!
  }
  posIncrAtt.setPositionIncrement(posinc);
  typeAtt.setType(type);
  first = false;
  return true;
}
 
Example #14
Source File: TestRandomChains.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Builds a random argument array matching {@code paramTypes} for a tokenizer ctor. */
static Object[] newTokenizerArgs(Random random, Class<?>[] paramTypes) {
  final Object[] args = new Object[paramTypes.length];
  int idx = 0;
  for (Class<?> paramType : paramTypes) {
    if (paramType == AttributeSource.class) {
      // TODO: args[idx] = new AttributeSource();
      // this is currently too scary to deal with!
      args[idx] = null; // force IAE
    } else {
      args[idx] = newRandomArg(random, paramType);
    }
    idx++;
  }
  return args;
}
 
Example #15
Source File: TeeSinkTokenFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() {
  if (it.hasNext()) {
    // Replay the next buffered attribute state into this stream.
    restoreState(it.next());
    return true;
  }
  return false;
}
 
Example #16
Source File: MockSynonymFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() throws IOException {
  // Pending synonyms/buffered tokens are emitted before pulling new input.
  if (tokenQueue.size() > 0) {
    tokenQueue.remove(0).copyTo(this);
    return true;
  }
  if (endOfInput == false && input.incrementToken()) {
    if (termAtt.toString().equals("dogs")) {
      addSynonymAndRestoreOrigToken("dog", 1, offsetAtt.endOffset());
    } else if (termAtt.toString().equals("guinea")) {
      // Possible multi-token synonym: "guinea pig" -> "cavy".
      // Save the current token, then look ahead one token.
      AttributeSource firstSavedToken = cloneAttributes();
      if (input.incrementToken()) {
        if (termAtt.toString().equals("pig")) {
          AttributeSource secondSavedToken = cloneAttributes();
          int secondEndOffset = offsetAtt.endOffset();
          // Emit "cavy" spanning both tokens, then queue "pig" for later.
          firstSavedToken.copyTo(this);
          addSynonym("cavy", 2, secondEndOffset);
          tokenQueue.add(secondSavedToken);
        } else if (termAtt.toString().equals("dogs")) {
          // Lookahead was "dogs": queue it and its synonym.
          tokenQueue.add(cloneAttributes());
          addSynonym("dog", 1, offsetAtt.endOffset());
        }
      } else {
        endOfInput = true;
      }
      // Restore "guinea" as the token emitted now; queued tokens follow.
      firstSavedToken.copyTo(this);
    }
    return true;
  } else {
    endOfInput = true;
    return false;
  }
}
 
Example #17
Source File: NumericTokenizer.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
    return new AttributeFactory() {
        @Override
        public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
            // Delegate to the source so both sides share the same attribute
            // instances rather than creating independent copies.
            return (AttributeImpl) source.addAttribute(attClass);
        }
    };
}
 
Example #18
Source File: FieldInvertState.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Sets attributeSource to a new instance.
 */
void setAttributeSource(AttributeSource attributeSource) {
  // Only rebind cached attribute references when the source actually changed.
  if (this.attributeSource != attributeSource) {
    this.attributeSource = attributeSource;
    // NOTE(review): getAttribute (vs addAttribute below) implies the term
    // attribute is expected to already exist on the source — confirm.
    termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
    termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
    posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
    // Payload is optional; getAttribute may yield no impl if absent — TODO confirm.
    payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
  }
}
 
Example #19
Source File: GraphTokenStreamFiniteStrings.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the list of tokens that start at the provided state
 */
public List<AttributeSource> getTerms(int state) {
  final List<AttributeSource> result = new ArrayList<>();
  final int transitionCount = det.initTransition(state, transition);
  for (int t = 0; t < transitionCount; t++) {
    det.getNextTransition(transition);
    // Each transition covers the inclusive label range [min, max]; collect
    // the corresponding tokens from the backing array.
    for (int label = transition.min; label <= transition.max; label++) {
      result.add(this.tokens[label]);
    }
  }
  return result;
}
 
Example #20
Source File: LegacyNumericRangeQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override @SuppressWarnings("unchecked")
protected TermsEnum getTermsEnum(final Terms terms, AttributeSource atts) throws IOException {
  // very strange: java.lang.Number itself is not Comparable, but all subclasses used here are
  final boolean emptyRange = min != null && max != null && ((Comparable<T>) min).compareTo(max) > 0;
  return emptyRange ? TermsEnum.EMPTY : new NumericRangeTermsEnum(terms.iterator());
}
 
Example #21
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new ListBasedTokenStream which uses the given tokens as its token source.
 *
 * @param attributeSource source of the attribute factory and attribute impls
 * @param tokens Source of tokens to be used
 */
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
  // Share the factory so this stream creates attribute impls of the same classes.
  super(attributeSource.getAttributeFactory());
  this.tokens = tokens;
  // Make sure all the attributes of the source are here too
  addAttributes(attributeSource);
}
 
Example #22
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public boolean incrementToken() {
  if (!tokenIterator.hasNext()) {
    return false;
  }
  clearAttributes();
  final AttributeSource stored = tokenIterator.next();
  // just in case there were delayed attribute additions
  addAttributes(stored);
  stored.copyTo(this);
  return true;
}
 
Example #23
Source File: AnalysisRequestHandlerBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Mirrors every attribute impl of the given source onto this stream. */
protected void addAttributes(AttributeSource attributeSource) {
  // note: ideally we wouldn't call addAttributeImpl which is marked internal. But nonetheless it's possible
  //  this method is used by some custom attributes, especially since Solr doesn't provide a way to customize the
  //  AttributeFactory which is the recommended way to choose which classes implement which attributes.
  for (Iterator<AttributeImpl> impls = attributeSource.getAttributeImplsIterator(); impls.hasNext(); ) {
    addAttributeImpl(impls.next()); // adds both impl & interfaces
  }
}
 
Example #24
Source File: PreAnalyzedField.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public final boolean incrementToken() {
  if (!it.hasNext()) {
    return false;
  }
  // Restore a defensive clone of the stored state.
  restoreState(it.next().clone());
  // TODO: why can't I lookup the OffsetAttribute up in ctor instead?
  lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
  return true;
}
 
Example #25
Source File: Zemberek2DeASCIIfyFilterFactory.java    From lucene-solr-analysis-turkish with Apache License 2.0 5 votes vote down vote up
/**
 * Emits {@code synonym} as a de-ASCIIfied variant of the token captured in
 * {@code current}: same position (increment 0), special token type.
 * Always returns true so the caller can return it from incrementToken().
 */
private boolean createToken(String synonym, AttributeSource.State current) {
    restoreState(current);
    termAttribute.setEmpty().append(synonym);
    typeAtt.setType(DEASCII_TOKEN_TYPE);
    // Stack the variant on the same position as the original token.
    posIncrAtt.setPositionIncrement(0);
    return true;
}
 
Example #26
Source File: OpenNLPPOSTaggerFilter.java    From jate with GNU Lesser General Public License v3.0 5 votes vote down vote up
@Override
public boolean incrementToken() throws IOException {
    //clearAttributes();
    // First call: buffer the entire document's tokens, then POS-tag them in
    // one batch (OpenNLP taggers need the full sentence/context).
    if (first) {
        //gather all tokens from doc
        String[] words = walkTokens();
        if (words.length == 0) {
            return false;
        }
        //tagging
        posTags = createTags(words);
        first = false;
        tokenIdx = 0;
    }

    // All buffered tokens replayed: reset for potential reuse and signal end.
    if (tokenIdx == tokenAttrs.size()) {
        resetParams();
        return false;
    }

    // Replay the buffered token: ensure every attribute class it carries
    // exists on this stream, then copy its state over.
    AttributeSource as = tokenAttrs.get(tokenIdx);
    Iterator<? extends Class<? extends Attribute>> it = as.getAttributeClassesIterator();
    while (it.hasNext()) {
        Class<? extends Attribute> attrClass = it.next();
        if (!hasAttribute(attrClass)) {
            addAttribute(attrClass);
        }
    }
    as.copyTo(this);
    // Merge the POS tag into the (possibly pre-existing) payload metadata.
    MWEMetadata metadata = exitingPayload.getPayload() == null ? new MWEMetadata() :
            MWEMetadata.deserialize(exitingPayload.getPayload().utf8ToString());
    metadata.addMetaData(MWEMetadataType.POS, posTags[tokenIdx]);
    exitingPayload.setPayload(new BytesRef(MWEMetadata.serialize(metadata)));
    tokenIdx++;
    return true;
}
 
Example #27
Source File: MtasPreAnalyzedParser.java    From mtas with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a serialized MtasUpdateRequestProcessorResult from {@code reader},
 * filling {@code parent}'s attributes per token and capturing one
 * AttributeSource.State per token into the returned ParseResult.
 */
@Override
public ParseResult parse(Reader reader, AttributeSource parent)
    throws IOException {
  ParseResult res = new ParseResult();

  // get MtasUpdateRequestProcessorResult
  // Slurp the full reader contents; the result format is a single string.
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  Iterator<MtasUpdateRequestProcessorResultItem> iterator;

  try (
      MtasUpdateRequestProcessorResultReader result = new MtasUpdateRequestProcessorResultReader(
          sb.toString());) {
    iterator = result.getIterator();
    if (iterator != null && iterator.hasNext()) {
      res.str = result.getStoredStringValue();
      res.bin = result.getStoredBinValue();
    } else {
      // No tokens: return an empty result (close() is redundant here since
      // try-with-resources also closes — kept as in original).
      res.str = null;
      res.bin = null;
      result.close();
      return res;
    }
    parent.clearAttributes();
    while (iterator.hasNext()) {
      MtasUpdateRequestProcessorResultItem item = iterator.next();
      // Each optional field of the item maps onto one Lucene attribute.
      if (item.tokenTerm != null) {
        CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
        catt.append(item.tokenTerm);
      }
      if (item.tokenFlags != null) {
        FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
        flags.setFlags(item.tokenFlags);
      }
      if (item.tokenPosIncr != null) {
        PositionIncrementAttribute patt = parent
            .addAttribute(PositionIncrementAttribute.class);
        patt.setPositionIncrement(item.tokenPosIncr);
      }
      if (item.tokenPayload != null) {
        PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
        p.setPayload(new BytesRef(item.tokenPayload));
      }
      if (item.tokenOffsetStart != null && item.tokenOffsetEnd != null) {
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        offset.setOffset(item.tokenOffsetStart, item.tokenOffsetEnd);
      }
      // capture state and add to result
      State state = parent.captureState();
      res.states.add(state.clone());
      // reset for reuse
      parent.clearAttributes();
    }
  } catch (IOException e) {
    // ignore
    // NOTE(review): read errors are deliberately swallowed and only logged;
    // the partially-filled result is returned as-is.
    log.debug(e);
  }
  return res;
}
 
Example #28
Source File: SingleTokenTokenizer.java    From attic-polygene-java with Apache License 2.0 4 votes vote down vote up
/** Creates a tokenizer over {@code in} sharing the given attribute source. */
public SingleTokenTokenizer( AttributeSource source, Reader in )
{
   super( source, in );
}
 
Example #29
Source File: STMergingTermsEnum.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Per-term attributes are not supported by this merging enum. */
@Override
public AttributeSource attributes() {
  throw new UnsupportedOperationException();
}
 
Example #30
Source File: SolrRangeQuery.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public AttributeSource attributes() {
  return te.attributes();
}