Java Code Examples for org.apache.lucene.analysis.tokenattributes.CharTermAttribute#length()

The following examples show how to use org.apache.lucene.analysis.tokenattributes.CharTermAttribute#length(). Each example is taken from a real open-source project; the project and source file are noted above the code.
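Before the project examples, here is a minimal, self-contained sketch of the method in use. It is not taken from any of the projects below; the field name, input text, and WhitespaceAnalyzer choice are illustrative, and it assumes lucene-core (plus the analysis-common module, depending on the Lucene version) is on the classpath.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TermLengthDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("field", "quick brown fox")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // length() is the number of valid chars in buffer(); the buffer
                // itself is reused and is usually longer than the current token.
                System.out.println(new String(termAtt.buffer(), 0, termAtt.length())
                        + " (length " + termAtt.length() + ")");
            }
            ts.end();
        }
    }
}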
Example 1
Source File: SimpleSynonymMap.java    From elasticsearch-dynamic-synonym with Apache License 2.0
private Set<String> analyze(String text) throws IOException {
    Set<String> result = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }

            result.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }

        ts.end();
        return result;
    }
}
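Since CharTermAttribute implements CharSequence, the new String(termAtt.buffer(), 0, termAtt.length()) idiom above is interchangeable with calling toString() on the attribute, which copies exactly the same valid region of the buffer:

// Equivalent: CharTermAttribute implements CharSequence, and its toString()
// copies the valid region [0, length()) of the internal buffer.
result.add(termAtt.toString());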
 
Example 2
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private ArrayList<char[]> tokenize( String input ) throws IOException {

    Log.debug( "tokenize '" + input + "'" );
    ArrayList<char[]> tokens = new ArrayList<char[]>( );
    Tokenizer tk = getTokenizerImpl( input );

    CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
    tk.reset( );
    while (tk.incrementToken( ) ) {
      int bufLen = term.length();
      char[] copy = new char[ bufLen ];
      System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
      tokens.add( copy );
    }

    return tokens;
  }
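The copy is essential: buffer() exposes the attribute's internal array, which the tokenizer overwrites on each incrementToken() call and which is usually longer than the token itself. The same copy can be written with a JDK helper (a sketch, equivalent to the arraycopy above):

// Equivalent copy using the JDK helper (requires an import of java.util.Arrays):
tokens.add( Arrays.copyOf( term.buffer( ), term.length( ) ) );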
 
Example 3
Source File: SynonymMap.java    From lucene-solr with Apache License 2.0
/** Sugar: analyzes the text with the analyzer and
 *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 *  reuse and its chars must not be null. */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt +
                                           ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
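A hedged usage sketch of this helper: parser is a hypothetical instance of the enclosing class, while CharsRefBuilder, CharsRef, and SynonymMap.WORD_SEPARATOR (the NUL character '\u0000') are real Lucene types and constants.

CharsRefBuilder scratch = new CharsRefBuilder();               // reusable across calls
CharsRef analyzed = parser.analyze("wi fi network", scratch);  // 'parser' is hypothetical
// analyzed now holds: "wi" + WORD_SEPARATOR + "fi" + WORD_SEPARATOR + "network"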
 
Example 4
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
private ArrayList<String> getTokens(TokenStream ts) throws IOException {
  ArrayList<String> tokens = new ArrayList<>();
  ts.reset();
  while (ts.incrementToken()) {
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    String token = new String(termAttribute.buffer(), 0, termAttribute.length());
    tokens.add(token);
  }
  ts.end();
  ts.close();

  return tokens;
}
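A possible driver for this test helper (a sketch; the analyzer choice is an assumption, any Lucene Analyzer works):

Analyzer analyzer = new WhitespaceAnalyzer();   // assumption: analyzer is illustrative
ArrayList<String> tokens = getTokens(analyzer.tokenStream("field", "one two two"));
// tokens -> ["one", "two", "two"]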
 
Example 5
Source File: DecompoundTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
DecompoundToken(CharSequence value, CharTermAttribute termAttribute, OffsetAttribute offsetAttribute) {
    this.value = value;
    if (offsetAttribute.endOffset() - offsetAttribute.startOffset() != termAttribute.length()) {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.endOffset();
    } else {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.startOffset() + termAttribute.length();
    }
}
 
Example 6
Source File: AutoPhrasingTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private char[] nextToken() throws IOException {
    if (input.incrementToken()) {
        CharTermAttribute termAttr = getTermAttribute();
        if (termAttr != null) {
            char[] termBuf = termAttr.buffer();
            char[] nextTok = new char[termAttr.length()];
            System.arraycopy(termBuf, 0, nextTok, 0, termAttr.length());
            return nextTok;
        }
    }
    return null;
}
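A typical consuming loop for this helper might look like the following sketch; handle(...) is a hypothetical callback, not part of the project:

char[] tok;
while ((tok = nextToken()) != null) {  // null signals that the stream is exhausted
    handle(tok);                       // handle(...) is hypothetical
}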
 
Example 7
Source File: SolrInformationServer.java    From SearchServices with GNU Lesser General Public License v3.0
private void addContentPropertyToDocUsingAlfrescoRepository(
        SolrInputDocument doc,
        QName propertyQName,
        long dbId,
        String locale) throws AuthenticationException, IOException
{
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    try (GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null)) {
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_STATUS, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_EXCEPTION, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_TIME, response);

        final String textContent = textContentFrom(response);

        if (fingerprintHasBeenEnabledOnThisInstance && !textContent.isBlank()) {
            Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
            TokenStream ts = analyzer.tokenStream("dummy_field", textContent);
            CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                StringBuilder tokenBuff = new StringBuilder();
                char[] buff = termAttribute.buffer();

                for (int i = 0; i < termAttribute.length(); i++) {
                    tokenBuff.append(Integer.toHexString(buff[i]));
                }
                doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());
            }
            ts.end();
            ts.close();
        }

        this.getTrackerStats().addDocTransformationTime(System.nanoTime() - start);

        String storedField = dataModel.getStoredContentField(propertyQName);
        doc.setField(storedField, "\u0000" + languageFrom(locale) + "\u0000" + textContent);

        dataModel.getIndexedFieldNamesForProperty(propertyQName)
                .getFields()
                .forEach(field -> addFieldIfNotSet(doc, field.getField()));
    }
}
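The fingerprint token built above is the concatenated hex encoding of the term's chars. Note that Integer.toHexString does not zero-pad, so code points below 0x10 encode to a single digit. A small standalone illustration (sketch, values chosen for the example):

char[] buff = { 0x3f, 0xa2 };
StringBuilder tokenBuff = new StringBuilder();
for (char c : buff) {
    tokenBuff.append(Integer.toHexString(c)); // '\u003f' -> "3f", '\u00a2' -> "a2"
}
// tokenBuff.toString() -> "3fa2"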
 
Example 8
Source File: ICUTransformFilter.java    From lucene-solr with Apache License 2.0
void setText(final CharTermAttribute token) {
  this.token = token;
  this.buffer = token.buffer();
  this.length = token.length();
}
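This setText belongs to the filter's internal adapter that exposes the term buffer to ICU's Replaceable API, so the transliterator can read and modify the term in place; caching buffer and length avoids going back through the attribute for every character. A sketch of how such cached fields are typically consumed (Replaceable-style accessors; the surrounding class is elided and the method bodies are an assumption):

// Sketch: accessors over the cached state, in the shape ICU's Replaceable expects.
public int length() { return length; }
public char charAt(int pos) { return buffer[pos]; }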
 
Example 9
Source File: JsonPreAnalyzedParser.java    From lucene-solr with Apache License 2.0
@Override
public String toFormattedString(Field f) throws IOException {
  Map<String,Object> map = new LinkedHashMap<>();
  map.put(VERSION_KEY, VERSION);
  if (f.fieldType().stored()) {
    String stringValue = f.stringValue();
    if (stringValue != null) {
      map.put(STRING_KEY, stringValue);
    }
    BytesRef binaryValue = f.binaryValue();
    if (binaryValue != null) {
      map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    List<Map<String,Object>> tokens = new LinkedList<>();
    while (ts.incrementToken()) {
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      Map<String,Object> tok = new TreeMap<>();
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute)att;
          cTerm = new String(catt.buffer(), 0, catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att;
          tTerm = tatt.getBytesRef().utf8ToString();
        } else {
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute)att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.put(OFFSET_START_KEY, ((OffsetAttribute)att).startOffset());
            tok.put(OFFSET_END_KEY, ((OffsetAttribute)att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute)att).getPayload();
            if (p != null && p.length > 0) {
              tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.put(POSINCR_KEY, ((PositionIncrementAttribute)att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.put(TYPE_KEY, ((TypeAttribute)att).type());
          } else {
            tok.put(cl.getName(), att.toString());
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        tok.put(TOKEN_KEY, term);
      }
      tokens.add(tok);
    }
    map.put(TOKENS_KEY, tokens);
  }
  return JSONUtil.toJSON(map, -1);
}
 
Example 10
Source File: IcuTransformTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
void setText(final CharTermAttribute token) {
    this.token = token;
    this.buffer = token.buffer();
    this.length = token.length();
}