Java Code Examples for org.apache.lucene.analysis.tokenattributes.CharTermAttribute#length()

The following examples show how to use org.apache.lucene.analysis.tokenattributes.CharTermAttribute#length(). Each example is taken from a real open-source project; the project and source file are noted above the code.
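Before the project examples, here is a minimal, self-contained sketch of the method in use. It is not taken from any of the projects below; the field name, input text, and WhitespaceAnalyzer choice are illustrative, and it assumes lucene-core (plus the analysis-common module, depending on the Lucene version) is on the classpath.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TermLengthDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("field", "quick brown fox")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // length() is the number of valid chars in buffer(); the buffer
                // itself is reused and is usually longer than the current token.
                System.out.println(new String(termAtt.buffer(), 0, termAtt.length())
                        + " (length " + termAtt.length() + ")");
            }
            ts.end();
        }
    }
}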
Example 1
Source File: SimpleSynonymMap.java    From elasticsearch-dynamic-synonym with Apache License 2.0
private Set<String> analyze(String text) throws IOException {
    Set<String> result = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }

            result.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }

        ts.end();
        return result;
    }
}
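Since CharTermAttribute implements CharSequence, the new String(termAtt.buffer(), 0, termAtt.length()) idiom above is interchangeable with calling toString() on the attribute, which copies exactly the same valid region of the buffer:

// Equivalent: CharTermAttribute implements CharSequence, and its toString()
// copies the valid region [0, length()) of the internal buffer.
result.add(termAtt.toString());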
 
Example 2
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private ArrayList<char[]> tokenize( String input ) throws IOException {

    Log.debug( "tokenize '" + input + "'" );
    ArrayList<char[]> tokens = new ArrayList<char[]>( );
    Tokenizer tk = getTokenizerImpl( input );

    CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
    tk.reset( );
    while (tk.incrementToken( ) ) {
      int bufLen = term.length();
      char[] copy = new char[ bufLen ];
      System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
      tokens.add( copy );
    }

    return tokens;
  }
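The copy is essential: buffer() exposes the attribute's internal array, which the tokenizer overwrites on each incrementToken() call and which is usually longer than the token itself. The same copy can be written with a JDK helper (a sketch, equivalent to the arraycopy above):

// Equivalent copy using the JDK helper (requires an import of java.util.Arrays):
tokens.add( Arrays.copyOf( term.buffer( ), term.length( ) ) );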
 
Example 3
Source File: SynonymMap.java    From lucene-solr with Apache License 2.0
/** Sugar: analyzes the text with the analyzer and
 *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 *  reuse and its chars must not be null. */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt +
                                           ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
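A hedged usage sketch of this helper: parser is a hypothetical instance of the enclosing class, while CharsRefBuilder, CharsRef, and SynonymMap.WORD_SEPARATOR (the NUL character '\u0000') are real Lucene types and constants.

CharsRefBuilder scratch = new CharsRefBuilder();               // reusable across calls
CharsRef analyzed = parser.analyze("wi fi network", scratch);  // 'parser' is hypothetical
// analyzed now holds: "wi" + WORD_SEPARATOR + "fi" + WORD_SEPARATOR + "network"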
 
Example 4
Source File: MinHashFilterTest.java    From lucene-solr with Apache License 2.0
private ArrayList<String> getTokens(TokenStream ts) throws IOException {
  ArrayList<String> tokens = new ArrayList<>();
  ts.reset();
  while (ts.incrementToken()) {
    CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
    String token = new String(termAttribute.buffer(), 0, termAttribute.length());
    tokens.add(token);
  }
  ts.end();
  ts.close();

  return tokens;
}
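A possible driver for this test helper (a sketch; the analyzer choice is an assumption, any Lucene Analyzer works):

Analyzer analyzer = new WhitespaceAnalyzer();   // assumption: analyzer is illustrative
ArrayList<String> tokens = getTokens(analyzer.tokenStream("field", "one two two"));
// tokens -> ["one", "two", "two"]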
 
Example 5
Source File: DecompoundTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
DecompoundToken(CharSequence value, CharTermAttribute termAttribute, OffsetAttribute offsetAttribute) {
    this.value = value;
    if (offsetAttribute.endOffset() - offsetAttribute.startOffset() != termAttribute.length()) {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.endOffset();
    } else {
        this.startOffset = offsetAttribute.startOffset();
        this.endOffset = offsetAttribute.startOffset() + termAttribute.length();
    }
}
 
Example 6
Source File: AutoPhrasingTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private char[] nextToken() throws IOException {
    if (input.incrementToken()) {
        CharTermAttribute termAttr = getTermAttribute();
        if (termAttr != null) {
            char[] termBuf = termAttr.buffer();
            char[] nextTok = new char[termAttr.length()];
            System.arraycopy(termBuf, 0, nextTok, 0, termAttr.length());
            return nextTok;
        }
    }
    return null;
}
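A typical consuming loop for this helper might look like the following sketch; handle(...) is a hypothetical callback, not part of the project:

char[] tok;
while ((tok = nextToken()) != null) {  // null signals that the stream is exhausted
    handle(tok);                       // handle(...) is hypothetical
}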
 
Example 7
Source File: SolrInformationServer.java    From SearchServices with GNU Lesser General Public License v3.0
private void addContentPropertyToDocUsingAlfrescoRepository(
        SolrInputDocument doc,
        QName propertyQName,
        long dbId,
        String locale) throws AuthenticationException, IOException
{
    long start = System.nanoTime();

    // Expensive call to be done with ContentTracker
    try (GetTextContentResponse response = repositoryClient.getTextContent(dbId, propertyQName, null)) {
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_STATUS, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_EXCEPTION, response);
        addContentPropertyMetadata(doc, propertyQName, AlfrescoSolrDataModel.SpecializedFieldType.TRANSFORMATION_TIME, response);

        final String textContent = textContentFrom(response);

        if (fingerprintHasBeenEnabledOnThisInstance && !textContent.isBlank()) {
            Analyzer analyzer = core.getLatestSchema().getFieldType("min_hash").getIndexAnalyzer();
            TokenStream ts = analyzer.tokenStream("dummy_field", textContent);
            CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                StringBuilder tokenBuff = new StringBuilder();
                char[] buff = termAttribute.buffer();

                for (int i = 0; i < termAttribute.length(); i++) {
                    tokenBuff.append(Integer.toHexString(buff[i]));
                }
                doc.addField(FINGERPRINT_FIELD, tokenBuff.toString());
            }
            ts.end();
            ts.close();
        }

        this.getTrackerStats().addDocTransformationTime(System.nanoTime() - start);

        String storedField = dataModel.getStoredContentField(propertyQName);
        doc.setField(storedField, "\u0000" + languageFrom(locale) + "\u0000" + textContent);

        dataModel.getIndexedFieldNamesForProperty(propertyQName)
                .getFields()
                .forEach(field -> addFieldIfNotSet(doc, field.getField()));
    }
}
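The fingerprint token built above is the concatenated hex encoding of the term's chars. Note that Integer.toHexString does not zero-pad, so code points below 0x10 encode to a single digit. A small standalone illustration (sketch, values chosen for the example):

char[] buff = { 0x3f, 0xa2 };
StringBuilder tokenBuff = new StringBuilder();
for (char c : buff) {
    tokenBuff.append(Integer.toHexString(c)); // '\u003f' -> "3f", '\u00a2' -> "a2"
}
// tokenBuff.toString() -> "3fa2"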
 
Example 8
Source File: ICUTransformFilter.java    From lucene-solr with Apache License 2.0
void setText(final CharTermAttribute token) {
  this.token = token;
  this.buffer = token.buffer();
  this.length = token.length();
}
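This setText belongs to the filter's internal adapter that exposes the term buffer to ICU's Replaceable API, so the transliterator can read and modify the term in place; caching buffer and length avoids going back through the attribute for every character. A sketch of how such cached fields are typically consumed (Replaceable-style accessors; the surrounding class is elided and the method bodies are an assumption):

// Sketch: accessors over the cached state, in the shape ICU's Replaceable expects.
public int length() { return length; }
public char charAt(int pos) { return buffer[pos]; }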
 
Example 9
Source File: JsonPreAnalyzedParser.java    From lucene-solr with Apache License 2.0
@Override
public String toFormattedString(Field f) throws IOException {
  Map<String,Object> map = new LinkedHashMap<>();
  map.put(VERSION_KEY, VERSION);
  if (f.fieldType().stored()) {
    String stringValue = f.stringValue();
    if (stringValue != null) {
      map.put(STRING_KEY, stringValue);
    }
    BytesRef binaryValue = f.binaryValue();
    if (binaryValue != null) {
      map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    List<Map<String,Object>> tokens = new LinkedList<>();
    while (ts.incrementToken()) {
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      Map<String,Object> tok = new TreeMap<>();
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute)att;
          cTerm = new String(catt.buffer(), 0, catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att;
          tTerm = tatt.getBytesRef().utf8ToString();
        } else {
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute)att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.put(OFFSET_START_KEY, ((OffsetAttribute)att).startOffset());
            tok.put(OFFSET_END_KEY, ((OffsetAttribute)att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute)att).getPayload();
            if (p != null && p.length > 0) {
              tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.put(POSINCR_KEY, ((PositionIncrementAttribute)att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.put(TYPE_KEY, ((TypeAttribute)att).type());
          } else {
            tok.put(cl.getName(), att.toString());
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        tok.put(TOKEN_KEY, term);
      }
      tokens.add(tok);
    }
    map.put(TOKENS_KEY, tokens);
  }
  return JSONUtil.toJSON(map, -1);
}
 
Example 10
Source File: IcuTransformTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
void setText(final CharTermAttribute token) {
    this.token = token;
    this.buffer = token.buffer();
    this.length = token.length();
}