org.apache.lucene.util.CharsRefBuilder Java Examples
The following examples show how to use
org.apache.lucene.util.CharsRefBuilder.
Each example links to its original project and source file.
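Before looking at the individual examples, it helps to know the core API: CharsRefBuilder is a reusable, growable char[] buffer, most often used to decode UTF-8 encoded BytesRef terms back into Java strings without allocating a fresh buffer per term. Here is a minimal self-contained sketch of the calls that recur throughout the examples below (the demo class name is invented for illustration):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class CharsRefBuilderDemo {
  public static void main(String[] args) {
    CharsRefBuilder builder = new CharsRefBuilder();

    // Decode a UTF-8 encoded term into the builder's char buffer.
    BytesRef utf8 = new BytesRef("hello");
    builder.copyUTF8Bytes(utf8);
    System.out.println(builder.toString()); // hello

    // The builder is reusable: clear it, then append pieces.
    builder.clear();
    builder.append("foo");
    builder.append("bar");
    CharsRef view = builder.get(); // a view over the builder's buffer, not a copy
    System.out.println(view.length); // 6
  }
}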
Example #1
Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0
public void testRandomUnicodeStrings() throws Throwable {
  char[] buffer = new char[20];
  char[] expected = new char[20];

  CharsRefBuilder utf16 = new CharsRefBuilder();

  int num = atLeast(10000);
  for (int iter = 0; iter < num; iter++) {
    boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(buffer, 0, 20));
    if (!hasIllegal) {
      byte[] b = new String(buffer, 0, 20).getBytes(StandardCharsets.UTF_8);
      assertEquals(b.length, utf8.length);
      for (int i = 0; i < b.length; i++)
        assertEquals(b[i], utf8.bytes[i]);
    }

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals(utf16.length(), 20);
    for (int i = 0; i < 20; i++)
      assertEquals(expected[i], utf16.charAt(i));
  }
}
Example #2
Source File: TermSuggester.java From Elasticsearch with Apache License 2.0
@Override
public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion,
    IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
  DirectSpellChecker directSpellChecker =
      SuggestUtils.getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings());
  final IndexReader indexReader = searcher.getIndexReader();
  TermSuggestion response = new TermSuggestion(
      name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()
  );
  List<Token> tokens = queryTerms(suggestion, spare);
  for (Token token : tokens) {
    // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef
    SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar(
        token.term, suggestion.getShardSize(), indexReader,
        suggestion.getDirectSpellCheckerSettings().suggestMode()
    );
    Text key = new Text(new BytesArray(token.term.bytes()));
    TermSuggestion.Entry resultEntry =
        new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset);
    for (SuggestWord suggestWord : suggestedWords) {
      Text word = new Text(suggestWord.string);
      resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score));
    }
    response.addTerm(resultEntry);
  }
  return response;
}
Example #3
Source File: XAnalyzingSuggester.java From Elasticsearch with Apache License 2.0
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
  LookupResult result;
  if (hasPayloads) {
    int sepIndex = -1;
    for (int i = 0; i < output2.length; i++) {
      if (output2.bytes[output2.offset + i] == payloadSep) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    final int payloadLen = output2.length - sepIndex - 1;
    spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex + 1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.copyUTF8Bytes(output2);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }
  return result;
}
Example #4
Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms,
    TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
  // start term, optimized writing
  BytesRef term = termIter.next();
  spare.copyUTF8Bytes(term);
  builder.startObject(spare.toString());
  buildTermStatistics(builder, termIter);

  // finally write the term vectors
  PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
  int termFreq = posEnum.freq();
  builder.field(FieldStrings.TERM_FREQ, termFreq);
  initMemory(curTerms, termFreq);
  initValues(curTerms, posEnum, termFreq);
  buildValues(builder, curTerms, termFreq);
  buildScore(builder, boostAtt);
  builder.endObject();
}
Example #5
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName)
    throws IOException {
  Map<String, Int> termFreqMap = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
Example #6
Source File: AnalyzingSuggester.java From lucene-solr with Apache License 2.0
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
  LookupResult result;
  if (hasPayloads) {
    int sepIndex = -1;
    for (int i = 0; i < output2.length; i++) {
      if (output2.bytes[output2.offset + i] == PAYLOAD_SEP) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    spare.grow(sepIndex);
    final int payloadLen = output2.length - sepIndex - 1;
    spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex + 1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.grow(output2.length);
    spare.copyUTF8Bytes(output2);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }
  return result;
}
Example #7
Source File: JaspellLookup.java From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  count = 0;
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);
  BytesRef spare;
  final CharsRefBuilder charsSpare = new CharsRefBuilder();

  while ((spare = iterator.next()) != null) {
    final long weight = iterator.weight();
    if (spare.length == 0) {
      continue;
    }
    charsSpare.copyUTF8Bytes(spare);
    trie.put(charsSpare.toString(), weight);
    count++;
  }
}
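Worth calling out: the loop above is the canonical CharsRefBuilder reuse idiom, which also appears in Examples #8, #9, and #13 below. One builder is allocated outside the loop, and each term is decoded in place. Stripped of the suggester specifics, the idiom looks like this sketch, where termsEnum stands in for any source of UTF-8 BytesRef values:

CharsRefBuilder charsSpare = new CharsRefBuilder(); // one builder for the whole loop
BytesRef spare;
while ((spare = termsEnum.next()) != null) {
  charsSpare.copyUTF8Bytes(spare);     // decodes UTF-8 in place, growing the buffer as needed
  String term = charsSpare.toString(); // materialize a String only where one is required
  // ... use term ...
}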
Example #8
Source File: FSTCompletionLookup.java From lucene-solr with Apache License 2.0
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  final List<Completion> completions;
  if (higherWeightsFirst) {
    completions = higherWeightsCompletion.lookup(key, num);
  } else {
    completions = normalCompletion.lookup(key, num);
  }

  final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
  CharsRefBuilder spare = new CharsRefBuilder();
  for (Completion c : completions) {
    spare.copyUTF8Bytes(c.utf8);
    results.add(new LookupResult(spare.toString(), c.bucket));
  }
  return results;
}
Example #9
Source File: TSTLookup.java From lucene-solr with Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  root = new TernaryTreeNode();

  // make sure it's sorted and the comparator uses UTF16 sort order
  iterator = new SortedInputIterator(tempDir, tempFileNamePrefix, iterator, utf8SortedAsUTF16SortOrder);
  count = 0;
  ArrayList<String> tokens = new ArrayList<>();
  ArrayList<Number> vals = new ArrayList<>();
  BytesRef spare;
  CharsRefBuilder charsSpare = new CharsRefBuilder();
  while ((spare = iterator.next()) != null) {
    charsSpare.copyUTF8Bytes(spare);
    tokens.add(charsSpare.toString());
    vals.add(Long.valueOf(iterator.weight()));
    count++;
  }
  autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
Example #10
Source File: TestSuggestField.java From lucene-solr with Apache License 2.0
@Test
public void testReservedChars() throws Exception {
  CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
  charsRefBuilder.append("sugg");
  charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL);
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x1f]"));

  charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.HOLE_CHARACTER);
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x1e]"));

  charsRefBuilder.setCharAt(2, (char) NRTSuggesterBuilder.END_BYTE);
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x0]"));
}
Example #11
Source File: DutchAnalyzer.java From lucene-solr with Apache License 2.0
public DutchAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
  this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
  this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
  if (stemOverrideDict.isEmpty()) {
    this.stemdict = null;
  } else {
    // we don't need to ignore case here since we lowercase in this analyzer anyway
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator();
    CharsRefBuilder spare = new CharsRefBuilder();
    while (iter.hasNext()) {
      char[] nextKey = iter.nextKey();
      spare.copyChars(nextKey, 0, nextKey.length);
      builder.add(spare.get(), iter.currentValue());
    }
    try {
      this.stemdict = builder.build();
    } catch (IOException ex) {
      throw new RuntimeException("can not build stem dict", ex);
    }
  }
}
Example #12
Source File: SynonymMap.java From lucene-solr with Apache License 2.0
/** Sugar: just joins the provided terms with {@link
 *  SynonymMap#WORD_SEPARATOR}. reuse and its chars
 *  must not be null. */
public static CharsRef join(String[] words, CharsRefBuilder reuse) {
  int upto = 0;
  char[] buffer = reuse.chars();
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars();
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }

    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.setLength(upto);
  return reuse.get();
}
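For context, this is the helper the tests further down invoke as SynonymMap.Builder.join (see Example #15). A short usage sketch follows; note that WORD_SEPARATOR is the '\u0000' character, so the joined form is an internal representation rather than display text:

CharsRefBuilder reuse = new CharsRefBuilder();
CharsRef joined = SynonymMap.Builder.join(new String[] {"new", "york"}, reuse);
// joined views reuse's buffer and holds "new" + WORD_SEPARATOR + "york";
// passing the same builder to the next join call reuses that buffer.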
Example #13
Source File: QueryAutoStopWordAnalyzer.java From lucene-solr with Apache License 2.0
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Document frequency terms should be above in order to be stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;

  for (String field : fields) {
    Set<String> stopWords = new HashSet<>();
    Terms terms = MultiTerms.getTerms(indexReader, field);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (terms != null) {
      TermsEnum te = terms.iterator();
      BytesRef text;
      while ((text = te.next()) != null) {
        if (te.docFreq() > maxDocFreq) {
          spare.copyUTF8Bytes(text);
          stopWords.add(spare.toString());
        }
      }
    }
    stopWordsPerField.put(field, stopWords);
  }
}
Example #14
Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0
@Test
public void testWithStopword() throws Exception {
  for (boolean preservePosInc : new boolean[]{true, false}) {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
    tokenStream.setReader(new StringReader(input));
    TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
    ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
    CharsRefBuilder builder = new CharsRefBuilder();
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("keyword");
//    if (preservePosInc) { LUCENE-8344 uncomment
//      builder.append(SEP_LABEL);
//    }
    assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
  }
}
Example #15
Source File: TestLimitTokenPositionFilter.java From lucene-solr with Apache License 2.0
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();
    @SuppressWarnings("deprecation")
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
Example #16
Source File: PointField.java From lucene-solr with Apache License 2.0
@Override
public CharsRef indexedToReadable(BytesRef indexedForm, CharsRefBuilder charsRef) {
  final String value = indexedToReadable(indexedForm);
  charsRef.grow(value.length());
  charsRef.setLength(value.length());
  value.getChars(0, charsRef.length(), charsRef.chars(), 0);
  return charsRef.get();
}
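The grow/setLength/getChars triple above is this page's recurring pattern for copying a Java String into a reusable CharsRefBuilder without an intermediate char[] allocation. In isolation it looks like this (a sketch, not library code):

CharsRefBuilder charsRef = new CharsRefBuilder();
String value = "42"; // any readable form
charsRef.grow(value.length());      // make sure the backing buffer is large enough
charsRef.setLength(value.length()); // set the logical length before copying
value.getChars(0, value.length(), charsRef.chars(), 0); // copy directly into the buffer
// charsRef.get() now views exactly the copied characters

EnumField and TrieField below (Examples #17 and #19) use the identical sequence.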
Example #17
Source File: EnumField.java From lucene-solr with Apache License 2.0
@Override
public CharsRef indexedToReadable(BytesRef input, CharsRefBuilder output) {
  final Integer intValue = LegacyNumericUtils.prefixCodedToInt(input);
  final String stringValue = enumMapping.intValueToStringValue(intValue);
  output.grow(stringValue.length());
  output.setLength(stringValue.length());
  stringValue.getChars(0, output.length(), output.chars(), 0);
  return output.get();
}
Example #18
Source File: ESWordnetSynonymParser.java From crate with Apache License 2.0
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (lenient) {
      LOGGER.info("Synonym rule for [" + text + "] was ignored");
      return new CharsRef("");
    } else {
      throw ex;
    }
  }
}
Example #19
Source File: TrieField.java From lucene-solr with Apache License 2.0
@Override
public CharsRef indexedToReadable(BytesRef indexedForm, CharsRefBuilder charsRef) {
  final String value;
  switch (type) {
    case INTEGER:
      value = Integer.toString(LegacyNumericUtils.prefixCodedToInt(indexedForm));
      break;
    case FLOAT:
      value = Float.toString(NumericUtils.sortableIntToFloat(LegacyNumericUtils.prefixCodedToInt(indexedForm)));
      break;
    case LONG:
      value = Long.toString(LegacyNumericUtils.prefixCodedToLong(indexedForm));
      break;
    case DOUBLE:
      value = Double.toString(NumericUtils.sortableLongToDouble(LegacyNumericUtils.prefixCodedToLong(indexedForm)));
      break;
    case DATE:
      value = Instant.ofEpochMilli(LegacyNumericUtils.prefixCodedToLong(indexedForm)).toString();
      break;
    default:
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
  }
  charsRef.grow(value.length());
  charsRef.setLength(value.length());
  value.getChars(0, charsRef.length(), charsRef.chars(), 0);
  return charsRef.get();
}
Example #20
Source File: ESSolrSynonymParser.java From crate with Apache License 2.0
@Override
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try {
    return super.analyze(text, reuse);
  } catch (IllegalArgumentException ex) {
    if (lenient) {
      LOGGER.info("Synonym rule for [" + text + "] was ignored");
      return new CharsRef("");
    } else {
      throw ex;
    }
  }
}
Example #21
Source File: ExpandComponent.java From lucene-solr with Apache License 2.0
@SuppressWarnings({"unchecked"}) private void addGroupSliceToOutputMap(FieldType fieldType, IntObjectHashMap<BytesRef> ordBytes, @SuppressWarnings({"rawtypes"})NamedList outMap, CharsRefBuilder charsRef, long groupValue, DocSlice slice) { if(fieldType instanceof StrField) { final BytesRef bytesRef = ordBytes.get((int)groupValue); fieldType.indexedToReadable(bytesRef, charsRef); String group = charsRef.toString(); outMap.add(group, slice); } else { outMap.add(numericToString(fieldType, groupValue), slice); } }
Example #22
Source File: MoreLikeThisHandler.java From lucene-solr with Apache License 2.0
public DocListAndSet getMoreLikeThis(Reader reader, int start, int rows, List<Query> filters,
    List<InterestingTerm> terms, int flags) throws IOException {
  // SOLR-5351: if only check against a single field, use the reader directly. Otherwise we
  // repeat the stream's content for multiple fields so that query terms can be pulled from any
  // of those fields.
  String[] fields = mlt.getFieldNames();
  if (fields.length == 1) {
    rawMLTQuery = mlt.like(fields[0], reader);
  } else {
    CharsRefBuilder buffered = new CharsRefBuilder();
    char[] chunk = new char[1024];
    int len;
    while ((len = reader.read(chunk)) >= 0) {
      buffered.append(chunk, 0, len);
    }

    Collection<Object> streamValue = Collections.singleton(buffered.get().toString());
    Map<String, Collection<Object>> multifieldDoc = new HashMap<>(fields.length);
    for (String field : fields) {
      multifieldDoc.put(field, streamValue);
    }

    rawMLTQuery = mlt.like(multifieldDoc);
  }

  boostedMLTQuery = getBoostedQuery(rawMLTQuery);
  if (terms != null) {
    fillInterestingTermsFromMLTQuery(boostedMLTQuery, terms);
  }

  DocListAndSet results = new DocListAndSet();
  if (this.needDocSet) {
    results = searcher.getDocListAndSet(boostedMLTQuery, filters, null, start, rows, flags);
  } else {
    results.docList = searcher.getDocList(boostedMLTQuery, filters, null, start, rows, flags);
  }
  return results;
}
Example #23
Source File: QueryParsing.java From lucene-solr with Apache License 2.0
static void writeFieldVal(BytesRef val, FieldType ft, Appendable out, int flags) throws IOException {
  if (ft != null) {
    try {
      CharsRefBuilder readable = new CharsRefBuilder();
      ft.indexedToReadable(val, readable);
      out.append(readable.get());
    } catch (Exception e) {
      out.append("EXCEPTION(val=");
      out.append(val.utf8ToString());
      out.append(")");
    }
  } else {
    out.append(val.utf8ToString());
  }
}
Example #24
Source File: SearchGroupsResultTransformer.java From lucene-solr with Apache License 2.0
@SuppressWarnings({"rawtypes"}) private NamedList serializeSearchGroup(Collection<SearchGroup<BytesRef>> data, SearchGroupsFieldCommand command) { final NamedList<Object[]> result = new NamedList<>(data.size()); SortField[] groupSortField = command.getGroupSort().getSort(); for (SearchGroup<BytesRef> searchGroup : data) { Object[] convertedSortValues = serializeOneSearchGroup(groupSortField, searchGroup); SchemaField field = searcher.getSchema().getFieldOrNull(command.getKey()); String groupValue = searchGroup.groupValue != null ? field.getType().indexedToReadable(searchGroup.groupValue, new CharsRefBuilder()).toString() : null; result.add(groupValue, convertedSortValues); } return result; }
Example #25
Source File: DeleteUpdateCommand.java From lucene-solr with Apache License 2.0
public String getId() {
  if (id == null && indexedId != null) {
    IndexSchema schema = req.getSchema();
    SchemaField sf = schema.getUniqueKeyField();
    if (sf != null) {
      CharsRefBuilder ref = new CharsRefBuilder();
      sf.getType().indexedToReadable(indexedId, ref);
      id = ref.toString();
    }
  }
  return id;
}
Example #26
Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0
public void testAllUnicodeChars() throws Throwable {
  CharsRefBuilder utf16 = new CharsRefBuilder();
  char[] chars = new char[2];
  for (int ch = 0; ch < 0x0010FFFF; ch++) {
    if (ch == 0xd800)
      // Skip invalid code points
      ch = 0xe000;

    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
    }

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());

    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++)
      assertEquals(utf8.bytes[j], b[j]);
  }
}
Example #27
Source File: CustomSpellCheckListner.java From customized-symspell with MIT License
/**
 * Reload method of the spellcheck listener.
 *
 * @param newSearcher
 * @param checker
 * @throws IOException
 * @throws SpellCheckException
 */
public void reload(SolrIndexSearcher newSearcher, SpellChecker checker)
    throws IOException, SpellCheckException {

  DirectoryReader productsIndexReader = newSearcher.getIndexReader();
  Fields fields = MultiFields.getFields(productsIndexReader);
  IndexSchema schema = newSearcher.getCore().getLatestSchema();
  long time = System.currentTimeMillis();
  for (String field : fields) {
    if (!fieldArr.contains(field)) {
      continue;
    }
    FieldType type = schema.getField(field).getType();
    int insertionsCount = 0;
    for (TermsEnum iterator = fields.terms(field).iterator(); iterator.next() != null; ) {
      BytesRef term = iterator.term();
      CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
      type.indexedToReadable(term, charsRefBuilder);
      insertionsCount++;
      checker.getDataHolder().addItem(
          new DictionaryItem(charsRefBuilder.toString().trim(), (double) iterator.totalTermFreq(), 0.0));
    }
    log.info("Spellcheck Dictionary populated for Field Name {}, Count {}", field, insertionsCount);
  }
  log.info("Data for SpellChecker was populated. Time={} ms", (System.currentTimeMillis() - time));
}
Example #28
Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0
@Test
public void testWithSynonyms() throws Exception {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  String input = "mykeyword another keyword";
  tokenStream.setReader(new StringReader(input));
  @SuppressWarnings("deprecation")
  SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, SEP_LABEL, false, 100);
  String[] expectedOutputs = new String[2];
  CharsRefBuilder expectedOutput = new CharsRefBuilder();
  expectedOutput.append("mykeyword");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("another");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("keyword");
  expectedOutputs[0] = expectedOutput.toCharsRef().toString();
  expectedOutput.clear();
  expectedOutput.append("mysynonym");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("another");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("keyword");
  expectedOutputs[1] = expectedOutput.toCharsRef().toString();
  assertTokenStreamContents(stream, expectedOutputs, null, null, new int[]{1, 0});
}
Example #29
Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0
@Test
public void testWithMultipleTokens() throws Exception {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  String input = "mykeyword another keyword";
  tokenStream.setReader(new StringReader(input));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream);
  CharsRefBuilder builder = new CharsRefBuilder();
  builder.append("mykeyword");
  builder.append(SEP_LABEL);
  builder.append("another");
  builder.append(SEP_LABEL);
  builder.append("keyword");
  assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new int[]{1});
}
Example #30
Source File: ManagedSynonymGraphFilterFactory.java From lucene-solr with Apache License 2.0
/**
 * Add the managed synonyms and their mappings into the SynonymMap builder.
 */
@Override
public void parse(Reader in) throws IOException, ParseException {
  boolean ignoreCase = synonymManager.getIgnoreCase();
  for (CasePreservedSynonymMappings cpsm : synonymManager.synonymMappings.values()) {
    for (Map.Entry<String, Set<String>> entry : cpsm.mappings.entrySet()) {
      for (String mapping : entry.getValue()) {
        // apply the case setting to match the behavior of the SynonymMap builder
        CharsRef casedTerm = analyze(synonymManager.applyCaseSetting(ignoreCase, entry.getKey()), new CharsRefBuilder());
        CharsRef casedMapping = analyze(synonymManager.applyCaseSetting(ignoreCase, mapping), new CharsRefBuilder());
        add(casedTerm, casedMapping, false);
      }
    }
  }
}