Java Code Examples for org.apache.lucene.util.CharsRefBuilder

The following examples show how to use org.apache.lucene.util.CharsRefBuilder. They are extracted from open source projects; the source project, file, and license are noted above each example.
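Before the examples, here is a minimal, self-contained sketch of the pattern that recurs below: CharsRefBuilder is a reusable, growable char buffer, most often used to decode UTF-8 BytesRef values (index terms, suggester outputs) back into Java strings. The demo class name is made up for illustration; the CharsRefBuilder calls themselves are the ones used throughout the examples.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

// Hypothetical demo class, not part of Lucene.
public class CharsRefBuilderDemo {
  public static void main(String[] args) {
    CharsRefBuilder spare = new CharsRefBuilder();

    // The most common pattern: decode a UTF-8 BytesRef (e.g. an index term)
    // into chars, then materialize a String.
    BytesRef utf8 = new BytesRef("hello");
    spare.copyUTF8Bytes(utf8);
    String term = spare.toString();   // "hello"

    // The builder is reusable: clear it and build up chars incrementally.
    spare.clear();
    spare.append("hel");
    spare.append("lo");
    CharsRef ref = spare.get();       // a view over the builder's buffer
    System.out.println(term + " / " + ref);
  }
}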
Example 1
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
  LookupResult result;
  if (hasPayloads) {
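    // with payloads, the surface form and payload are packed into one BytesRef, split by the payloadSep byte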
    int sepIndex = -1;
    for(int i=0;i<output2.length;i++) {
      if (output2.bytes[output2.offset+i] == payloadSep) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    final int payloadLen = output2.length - sepIndex - 1;
    spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.copyUTF8Bytes(output2);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }

  return result;
}
 
Example 2
Source Project: Elasticsearch   Source File: TermSuggester.java    License: Apache License 2.0
@Override
public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
    DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings());
    final IndexReader indexReader = searcher.getIndexReader();
    TermSuggestion response = new TermSuggestion(
            name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()
    );
    List<Token> tokens = queryTerms(suggestion, spare);
    for (Token token : tokens) {
        // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef
        SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar(
                token.term, suggestion.getShardSize(), indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode()
        );
        Text key = new Text(new BytesArray(token.term.bytes()));
        TermSuggestion.Entry resultEntry = new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset);
        for (SuggestWord suggestWord : suggestedWords) {
            Text word = new Text(suggestWord.string);
            resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score));
        }
        response.addTerm(resultEntry);
    }
    return response;
}
 
Example 3
Source Project: Elasticsearch   Source File: TermVectorsResponse.java    License: Apache License 2.0
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
 
Example 4
Source Project: lucene-solr   Source File: MoreLikeThis.java    License: Apache License 2.0
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
  Map<String, Int> termFreqMap = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
 
Example 5
Source Project: lucene-solr   Source File: AnalyzingSuggester.java    License: Apache License 2.0
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
  LookupResult result;
  if (hasPayloads) {
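    // with payloads, the surface form and payload are packed into one BytesRef, split by the PAYLOAD_SEP byte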
    int sepIndex = -1;
    for(int i=0;i<output2.length;i++) {
      if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    spare.grow(sepIndex);
    final int payloadLen = output2.length - sepIndex - 1;
    spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.grow(output2.length);
    spare.copyUTF8Bytes(output2);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }

  return result;
}
 
Example 6
Source Project: lucene-solr   Source File: JaspellLookup.java    License: Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  count = 0;
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);
  BytesRef spare;
  final CharsRefBuilder charsSpare = new CharsRefBuilder();

  while ((spare = iterator.next()) != null) {
    final long weight = iterator.weight();
    if (spare.length == 0) {
      continue;
    }
    charsSpare.copyUTF8Bytes(spare);
    trie.put(charsSpare.toString(), weight);
    count++;
  }
}
 
Example 7
Source Project: lucene-solr   Source File: FSTCompletionLookup.java    License: Apache License 2.0
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  final List<Completion> completions;
  if (higherWeightsFirst) {
    completions = higherWeightsCompletion.lookup(key, num);
  } else {
    completions = normalCompletion.lookup(key, num);
  }
  
  final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
  CharsRefBuilder spare = new CharsRefBuilder();
  for (Completion c : completions) {
    spare.copyUTF8Bytes(c.utf8);
    results.add(new LookupResult(spare.toString(), c.bucket));
  }
  return results;
}
 
Example 8
Source Project: lucene-solr   Source File: TSTLookup.java    License: Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  root = new TernaryTreeNode();

  // make sure it's sorted and the comparator uses UTF16 sort order
  iterator = new SortedInputIterator(tempDir, tempFileNamePrefix, iterator, utf8SortedAsUTF16SortOrder);
  count = 0;
  ArrayList<String> tokens = new ArrayList<>();
  ArrayList<Number> vals = new ArrayList<>();
  BytesRef spare;
  CharsRefBuilder charsSpare = new CharsRefBuilder();
  while ((spare = iterator.next()) != null) {
    charsSpare.copyUTF8Bytes(spare);
    tokens.add(charsSpare.toString());
    vals.add(Long.valueOf(iterator.weight()));
    count++;
  }
  autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
 
Example 9
Source Project: lucene-solr   Source File: TestSuggestField.java    License: Apache License 2.0
@Test
public void testReservedChars() throws Exception {
  CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
  charsRefBuilder.append("sugg");
  charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL);
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x1f]"));

  charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.HOLE_CHARACTER);
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x1e]"));

  charsRefBuilder.setCharAt(2, (char) NRTSuggesterBuilder.END_BYTE);
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x0]"));
}
 
Example 10
Source Project: lucene-solr   Source File: DutchAnalyzer.java    License: Apache License 2.0
public DutchAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
  this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
  this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
  if (stemOverrideDict.isEmpty()) {
    this.stemdict = null;
  } else {
    // we don't need to ignore case here since we lowercase in this analyzer anyway
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator();
    CharsRefBuilder spare = new CharsRefBuilder();
    while (iter.hasNext()) {
      char[] nextKey = iter.nextKey();
      spare.copyChars(nextKey, 0, nextKey.length);
      builder.add(spare.get(), iter.currentValue());
    }
    try {
      this.stemdict = builder.build();
    } catch (IOException ex) {
      throw new RuntimeException("can not build stem dict", ex);
    }
  }
}
 
Example 11
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
/** Sugar: just joins the provided terms with {@link
 *  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
 *  must not be null. */
public static CharsRef join(String[] words, CharsRefBuilder reuse) {
  int upto = 0;
  char[] buffer = reuse.chars();
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars();
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }

    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.setLength(upto);
  return reuse.get();
}
 
Example 12
Source Project: lucene-solr   Source File: QueryAutoStopWordAnalyzer.java    License: Apache License 2.0
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Document frequency terms should be above in order to be stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;
  
  for (String field : fields) {
    Set<String> stopWords = new HashSet<>();
    Terms terms = MultiTerms.getTerms(indexReader, field);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (terms != null) {
      TermsEnum te = terms.iterator();
      BytesRef text;
      while ((text = te.next()) != null) {
        if (te.docFreq() > maxDocFreq) {
          spare.copyUTF8Bytes(text);
          stopWords.add(spare.toString());
        }
      }
    }
    stopWordsPerField.put(field, stopWords);
  }
}
 
Example 13
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testWithStopword() throws Exception {
  for (boolean preservePosInc : new boolean[]{true, false}) {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
    tokenStream.setReader(new StringReader(input));
    TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
    ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
    CharsRefBuilder builder = new CharsRefBuilder();
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("keyword");
//    if (preservePosInc) { LUCENE-8344 uncomment
//      builder.append(SEP_LABEL);
//    }
    assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
  }
}
 
Example 14
Source Project: lucene-solr   Source File: TestLimitTokenPositionFilter.java    License: Apache License 2.0
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();
    @SuppressWarnings("deprecation")
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
 
Example 15
Source Project: lucene-solr   Source File: TestIndexWriterUnicode.java    License: Apache License 2.0
public void testRandomUnicodeStrings() throws Throwable {
  char[] buffer = new char[20];
  char[] expected = new char[20];

  CharsRefBuilder utf16 = new CharsRefBuilder();

  int num = atLeast(10000);
  for (int iter = 0; iter < num; iter++) {
    boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(buffer, 0, 20));
    if (!hasIllegal) {
      byte[] b = new String(buffer, 0, 20).getBytes(StandardCharsets.UTF_8);
      assertEquals(b.length, utf8.length);
      for(int i=0;i<b.length;i++)
        assertEquals(b[i], utf8.bytes[i]);
    }

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals(utf16.length(), 20);
    for(int i=0;i<20;i++)
      assertEquals(expected[i], utf16.charAt(i));
  }
}
 
Example 16
Source Project: customized-symspell   Source File: CustomSpellCheckListner.java    License: MIT License
/**
 * Reload method of the spellcheck listener: repopulates the spell checker's
 * dictionary from the indexed terms of the configured fields.
 *
 * @param newSearcher the refreshed index searcher to read terms from
 * @param checker the spell checker whose dictionary is populated
 * @throws IOException if reading from the index fails
 * @throws SpellCheckException if adding an item to the dictionary fails
 */
public void reload(SolrIndexSearcher newSearcher, SpellChecker checker)
    throws IOException, SpellCheckException {

  DirectoryReader productsIndexReader = newSearcher.getIndexReader();
  Fields fields = MultiFields.getFields(productsIndexReader);
  IndexSchema schema = newSearcher.getCore().getLatestSchema();
  long time = System.currentTimeMillis();
  for (String field : fields) {
    if (!fieldArr.contains(field)) {
      continue;
    }
    FieldType type = schema.getField(field).getType();
    int insertionsCount = 0;
    for (TermsEnum iterator = fields.terms(field).iterator(); iterator.next() != null; ) {
      BytesRef term = iterator.term();
      CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
      type.indexedToReadable(term, charsRefBuilder);
      insertionsCount++;
      checker.getDataHolder().addItem(
          new DictionaryItem(charsRefBuilder.toString().trim(), (double) iterator.totalTermFreq(),
              0.0));
    }
    log.info("Spellcheck Dictionary populated for Field Name {}, Count {}", field,
        insertionsCount);
  }
  log.info("Data for SpellChecker  was populated. Time={} ms",
      (System.currentTimeMillis() - time));
}
 
Example 17
Source Project: Elasticsearch   Source File: TermSuggester.java    License: Apache License 2.0
private List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
    final List<Token> result = new ArrayList<>();
    final String field = suggestion.getField();
    SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
        @Override
        public void nextToken() {
            Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));
            result.add(new Token(term, offsetAttr.startOffset(), offsetAttr.endOffset())); 
        }
    }, spare);
    return result;
}
 
Example 18
Source Project: Elasticsearch   Source File: DirectCandidateGenerator.java    License: Apache License 2.0
protected BytesRef preFilter(final BytesRef term, final CharsRefBuilder spare, final BytesRefBuilder byteSpare) throws IOException {
    if (preFilter == null) {
        return term;
    }
    final BytesRefBuilder result = byteSpare;
    SuggestUtils.analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
        
        @Override
        public void nextToken() throws IOException {
            this.fillBytesRef(result);
        }
    }, spare);
    return result.get();
}
 
Example 19
Source Project: Elasticsearch   Source File: Suggester.java    License: Apache License 2.0
public Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>>
    execute(String name, T suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
    // #3469 We want to ignore empty shards

    if (searcher.getIndexReader().numDocs() == 0) {
        return null;
    }
    return innerExecute(name, suggestion, searcher, spare);
}
 
Example 20
Source Project: Elasticsearch   Source File: PagedBytesReference.java    License: Apache License 2.0
@Override
public String toUtf8() {
    if (length() == 0) {
        return "";
    }

    byte[] bytes = toBytes();
    final CharsRefBuilder ref = new CharsRefBuilder();
    ref.copyUTF8Bytes(bytes, offset, length);
    return ref.toString();
}
 
Example 21
Source Project: Elasticsearch   Source File: XMoreLikeThis.java    License: Apache License 2.0
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }

        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while(docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }
        
        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
 
Example 22
Source Project: Elasticsearch   Source File: TermVectorsResponse.java    License: Apache License 2.0
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
 
Example 23
Source Project: lucene-solr   Source File: NRTSuggester.java    License: Apache License 2.0
static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
  int surfaceFormLen = -1;
  for (int i = 0; i < output.length; i++) {
    if (output.bytes[output.offset + i] == payloadSep) {
      surfaceFormLen = i;
      break;
    }
  }
  assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
  spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
  return surfaceFormLen;
}
 
Example 24
Source Project: lucene-solr   Source File: TestContextSuggestField.java    License: Apache License 2.0
@Test
public void testTokenStream() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  ContextSuggestField field = new ContextSuggestField("field", "input", 1, "context1", "context2");
  BytesRef surfaceForm = new BytesRef("input");
  ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
  try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
    output.writeVInt(surfaceForm.length);
    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
    output.writeVInt(1 + 1);
    output.writeByte(ContextSuggestField.TYPE);
  }
  BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());
  String[] expectedOutputs = new String[2];
  CharsRefBuilder builder = new CharsRefBuilder();
  builder.append("context1");
  builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
  builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
  builder.append("input");
  expectedOutputs[0] = builder.toCharsRef().toString();
  builder.clear();
  builder.append("context2");
  builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
  builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
  builder.append("input");
  expectedOutputs[1] = builder.toCharsRef().toString();
  TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
  assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);

  CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
  stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
  assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);
}
 
Example 25
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
/** Sugar: analyzes the text with the analyzer and
 *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 *  reuse and its chars must not be null. */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt +
                                           ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
 
Example 26
Source Project: lucene-solr   Source File: WordnetSynonymParser.java    License: Apache License 2.0
@Override
public void parse(Reader in) throws IOException, ParseException {
  LineNumberReader br = new LineNumberReader(in);
  try {
    String line = null;
    String lastSynSetID = "";
    CharsRef synset[] = new CharsRef[8];
    int synsetSize = 0;
    
    while ((line = br.readLine()) != null) {
      String synSetID = line.substring(2, 11);

      if (!synSetID.equals(lastSynSetID)) {
        addInternal(synset, synsetSize);
        synsetSize = 0;
      }

      synset = ArrayUtil.grow(synset, synsetSize + 1);
      synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
      synsetSize++;
      lastSynSetID = synSetID;
    }
    
    // final synset in the file
    addInternal(synset, synsetSize);
  } catch (IllegalArgumentException e) {
    ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
    ex.initCause(e);
    throw ex;
  } finally {
    br.close();
  }
}
 
Example 27
Source Project: lucene-solr   Source File: TestSynonymGraphFilter.java    License: Apache License 2.0
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
  if (VERBOSE) {
    //System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
  }
  CharsRefBuilder inputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

  CharsRefBuilder outputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

  b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
}
 
Example 28
Source Project: lucene-solr   Source File: TestSynonymGraphFilter.java    License: Apache License 2.0
private void assertMapping(String inputString, String outputString) throws IOException {
  SynonymMap.Builder builder = new SynonymMap.Builder(false);
  // the rules must be lowercased up front, but the incoming tokens will be case insensitive:
  CharsRef input = SynonymMap.Builder.join(inputString.toLowerCase(Locale.ROOT).split(" "), new CharsRefBuilder());
  CharsRef output = SynonymMap.Builder.join(outputString.split(" "), new CharsRefBuilder());
  builder.add(input, output, true);
  Analyzer analyzer = new CustomAnalyzer(builder.build());
  TokenStream tokenStream = analyzer.tokenStream("field", inputString);
  assertTokenStreamContents(tokenStream, new String[]{
      outputString, inputString
    });
}
 
Example 29
Source Project: lucene-solr   Source File: TestSynonymMapFilter.java    License: Apache License 2.0
private void add(String input, String output, boolean keepOrig) {
  if (VERBOSE) {
    System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
  }
  CharsRefBuilder inputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

  CharsRefBuilder outputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

  b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
}
 
Example 30
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testWithMultipleTokens() throws Exception {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  String input = "mykeyword another keyword";
  tokenStream.setReader(new StringReader(input));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream);
  CharsRefBuilder builder = new CharsRefBuilder();
  builder.append("mykeyword");
  builder.append(SEP_LABEL);
  builder.append("another");
  builder.append(SEP_LABEL);
  builder.append("keyword");
  assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new int[]{1});
}