Java Code Examples for org.apache.lucene.util.CharsRefBuilder

The following examples show how to use org.apache.lucene.util.CharsRefBuilder. They are extracted from open source projects; the source project, file, and license are noted above each example.
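Before the examples, here is a minimal, self-contained sketch of the pattern that recurs below: CharsRefBuilder is a reusable, growable char buffer, most often used to decode UTF-8 BytesRef values (index terms, suggester outputs) back into Java strings. The demo class name is made up for illustration; the CharsRefBuilder calls themselves are the ones used throughout the examples.

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

// Hypothetical demo class, not part of Lucene.
public class CharsRefBuilderDemo {
  public static void main(String[] args) {
    CharsRefBuilder spare = new CharsRefBuilder();

    // The most common pattern: decode a UTF-8 BytesRef (e.g. an index term)
    // into chars, then materialize a String.
    BytesRef utf8 = new BytesRef("hello");
    spare.copyUTF8Bytes(utf8);
    String term = spare.toString();   // "hello"

    // The builder is reusable: clear it and build up chars incrementally.
    spare.clear();
    spare.append("hel");
    spare.append("lo");
    CharsRef ref = spare.get();       // a view over the builder's buffer
    System.out.println(term + " / " + ref);
  }
}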
Example 1
Source Project: Elasticsearch   Source File: XAnalyzingSuggester.java    License: Apache License 2.0
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
  LookupResult result;
  if (hasPayloads) {
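    // with payloads, the surface form and payload are packed into one BytesRef, split by the payloadSep byte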
    int sepIndex = -1;
    for(int i=0;i<output2.length;i++) {
      if (output2.bytes[output2.offset+i] == payloadSep) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    final int payloadLen = output2.length - sepIndex - 1;
    spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.copyUTF8Bytes(output2);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }

  return result;
}
 
Example 2
Source Project: Elasticsearch   Source File: TermSuggester.java    License: Apache License 2.0
@Override
public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
    DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings());
    final IndexReader indexReader = searcher.getIndexReader();
    TermSuggestion response = new TermSuggestion(
            name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()
    );
    List<Token> tokens = queryTerms(suggestion, spare);
    for (Token token : tokens) {
        // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef
        SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar(
                token.term, suggestion.getShardSize(), indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode()
        );
        Text key = new Text(new BytesArray(token.term.bytes()));
        TermSuggestion.Entry resultEntry = new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset);
        for (SuggestWord suggestWord : suggestedWords) {
            Text word = new Text(suggestWord.string);
            resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score));
        }
        response.addTerm(resultEntry);
    }
    return response;
}
 
Example 3
Source Project: Elasticsearch   Source File: TermVectorsResponse.java    License: Apache License 2.0
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
 
Example 4
Source Project: lucene-solr   Source File: MoreLikeThis.java    License: Apache License 2.0
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param field2termFreqMap a Map of terms and their frequencies per field
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
  Map<String, Int> termFreqMap = field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
  final TermsEnum termsEnum = vector.iterator();
  final CharsRefBuilder spare = new CharsRefBuilder();
  BytesRef text;
  while((text = termsEnum.next()) != null) {
    spare.copyUTF8Bytes(text);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
 
Example 5
Source Project: lucene-solr   Source File: AnalyzingSuggester.java    License: Apache License 2.0
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
  LookupResult result;
  if (hasPayloads) {
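    // with payloads, the surface form and payload are packed into one BytesRef, split by the PAYLOAD_SEP byte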
    int sepIndex = -1;
    for(int i=0;i<output2.length;i++) {
      if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    spare.grow(sepIndex);
    final int payloadLen = output2.length - sepIndex - 1;
    spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.grow(output2.length);
    spare.copyUTF8Bytes(output2);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }

  return result;
}
 
Example 6
Source Project: lucene-solr   Source File: JaspellLookup.java    License: Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  count = 0;
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);
  BytesRef spare;
  final CharsRefBuilder charsSpare = new CharsRefBuilder();

  while ((spare = iterator.next()) != null) {
    final long weight = iterator.weight();
    if (spare.length == 0) {
      continue;
    }
    charsSpare.copyUTF8Bytes(spare);
    trie.put(charsSpare.toString(), weight);
    count++;
  }
}
 
Example 7
Source Project: lucene-solr   Source File: FSTCompletionLookup.java    License: Apache License 2.0
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean higherWeightsFirst, int num) {
  if (contexts != null) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  final List<Completion> completions;
  if (higherWeightsFirst) {
    completions = higherWeightsCompletion.lookup(key, num);
  } else {
    completions = normalCompletion.lookup(key, num);
  }
  
  final ArrayList<LookupResult> results = new ArrayList<>(completions.size());
  CharsRefBuilder spare = new CharsRefBuilder();
  for (Completion c : completions) {
    spare.copyUTF8Bytes(c.utf8);
    results.add(new LookupResult(spare.toString(), c.bucket));
  }
  return results;
}
 
Example 8
Source Project: lucene-solr   Source File: TSTLookup.java    License: Apache License 2.0
@Override
public void build(InputIterator iterator) throws IOException {
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }
  root = new TernaryTreeNode();

  // make sure it's sorted and the comparator uses UTF16 sort order
  iterator = new SortedInputIterator(tempDir, tempFileNamePrefix, iterator, utf8SortedAsUTF16SortOrder);
  count = 0;
  ArrayList<String> tokens = new ArrayList<>();
  ArrayList<Number> vals = new ArrayList<>();
  BytesRef spare;
  CharsRefBuilder charsSpare = new CharsRefBuilder();
  while ((spare = iterator.next()) != null) {
    charsSpare.copyUTF8Bytes(spare);
    tokens.add(charsSpare.toString());
    vals.add(Long.valueOf(iterator.weight()));
    count++;
  }
  autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
 
Example 9
Source Project: lucene-solr   Source File: TestSuggestField.java    License: Apache License 2.0
@Test
public void testReservedChars() throws Exception {
  CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
  charsRefBuilder.append("sugg");
  charsRefBuilder.setCharAt(2, (char) ConcatenateGraphFilter.SEP_LABEL);
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x1f]"));

  charsRefBuilder.setCharAt(2, (char) CompletionAnalyzer.HOLE_CHARACTER);
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x1e]"));

  charsRefBuilder.setCharAt(2, (char) NRTSuggesterBuilder.END_BYTE);
  expected = expectThrows(IllegalArgumentException.class, () -> {
    new SuggestField("name", charsRefBuilder.toString(), 1);
  });
  assertTrue(expected.getMessage().contains("[0x0]"));
}
 
Example 10
Source Project: lucene-solr   Source File: DutchAnalyzer.java    License: Apache License 2.0
public DutchAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
  this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(stopwords));
  this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionTable));
  if (stemOverrideDict.isEmpty()) {
    this.stemdict = null;
  } else {
    // we don't need to ignore case here since we lowercase in this analyzer anyway
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator();
    CharsRefBuilder spare = new CharsRefBuilder();
    while (iter.hasNext()) {
      char[] nextKey = iter.nextKey();
      spare.copyChars(nextKey, 0, nextKey.length);
      builder.add(spare.get(), iter.currentValue());
    }
    try {
      this.stemdict = builder.build();
    } catch (IOException ex) {
      throw new RuntimeException("can not build stem dict", ex);
    }
  }
}
 
Example 11
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
/** Sugar: just joins the provided terms with {@link
 *  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
 *  must not be null. */
public static CharsRef join(String[] words, CharsRefBuilder reuse) {
  int upto = 0;
  char[] buffer = reuse.chars();
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars();
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }

    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.setLength(upto);
  return reuse.get();
}
 
Example 12
Source Project: lucene-solr   Source File: QueryAutoStopWordAnalyzer.java    License: Apache License 2.0
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Document frequency terms should be above in order to be stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;
  
  for (String field : fields) {
    Set<String> stopWords = new HashSet<>();
    Terms terms = MultiTerms.getTerms(indexReader, field);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (terms != null) {
      TermsEnum te = terms.iterator();
      BytesRef text;
      while ((text = te.next()) != null) {
        if (te.docFreq() > maxDocFreq) {
          spare.copyUTF8Bytes(text);
          stopWords.add(spare.toString());
        }
      }
    }
    stopWordsPerField.put(field, stopWords);
  }
}
 
Example 13
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testWithStopword() throws Exception {
  for (boolean preservePosInc : new boolean[]{true, false}) {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "a mykeyword a keyword"; //LUCENE-8344 add "a"
    tokenStream.setReader(new StringReader(input));
    TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
    ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
    CharsRefBuilder builder = new CharsRefBuilder();
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("keyword");
//    if (preservePosInc) { LUCENE-8344 uncomment
//      builder.append(SEP_LABEL);
//    }
    assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
  }
}
 
Example 14
Source Project: lucene-solr   Source File: TestLimitTokenPositionFilter.java    License: Apache License 2.0
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();
    @SuppressWarnings("deprecation")
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
 
Example 15
Source Project: lucene-solr   Source File: TestIndexWriterUnicode.java    License: Apache License 2.0
public void testRandomUnicodeStrings() throws Throwable {
  char[] buffer = new char[20];
  char[] expected = new char[20];

  CharsRefBuilder utf16 = new CharsRefBuilder();

  int num = atLeast(10000);
  for (int iter = 0; iter < num; iter++) {
    boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(buffer, 0, 20));
    if (!hasIllegal) {
      byte[] b = new String(buffer, 0, 20).getBytes(StandardCharsets.UTF_8);
      assertEquals(b.length, utf8.length);
      for(int i=0;i<b.length;i++)
        assertEquals(b[i], utf8.bytes[i]);
    }

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals(utf16.length(), 20);
    for(int i=0;i<20;i++)
      assertEquals(expected[i], utf16.charAt(i));
  }
}
 
Example 16
Source Project: customized-symspell   Source File: CustomSpellCheckListner.java    License: MIT License
/**
 * Reload method of the spellcheck listener: repopulates the spell checker's
 * dictionary from the indexed terms of the configured fields.
 *
 * @param newSearcher the refreshed index searcher to read terms from
 * @param checker the spell checker whose dictionary is populated
 * @throws IOException if reading from the index fails
 * @throws SpellCheckException if adding an item to the dictionary fails
 */
public void reload(SolrIndexSearcher newSearcher, SpellChecker checker)
    throws IOException, SpellCheckException {

  DirectoryReader productsIndexReader = newSearcher.getIndexReader();
  Fields fields = MultiFields.getFields(productsIndexReader);
  IndexSchema schema = newSearcher.getCore().getLatestSchema();
  long time = System.currentTimeMillis();
  for (String field : fields) {
    if (!fieldArr.contains(field)) {
      continue;
    }
    FieldType type = schema.getField(field).getType();
    int insertionsCount = 0;
    for (TermsEnum iterator = fields.terms(field).iterator(); iterator.next() != null; ) {
      BytesRef term = iterator.term();
      CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
      type.indexedToReadable(term, charsRefBuilder);
      insertionsCount++;
      checker.getDataHolder().addItem(
          new DictionaryItem(charsRefBuilder.toString().trim(), (double) iterator.totalTermFreq(),
              0.0));
    }
    log.info("Spellcheck Dictionary populated for Field Name {}, Count {}", field,
        insertionsCount);
  }
  log.info("Data for SpellChecker  was populated. Time={} ms",
      (System.currentTimeMillis() - time));
}
 
Example 17
Source Project: Elasticsearch   Source File: TermSuggester.java    License: Apache License 2.0
private List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
    final List<Token> result = new ArrayList<>();
    final String field = suggestion.getField();
    SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
        @Override
        public void nextToken() {
            Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));
            result.add(new Token(term, offsetAttr.startOffset(), offsetAttr.endOffset())); 
        }
    }, spare);
    return result;
}
 
Example 18
Source Project: Elasticsearch   Source File: DirectCandidateGenerator.java    License: Apache License 2.0
protected BytesRef preFilter(final BytesRef term, final CharsRefBuilder spare, final BytesRefBuilder byteSpare) throws IOException {
    if (preFilter == null) {
        return term;
    }
    final BytesRefBuilder result = byteSpare;
    SuggestUtils.analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
        
        @Override
        public void nextToken() throws IOException {
            this.fillBytesRef(result);
        }
    }, spare);
    return result.get();
}
 
Example 19
Source Project: Elasticsearch   Source File: Suggester.java    License: Apache License 2.0
public Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>>
    execute(String name, T suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
    // #3469 We want to ignore empty shards

    if (searcher.getIndexReader().numDocs() == 0) {
        return null;
    }
    return innerExecute(name, suggestion, searcher, spare);
}
 
Example 20
Source Project: Elasticsearch   Source File: PagedBytesReference.java    License: Apache License 2.0
@Override
public String toUtf8() {
    if (length() == 0) {
        return "";
    }

    byte[] bytes = toBytes();
    final CharsRefBuilder ref = new CharsRefBuilder();
    ref.copyUTF8Bytes(bytes, offset, length);
    return ref.toString();
}
 
Example 21
Source Project: Elasticsearch   Source File: XMoreLikeThis.java    License: Apache License 2.0
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }

        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while(docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }
        
        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
 
Example 22
Source Project: Elasticsearch   Source File: TermVectorsResponse.java    License: Apache License 2.0
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
 
Example 23
Source Project: lucene-solr   Source File: NRTSuggester.java    License: Apache License 2.0
static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
  int surfaceFormLen = -1;
  for (int i = 0; i < output.length; i++) {
    if (output.bytes[output.offset + i] == payloadSep) {
      surfaceFormLen = i;
      break;
    }
  }
  assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
  spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
  return surfaceFormLen;
}
 
Example 24
Source Project: lucene-solr   Source File: TestContextSuggestField.java    License: Apache License 2.0
@Test
public void testTokenStream() throws Exception {
  Analyzer analyzer = new MockAnalyzer(random());
  ContextSuggestField field = new ContextSuggestField("field", "input", 1, "context1", "context2");
  BytesRef surfaceForm = new BytesRef("input");
  ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
  try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
    output.writeVInt(surfaceForm.length);
    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
    output.writeVInt(1 + 1);
    output.writeByte(ContextSuggestField.TYPE);
  }
  BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());
  String[] expectedOutputs = new String[2];
  CharsRefBuilder builder = new CharsRefBuilder();
  builder.append("context1");
  builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
  builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
  builder.append("input");
  expectedOutputs[0] = builder.toCharsRef().toString();
  builder.clear();
  builder.append("context2");
  builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
  builder.append((char) ConcatenateGraphFilter.SEP_LABEL);
  builder.append("input");
  expectedOutputs[1] = builder.toCharsRef().toString();
  TokenStream stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(analyzer, null));
  assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);

  CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
  stream = new TestSuggestField.PayloadAttrToTypeAttrFilter(field.tokenStream(completionAnalyzer, null));
  assertTokenStreamContents(stream, expectedOutputs, null, null, new String[]{payload.utf8ToString(), payload.utf8ToString()}, new int[]{1, 0}, null, null);
}
 
Example 25
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
/** Sugar: analyzes the text with the analyzer and
 *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 *  reuse and its chars must not be null. */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt +
                                           ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
 
Example 26
Source Project: lucene-solr   Source File: WordnetSynonymParser.java    License: Apache License 2.0
@Override
public void parse(Reader in) throws IOException, ParseException {
  LineNumberReader br = new LineNumberReader(in);
  try {
    String line = null;
    String lastSynSetID = "";
    CharsRef synset[] = new CharsRef[8];
    int synsetSize = 0;
    
    while ((line = br.readLine()) != null) {
      String synSetID = line.substring(2, 11);

      if (!synSetID.equals(lastSynSetID)) {
        addInternal(synset, synsetSize);
        synsetSize = 0;
      }

      synset = ArrayUtil.grow(synset, synsetSize + 1);
      synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
      synsetSize++;
      lastSynSetID = synSetID;
    }
    
    // final synset in the file
    addInternal(synset, synsetSize);
  } catch (IllegalArgumentException e) {
    ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
    ex.initCause(e);
    throw ex;
  } finally {
    br.close();
  }
}
 
Example 27
Source Project: lucene-solr   Source File: TestSynonymGraphFilter.java    License: Apache License 2.0
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
  if (VERBOSE) {
    //System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
  }
  CharsRefBuilder inputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

  CharsRefBuilder outputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

  b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
}
 
Example 28
Source Project: lucene-solr   Source File: TestSynonymGraphFilter.java    License: Apache License 2.0
private void assertMapping(String inputString, String outputString) throws IOException {
  SynonymMap.Builder builder = new SynonymMap.Builder(false);
  // the rules must be lowercased up front, but the incoming tokens will be case insensitive:
  CharsRef input = SynonymMap.Builder.join(inputString.toLowerCase(Locale.ROOT).split(" "), new CharsRefBuilder());
  CharsRef output = SynonymMap.Builder.join(outputString.split(" "), new CharsRefBuilder());
  builder.add(input, output, true);
  Analyzer analyzer = new CustomAnalyzer(builder.build());
  TokenStream tokenStream = analyzer.tokenStream("field", inputString);
  assertTokenStreamContents(tokenStream, new String[]{
      outputString, inputString
    });
}
 
Example 29
Source Project: lucene-solr   Source File: TestSynonymMapFilter.java    License: Apache License 2.0
private void add(String input, String output, boolean keepOrig) {
  if (VERBOSE) {
    System.out.println("  add input=" + input + " output=" + output + " keepOrig=" + keepOrig);
  }
  CharsRefBuilder inputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(input.split(" +"), inputCharsRef);

  CharsRefBuilder outputCharsRef = new CharsRefBuilder();
  SynonymMap.Builder.join(output.split(" +"), outputCharsRef);

  b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig);
}
 
Example 30
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testWithMultipleTokens() throws Exception {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  String input = "mykeyword another keyword";
  tokenStream.setReader(new StringReader(input));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream);
  CharsRefBuilder builder = new CharsRefBuilder();
  builder.append("mykeyword");
  builder.append(SEP_LABEL);
  builder.append("another");
  builder.append(SEP_LABEL);
  builder.append("keyword");
  assertTokenStreamContents(stream, new String[]{builder.toCharsRef().toString()}, null, null, new int[]{1});
}