Java Code Examples for org.apache.lucene.util.CharsRef

The following examples show how to use org.apache.lucene.util.CharsRef. These examples are extracted from open source projects; to see each snippet in context, follow the project and source-file links above each example.
Example 1
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
/** Sugar: just joins the provided terms with {@link
 *  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
 *  must not be null. */
public static CharsRef join(String[] words, CharsRefBuilder reuse) {
  int upto = 0;
  char[] buffer = reuse.chars();
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars();
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }

    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.setLength(upto);
  return reuse.get();
}
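
A brief usage sketch (the words here are illustrative): joining two words yields a single CharsRef with SynonymMap.WORD_SEPARATOR ('\u0000') between them.

CharsRefBuilder reuse = new CharsRefBuilder();
CharsRef joined = SynonymMap.Builder.join(new String[] {"big", "apple"}, reuse);
// joined holds "big" + '\u0000' + "apple"; reuse's buffer is recycled across calls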
 
Example 2
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
/** only used for asserting! */
private boolean hasHoles(CharsRef chars) {
  final int end = chars.offset + chars.length;
  for(int idx=chars.offset+1;idx<end;idx++) {
    if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == SynonymMap.WORD_SEPARATOR) {
      return true;
    }
  }
  if (chars.chars[chars.offset] == '\u0000') {
    return true;
  }
  if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
    return true;
  }

  return false;
}
 
Example 3
Source Project: lucene-solr   Source File: WordnetSynonymParser.java    License: Apache License 2.0
private void addInternal(CharsRef synset[], int size) {
  if (size <= 1) {
    return; // nothing to do
  }
  
  if (expand) {
    for (int i = 0; i < size; i++) {
      for (int j = 0; j < size; j++) {
        if (i != j) {
          add(synset[i], synset[j], true);
        }
      }
    }
  } else {
    for (int i = 0; i < size; i++) {
      add(synset[i], synset[0], false);
    }
  }
}
 
Example 4
Source Project: lucene-solr   Source File: NormalizeCharMap.java    License: Apache License 2.0
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map != null) {
    try {
      // Pre-cache root arcs:
      final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
      final FST.BytesReader fstReader = map.getBytesReader();
      map.getFirstArc(scratchArc);
      if (FST.targetHasArcs(scratchArc)) {
        map.readFirstRealTargetArc(scratchArc.target(), scratchArc, fstReader);
        while(true) {
          assert scratchArc.label() != FST.END_LABEL;
          cachedRootArcs.put(Character.valueOf((char) scratchArc.label()), new FST.Arc<CharsRef>().copyFrom(scratchArc));
          if (scratchArc.isLast()) {
            break;
          }
          map.readNextRealArc(scratchArc, fstReader);
        }
      }
      //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
    } catch (IOException ioe) {
      // Bogus FST IOExceptions!!  (will never happen)
      throw new RuntimeException(ioe);
    }
  }
}
 
Example 5
Source Project: lucene-solr   Source File: NormalizeCharMap.java    License: Apache License 2.0
/** Builds the NormalizeCharMap; call this once you
 *  are done calling {@link #add}. */
public NormalizeCharMap build() {

  final FST<CharsRef> map;
  try {
    final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
      fstCompiler.add(Util.toUTF16(ent.getKey(), scratch),
                  new CharsRef(ent.getValue()));
    }
    map = fstCompiler.compile();
    pendingPairs.clear();
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }

  return new NormalizeCharMap(map);
}
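
A minimal usage sketch, assuming the usual Builder API and MappingCharFilter (the mapping itself is illustrative):

NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
builder.add("ß", "ss"); // illustrative mapping
NormalizeCharMap normMap = builder.build();
// MappingCharFilter applies the map to a Reader before tokenization:
Reader filtered = new MappingCharFilter(normMap, new StringReader("Straße"));
// reading 'filtered' now yields "Strasse"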
 
Example 6
Source Project: lucene-solr   Source File: Stemmer.java    License: Apache License 2.0
/**
 * Find the unique stem(s) of the provided word
 * 
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
 
Example 7
Source Project: lucene-solr   Source File: Dictionary.java    License: Apache License 2.0
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  Map<String,String> mappings = new TreeMap<>();
  
  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    String parts[] = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }
  
  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }
  
  return fstCompiler.compile();
}
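
Once compiled, a single conversion can be looked up with Util.get; a sketch (the input string is illustrative, and 'conversions' stands in for the FST returned above):

FST<CharsRef> conversions = parseConversions(reader, num); // as above
CharsRef output = Util.get(conversions, Util.toUTF16("someInput", new IntsRefBuilder()));
// output is the mapped text, or null if "someInput" has no conversion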
 
Example 8
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testSeparatorWithSynonyms() throws IOException {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  builder.add(new CharsRef("mykeyword"), new CharsRef("three words synonym"), true);
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  String input = " mykeyword another keyword   ";
  tokenizer.setReader(new StringReader(input));
  SynonymGraphFilter filter = new SynonymGraphFilter(tokenizer, builder.build(), true);
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, '-', false, 100);
  assertTokenStreamContents(stream, new String[] {
      "mykeyword-another-keyword",
      "mysynonym-another-keyword",
      "three words synonym-another-keyword"
  }, null, null, new int[] { 1, 0, 0 });
}
 
Example 9
Source Project: lucene-solr   Source File: TestLimitTokenPositionFilter.java    License: Apache License 2.0
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();
    @SuppressWarnings("deprecation")
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
 
Example 10
Source Project: lucene-solr   Source File: DaciukMihovAutomatonBuilder.java    License: Apache License 2.0
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
 * strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();
  
  char[] chars = new char[0];
  CharsRef ref = new CharsRef();
  for (BytesRef b : input) {
    chars = ArrayUtil.grow(chars, b.length);
    final int len = UnicodeUtil.UTF8toUTF16(b, chars);
    ref.chars = chars;
    ref.length = len;
    builder.add(ref);
  }
  
  Automaton.Builder a = new Automaton.Builder();
  convert(a,
      builder.complete(), 
      new IdentityHashMap<State,Integer>());

  return a.finish();
}
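
A usage sketch (terms are illustrative): the input must already be in binary (UTF-8) order, and the finished Automaton can be queried with a CharacterRunAutomaton.

List<BytesRef> terms = Arrays.asList(new BytesRef("bar"), new BytesRef("baz"), new BytesRef("foo"));
Automaton a = DaciukMihovAutomatonBuilder.build(terms); // terms are already binary-sorted
CharacterRunAutomaton run = new CharacterRunAutomaton(a);
// run.run("foo") == true; run.run("fo") == false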
 
Example 11
Source Project: lucene-solr   Source File: CharSequenceOutputs.java    License: Apache License 2.0
@Override
public CharsRef subtract(CharsRef output, CharsRef inc) {
  assert output != null;
  assert inc != null;
  if (inc == NO_OUTPUT) {
    // no prefix removed
    return output;
  } else if (inc.length == output.length) {
    // entire output removed
    return NO_OUTPUT;
  } else {
    assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length;
    assert inc.length > 0;
    return new CharsRef(output.chars, output.offset + inc.length, output.length-inc.length);
  }
}
 
Example 12
Source Project: lucene-solr   Source File: CharSequenceOutputs.java    License: Apache License 2.0
@Override
public CharsRef add(CharsRef prefix, CharsRef output) {
  assert prefix != null;
  assert output != null;
  if (prefix == NO_OUTPUT) {
    return output;
  } else if (output == NO_OUTPUT) {
    return prefix;
  } else {
    assert prefix.length > 0;
    assert output.length > 0;
    CharsRef result = new CharsRef(prefix.length + output.length);
    System.arraycopy(prefix.chars, prefix.offset, result.chars, 0, prefix.length);
    System.arraycopy(output.chars, output.offset, result.chars, prefix.length, output.length);
    result.length = prefix.length + output.length;
    return result;
  }
}
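
Taken together, add and subtract are inverses over a shared prefix; a small sketch:

Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
CharsRef sum = outputs.add(new CharsRef("foo"), new CharsRef("bar")); // "foobar"
CharsRef rest = outputs.subtract(sum, new CharsRef("foo"));           // "bar"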
 
Example 13
Source Project: lucene-solr   Source File: TestFSTDirectAddressing.java    License: Apache License 2.0
private static void recompileAndWalk(String fstFilePath) throws IOException {
  try (InputStreamDataInput in = new InputStreamDataInput(newInputStream(Paths.get(fstFilePath)))) {

    System.out.println("Reading FST");
    long startTimeMs = System.currentTimeMillis();
    FST<CharsRef> originalFst = new FST<>(in, in, CharSequenceOutputs.getSingleton());
    long endTimeMs = System.currentTimeMillis();
    System.out.println("time = " + (endTimeMs - startTimeMs) + " ms");

    for (float oversizingFactor : List.of(0f, 0f, 0f, 1f, 1f, 1f)) {
      System.out.println("\nFST construction (oversizingFactor=" + oversizingFactor + ")");
      startTimeMs = System.currentTimeMillis();
      FST<CharsRef> fst = recompile(originalFst, oversizingFactor);
      endTimeMs = System.currentTimeMillis();
      System.out.println("time = " + (endTimeMs - startTimeMs) + " ms");
      System.out.println("FST RAM = " + fst.ramBytesUsed() + " B");

      System.out.println("FST enum");
      startTimeMs = System.currentTimeMillis();
      walk(fst);
      endTimeMs = System.currentTimeMillis();
      System.out.println("time = " + (endTimeMs - startTimeMs) + " ms");
    }
  }
}
 
Example 14
/** Convert NamedList (suggester response) to {@link SuggesterResult} */
private SuggesterResult toSuggesterResult(Map<String, SimpleOrderedMap<NamedList<Object>>> suggestionsMap) {
  SuggesterResult result = new SuggesterResult();
  if (suggestionsMap == null) {
    return result;
  }
  // for each token
  for(Map.Entry<String, SimpleOrderedMap<NamedList<Object>>> entry : suggestionsMap.entrySet()) {
    String suggesterName = entry.getKey();
    for (Iterator<Map.Entry<String, NamedList<Object>>> suggestionsIter = entry.getValue().iterator(); suggestionsIter.hasNext();) {
      Map.Entry<String, NamedList<Object>> suggestions = suggestionsIter.next(); 
      String tokenString = suggestions.getKey();
      List<LookupResult> lookupResults = new ArrayList<>();
      NamedList<Object> suggestion = suggestions.getValue();
      // for each suggestion
      for (int j = 0; j < suggestion.size(); j++) {
        String property = suggestion.getName(j);
        if (property.equals(SuggesterResultLabels.SUGGESTIONS)) {
          @SuppressWarnings("unchecked")
          List<NamedList<Object>> suggestionEntries = (List<NamedList<Object>>) suggestion.getVal(j);
          for(NamedList<Object> suggestionEntry : suggestionEntries) {
            String term = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_TERM);
            Long weight = (Long) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_WEIGHT);
            String payload = (String) suggestionEntry.get(SuggesterResultLabels.SUGGESTION_PAYLOAD);
            LookupResult res = new LookupResult(new CharsRef(term), weight, new BytesRef(payload));
            lookupResults.add(res);
          }
        }
        result.add(suggesterName, tokenString, lookupResults);
      }
    }
  }
  return result;
}
 
Example 15
Source Project: lucene-solr   Source File: SynonymFilter.java    License: Apache License 2.0
public CharsRef pullNext() {
  assert upto < count;
  lastEndOffset = endOffsets[upto];
  lastPosLength = posLengths[upto];
  final CharsRefBuilder result = outputs[upto++];
  posIncr = 0;
  if (upto == count) {
    reset();
  }
  return result.get();
}
 
Example 16
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
private int countWords(CharsRef chars) {
  int wordCount = 1;
  int upto = chars.offset;
  final int limit = chars.offset + chars.length;
  while(upto < limit) {
    if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) {
      wordCount++;
    }
  }
  return wordCount;
}
 
Example 17
Source Project: lucene-solr   Source File: SynonymMap.java    License: Apache License 2.0
/** Sugar: analyzes the text with the analyzer and
 *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 *  reuse and its chars must not be null. */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt +
                                           ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
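
analyze is reachable through SynonymMap.Parser subclasses; a sketch using SolrSynonymParser with a whitespace analyzer (all arguments are illustrative):

SolrSynonymParser parser = new SolrSynonymParser(true, true, new WhitespaceAnalyzer());
CharsRef analyzed = parser.analyze("big apple", new CharsRefBuilder());
// analyzed holds "big" + '\u0000' + "apple": tokens joined by WORD_SEPARATOR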
 
Example 18
Source Project: lucene-solr   Source File: WordnetSynonymParser.java    License: Apache License 2.0
@Override
public void parse(Reader in) throws IOException, ParseException {
  LineNumberReader br = new LineNumberReader(in);
  try {
    String line = null;
    String lastSynSetID = "";
    CharsRef synset[] = new CharsRef[8];
    int synsetSize = 0;
    
    while ((line = br.readLine()) != null) {
      String synSetID = line.substring(2, 11);

      if (!synSetID.equals(lastSynSetID)) {
        addInternal(synset, synsetSize);
        synsetSize = 0;
      }

      synset = ArrayUtil.grow(synset, synsetSize + 1);
      synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
      synsetSize++;
      lastSynSetID = synSetID;
    }
    
    // final synset in the file
    addInternal(synset, synsetSize);
  } catch (IllegalArgumentException e) {
    ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
    ex.initCause(e);
    throw ex;
  } finally {
    br.close();
  }
}
 
Example 19
Source Project: lucene-solr   Source File: Stemmer.java    License: Apache License 2.0
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
  List<CharsRef> stems = new ArrayList<>();
  IntsRef forms = dictionary.lookupWord(word, 0, length);
  if (forms != null) {
    for (int i = 0; i < forms.length; i += formStep) {
      boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
      boolean checkNeedAffix = dictionary.needaffix != -1;
      boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
      if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
        dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
        char wordFlags[] = Dictionary.decodeFlags(scratch);
        // we are looking for a case variant, but this word does not allow it
        if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
          continue;
        }
        // we can't add this form, it's a pseudostem requiring an affix
        if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) {
          continue;
        }
        // we can't add this form, it only belongs inside a compound word
        if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
          continue;
        }
      }
      stems.add(newStem(word, length, forms, i));
    }
  }
  try {
    stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
  } catch (IOException bogus) {
    throw new RuntimeException(bogus);
  }
  return stems;
}
 
Example 20
Source Project: lucene-solr   Source File: Stemmer.java    License: Apache License 2.0
private CharsRef newStem(char buffer[], int length, IntsRef forms, int formID) {
  final String exception;
  if (dictionary.hasStemExceptions) {
    int exceptionID = forms.ints[forms.offset + formID + 1];
    if (exceptionID > 0) {
      exception = dictionary.getStemException(exceptionID);
    } else {
      exception = null;
    }
  } else {
    exception = null;
  }
  
  if (dictionary.needsOutputCleaning) {
    scratchSegment.setLength(0);
    if (exception != null) {
      scratchSegment.append(exception);
    } else {
      scratchSegment.append(buffer, 0, length);
    }
    try {
      Dictionary.applyMappings(dictionary.oconv, scratchSegment);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
    char cleaned[] = new char[scratchSegment.length()];
    scratchSegment.getChars(0, cleaned.length, cleaned, 0);
    return new CharsRef(cleaned, 0, cleaned.length);
  } else {
    if (exception != null) {
      return new CharsRef(exception);
    } else {
      return new CharsRef(buffer, 0, length);
    }
  }
}
 
Example 21
Source Project: lucene-solr   Source File: HunspellStemFilter.java    License: Apache License 2.0
@Override
public int compare(CharsRef o1, CharsRef o2) {
  int cmp = Integer.compare(o2.length, o1.length);
  if (cmp == 0) {
    // tie break on text
    return o2.compareTo(o1);
  } else {
    return cmp;
  }
}
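
This ordering puts longer stems first, with ties broken by reverse text order; a sorting sketch (the 'lengthComparator' field name is assumed):

List<CharsRef> buffer = Arrays.asList(new CharsRef("walk"), new CharsRef("walking"));
buffer.sort(lengthComparator); // assumed name for the comparator defined above
// buffer is now ["walking", "walk"]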
 
Example 22
Source Project: lucene-solr   Source File: Dictionary.java    License: Apache License 2.0
static char[] decodeFlags(BytesRef b) {
  if (b.length == 0) {
    return CharsRef.EMPTY_CHARS;
  }
  int len = b.length >>> 1;
  char flags[] = new char[len];
  int upto = 0;
  int end = b.offset + b.length;
  for (int i = b.offset; i < end; i += 2) {
    flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i+1] & 0xff));
  }
  return flags;
}
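
Each flag is two bytes, high byte first; a round-trip sketch of the encoding that decodeFlags reverses:

char[] flags = { 'A', (char) 0x2713 };
byte[] encoded = new byte[flags.length * 2];
for (int i = 0; i < flags.length; i++) {
  encoded[2 * i]     = (byte) (flags[i] >>> 8); // high byte
  encoded[2 * i + 1] = (byte) flags[i];         // low byte
}
// decodeFlags(new BytesRef(encoded)) reconstructs { 'A', 0x2713 }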
 
Example 23
Source Project: lucene-solr   Source File: Dictionary.java    License: Apache License 2.0
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
  final FST.BytesReader bytesReader = fst.getBytesReader();
  final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
  final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
  
  // temporary stuff
  final FST.Arc<CharsRef> arc = new FST.Arc<>();
  int longestMatch;
  CharsRef longestOutput;
  
  for (int i = 0; i < sb.length(); i++) {
    arc.copyFrom(firstArc);
    CharsRef output = NO_OUTPUT;
    longestMatch = -1;
    longestOutput = null;
    
    for (int j = i; j < sb.length(); j++) {
      char ch = sb.charAt(j);
      if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
        break;
      } else {
        output = fst.outputs.add(output, arc.output());
      }
      if (arc.isFinal()) {
        longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
        longestMatch = j;
      }
    }
    
    if (longestMatch >= 0) {
      sb.delete(i, longestMatch+1);
      sb.insert(i, longestOutput);
      i += (longestOutput.length - 1);
    }
  }
}
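
A usage sketch, assuming a conversions FST like the one built by parseConversions above (the input text is illustrative):

StringBuilder sb = new StringBuilder("some input text");
Dictionary.applyMappings(conversions, sb); // 'conversions' assumed built elsewhere; call is package-internal
// sb now has every longest left-to-right FST match replaced by its output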
 
Example 24
Source Project: lucene-solr   Source File: TestSynonymGraphFilter.java    License: Apache License 2.0
private void assertMapping(String inputString, String outputString) throws IOException {
  SynonymMap.Builder builder = new SynonymMap.Builder(false);
  // the rules must be lowercased up front, but the incoming tokens will be case insensitive:
  CharsRef input = SynonymMap.Builder.join(inputString.toLowerCase(Locale.ROOT).split(" "), new CharsRefBuilder());
  CharsRef output = SynonymMap.Builder.join(outputString.split(" "), new CharsRefBuilder());
  builder.add(input, output, true);
  Analyzer analyzer = new CustomAnalyzer(builder.build());
  TokenStream tokenStream = analyzer.tokenStream("field", inputString);
  assertTokenStreamContents(tokenStream, new String[]{
      outputString, inputString
    });
}
 
Example 25
Source Project: lucene-solr   Source File: BaseSynonymParserTestCase.java    License: Apache License 2.0
/**
 * Helper method to validate synonym parsing.
 *
 * @param synonynMap  the generated synonym map after parsing
 * @param word        word (phrase) we are validating the synonyms for. Should be the value that comes out of the analyzer.
 *                    All spaces will be replaced by word separators.
 * @param includeOrig if synonyms should include original
 * @param synonyms    actual synonyms. All word separators are replaced with a single space.
 */
public static void assertEntryEquals(SynonymMap synonynMap, String word, boolean includeOrig, String[] synonyms)
    throws Exception {
  word = word.replace(' ', SynonymMap.WORD_SEPARATOR);
  BytesRef value = Util.get(synonynMap.fst, Util.toUTF32(new CharsRef(word), new IntsRefBuilder()));
  assertNotNull("No synonyms found for: " + word, value);

  ByteArrayDataInput bytesReader = new ByteArrayDataInput(value.bytes, value.offset, value.length);
  final int code = bytesReader.readVInt();

  final boolean keepOrig = (code & 0x1) == 0;
  assertEquals("Include original different than expected. Expected " + includeOrig + " was " + keepOrig,
      includeOrig, keepOrig);

  final int count = code >>> 1;
  assertEquals("Invalid synonym count. Expected " + synonyms.length + " was " + count,
      synonyms.length, count);

  Set<String> synonymSet = new HashSet<>(Arrays.asList(synonyms));

  BytesRef scratchBytes = new BytesRef();
  for (int i = 0; i < count; i++) {
    synonynMap.words.get(bytesReader.readVInt(), scratchBytes);
    String synonym = scratchBytes.utf8ToString().replace(SynonymMap.WORD_SEPARATOR, ' ');
    assertTrue("Unexpected synonym found: " + synonym, synonymSet.contains(synonym));
  }
}
 
Example 26
Source Project: lucene-solr   Source File: StemmerTestBase.java    License: Apache License 2.0
static void assertStemsTo(String s, String... expected) {
  assertNotNull(stemmer);
  Arrays.sort(expected);
  
  List<CharsRef> stems = stemmer.stem(s);
  String actual[] = new String[stems.size()];
  for (int i = 0; i < actual.length; i++) {
    actual[i] = stems.get(i).toString();
  }
  Arrays.sort(actual);
  
  assertArrayEquals("expected=" + Arrays.toString(expected) + ",actual=" + Arrays.toString(actual), expected, actual);
}
 
Example 27
Source Project: lucene-solr   Source File: Test64kAffixes.java    License: Apache License 2.0
public void test() throws Exception {
  Path tempDir = createTempDir("64kaffixes");
  Path affix = tempDir.resolve("64kaffixes.aff");
  Path dict = tempDir.resolve("64kaffixes.dic");
  
  BufferedWriter affixWriter = Files.newBufferedWriter(affix, StandardCharsets.UTF_8);
  
  // 65k affixes with flag 1, then an affix with flag 2
  affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
  for (int i = 0; i < 65536; i++) {
    affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
  }
  affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
  affixWriter.close();
  
  BufferedWriter dictWriter = Files.newBufferedWriter(dict, StandardCharsets.UTF_8);
  
  // drink signed with affix 2 (takes -s)
  dictWriter.write("1\ndrink/2\n");
  dictWriter.close();
  
  try (InputStream affStream = Files.newInputStream(affix); InputStream dictStream = Files.newInputStream(dict); Directory tempDir2 = newDirectory()) {
    Dictionary dictionary = new Dictionary(tempDir2, "dictionary", affStream, dictStream);
    Stemmer stemmer = new Stemmer(dictionary);
    // drinks should still stem to drink
    List<CharsRef> stems = stemmer.stem("drinks");
    assertEquals(1, stems.size());
    assertEquals("drink", stems.get(0).toString());
  }
}
 
Example 28
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testWithSynonym() throws Exception {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader("mykeyword"));
  @SuppressWarnings("deprecation")
  SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter);
  assertTokenStreamContents(stream, new String[] {"mykeyword", "mysynonym"}, null, null, new int[] { 1, 0 });
}
 
Example 29
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testWithSynonyms() throws Exception {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  String input = "mykeyword another keyword";
  tokenStream.setReader(new StringReader(input));
  @SuppressWarnings("deprecation")
  SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter, SEP_LABEL, false, 100);
  String[] expectedOutputs = new String[2];
  CharsRefBuilder expectedOutput = new CharsRefBuilder();
  expectedOutput.append("mykeyword");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("another");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("keyword");
  expectedOutputs[0] = expectedOutput.toCharsRef().toString();
  expectedOutput.clear();
  expectedOutput.append("mysynonym");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("another");
  expectedOutput.append(SEP_LABEL);
  expectedOutput.append("keyword");
  expectedOutputs[1] = expectedOutput.toCharsRef().toString();
  assertTokenStreamContents(stream, expectedOutputs, null, null, new int[]{1, 0});
}
 
Example 30
Source Project: lucene-solr   Source File: TestConcatenateGraphFilter.java    License: Apache License 2.0
@Test
public void testValidNumberOfExpansions() throws IOException {
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  for (int i = 0; i < 256; i++) {
    builder.add(new CharsRef("" + (i+1)), new CharsRef("" + (1000 + (i+1))), true);
  }
  StringBuilder valueBuilder = new StringBuilder();
  for (int i = 0 ; i < 8 ; i++) {
    valueBuilder.append(i+1);
    valueBuilder.append(" ");
  }
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader(valueBuilder.toString()));
  @SuppressWarnings("deprecation")
  SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

  int count;
  try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) {
    stream.reset();
    ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
    count = 0;
    while (stream.incrementToken()) {
      count++;
      assertNotNull(attr.getBytesRef());
      assertTrue(attr.getBytesRef().length > 0);
    }
  }
  assertEquals(count, 256);
}