Java Code Examples for org.apache.lucene.analysis.CharArraySet#add()

The following examples show how to use org.apache.lucene.analysis.CharArraySet#add(). The originating project and source file are noted above each example.
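Before the project examples, here is a minimal, self-contained sketch (written for this page, not taken from any project below) of the basic contract: the common constructor takes an initial capacity and an ignoreCase flag, and add() accepts a String, a CharSequence, or a char[]. With ignoreCase set to true, lookups match regardless of case.

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetAddDemo {
  public static void main(String[] args) {
    // initial capacity 8, ignoreCase = true
    CharArraySet set = new CharArraySet(8, true);

    set.add("Lucene");              // add a String
    set.add("solr".toCharArray());  // add a char[]

    System.out.println(set.contains("lucene"));   // true: case is ignored
    System.out.println(set.contains("SOLR"));     // true
    System.out.println(set.contains("elastic"));  // false
  }
}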
Example 1
Source File: TestProtectedTermFilter.java    From lucene-solr with Apache License 2.0
public void testBasic() throws IOException {

  CannedTokenStream cts = new CannedTokenStream(
      new Token("Alice", 1, 0, 5),
      new Token("Bob", 1, 6, 9),
      new Token("Clara", 1, 10, 15),
      new Token("David", 1, 16, 21)
  );

  CharArraySet protectedTerms = new CharArraySet(5, true);
  protectedTerms.add("bob");

  // the set is case-insensitive, so "bob" protects the token "Bob" from lowercasing
  TokenStream ts = new ProtectedTermFilter(protectedTerms, cts, LowerCaseFilter::new);
  assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });
}
 
Example 2
Source File: TestMorfologikAnalyzer.java    From lucene-solr with Apache License 2.0
/** Test that tokens marked as keywords via SetKeywordMarkerFilter bypass the Morfologik stemmer. */
public final void testKeywordAttrTokens() throws IOException {
  Analyzer a = new MorfologikAnalyzer() {
    @Override
    protected TokenStreamComponents createComponents(String field) {
      final CharArraySet keywords = new CharArraySet(1, false);
      keywords.add("liście");

      final Tokenizer src = new StandardTokenizer();
      TokenStream result = new SetKeywordMarkerFilter(src, keywords);
      result = new MorfologikFilter(result); 

      return new TokenStreamComponents(src, result);
    }
  };

  assertAnalyzesTo(
    a,
    "liście danych",
    new String[] { "liście", "dany", "dana", "dane", "dać" },
    new int[] { 0, 7, 7, 7, 7 },
    new int[] { 6, 13, 13, 13, 13 },
    new int[] { 1, 1, 0, 0, 0 });
  a.close();
}
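Example 2 and many of the stemming tests below rely on the same mechanism: terms added to a CharArraySet and passed to SetKeywordMarkerFilter get their KeywordAttribute set, and keyword-aware stemmers (such as MorfologikFilter above) leave such tokens untouched. A minimal stand-alone sketch of just that mechanism, using a plain WhitespaceTokenizer instead of the test fixtures (illustrative, not from the project):

import java.io.StringReader;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

public class KeywordMarkerDemo {
  public static void main(String[] args) throws Exception {
    CharArraySet keywords = new CharArraySet(1, false);
    keywords.add("liście");

    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("liście danych"));
    TokenStream ts = new SetKeywordMarkerFilter(tokenizer, keywords);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    KeywordAttribute keywordAtt = ts.addAttribute(KeywordAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      // prints "liście keyword=true" (protected from stemming), "danych keyword=false"
      System.out.println(termAtt + " keyword=" + keywordAtt.isKeyword());
    }
    ts.end();
    ts.close();
  }
}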
 
Example 3
Source File: Stemmer.java    From lucene-solr with Apache License 2.0
/**
 * Find the unique stem(s) of the provided word.
 * 
 * @param word buffer containing the word to find the stems for
 * @param length length of the word within the buffer
 * @return list of unique stems for the word
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
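The loop above is a general de-duplication idiom: because the set is created with dictionary.ignoreCase, entries that differ only in case hash to the same key. A stand-alone sketch of the same idiom over plain Strings (illustrative only, not part of the Lucene source):

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.CharArraySet;

public class DedupDemo {
  // removes duplicates from a list of words, ignoring case
  static List<String> dedupIgnoreCase(List<String> words) {
    CharArraySet seen = new CharArraySet(words.size(), true);
    List<String> deduped = new ArrayList<>();
    for (String w : words) {
      if (!seen.contains(w)) {
        deduped.add(w);
        seen.add(w);
      }
    }
    return deduped;
  }

  public static void main(String[] args) {
    // prints [Dane, dać]: "dane" is dropped because case is ignored
    System.out.println(dedupIgnoreCase(List.of("Dane", "dane", "dać")));
  }
}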
 
Example 4
Source File: TestBulgarianAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithStemExclusionSet() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("строеве");
  Analyzer a = new BulgarianAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "строевете строеве", new String[] { "строй", "строеве" });
  a.close();
}
 
Example 5
Source File: TestConditionalTokenFilter.java    From lucene-solr with Apache License 2.0
public void testFilteredTokenFilters() throws IOException {

  CharArraySet protectedTerms = new CharArraySet(2, true);
  protectedTerms.add("foobar");

  // LengthFilter(1, 4) would normally drop "foobar" (6 chars) and "wuthering" (9 chars),
  // but "foobar" is in the protected set, so only "wuthering" is removed
  TokenStream ts = whitespaceMockTokenizer("wuthering foobar abc");
  ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
  assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });

  ts = whitespaceMockTokenizer("foobar abc");
  ts = new ProtectedTermFilter(protectedTerms, ts, in -> new LengthFilter(in, 1, 4));
  assertTokenStreamContents(ts, new String[]{ "foobar", "abc" });
}
 
Example 6
Source File: AutoPhrasingTokenFilter.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private CharArraySet remove(char[] charArray) {
    CharArraySet newSet = new CharArraySet(5, false);
    for (Object aCurrentSetToCheck : currentSetToCheck) {
        char[] phrase = (char[]) aCurrentSetToCheck;
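        // precedence note: && binds tighter than ||, so the test below reads as
        // (!isEqualTo(phrase, charArray) && startsWith(phrase, charArray)) || endsWith(charArray, phrase)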
        if (!isEqualTo(phrase, charArray) && startsWith(phrase, charArray) || endsWith(charArray, phrase)) {
            newSet.add(phrase);
        }
    }
    return newSet;
}
 
Example 7
Source File: StopwordAnnotator.java    From coreNlp with Apache License 2.0
public static CharArraySet getStopWordList(Version luceneVersion, String stopwordList, boolean ignoreCase) {
    String[] terms = stopwordList.split(",");
    CharArraySet stopwordSet = new CharArraySet(luceneVersion, terms.length, ignoreCase);
    for (String term : terms) {
        stopwordSet.add(term);
    }
    return CharArraySet.unmodifiableSet(stopwordSet);
}
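Two details of Example 7 are worth noting. The three-argument constructor taking a Version comes from older Lucene releases (4.x and earlier); in current Lucene the equivalent call is simply new CharArraySet(terms.length, ignoreCase). Also, CharArraySet.unmodifiableSet() returns a read-only view, so the set must be fully populated first: calling add() on the returned set throws UnsupportedOperationException. A quick sketch:

CharArraySet base = new CharArraySet(2, true);
base.add("foo");
CharArraySet frozen = CharArraySet.unmodifiableSet(base);
System.out.println(frozen.contains("FOO"));  // true: the view keeps the ignoreCase behavior
// frozen.add("bar");  // would throw UnsupportedOperationException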
 
Example 8
Source File: TestKeywordMarkerFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testSetFilterIncrementToken() throws IOException {
  CharArraySet set = new CharArraySet(5, true);
  set.add("lucenefox");
  String[] output = new String[] { "the", "quick", "brown", "LuceneFox",
      "jumps" };
  assertTokenStreamContents(new LowerCaseFilterMock(
      new SetKeywordMarkerFilter(whitespaceMockTokenizer("The quIck browN LuceneFox Jumps"), set)), output);
  CharArraySet mixedCaseSet = new CharArraySet(asSet("LuceneFox"), false);
  assertTokenStreamContents(new LowerCaseFilterMock(
      new SetKeywordMarkerFilter(whitespaceMockTokenizer("The quIck browN LuceneFox Jumps"), mixedCaseSet)), output);
  CharArraySet set2 = set;
  assertTokenStreamContents(new LowerCaseFilterMock(
      new SetKeywordMarkerFilter(whitespaceMockTokenizer("The quIck browN LuceneFox Jumps"), set2)), output);
}
 
Example 9
Source File: TestFrenchAnalyzer.java    From lucene-solr with Apache License 2.0
public void testExclusionTableViaCtor() throws Exception {
  CharArraySet set = new CharArraySet(1, true);
  set.add("habitable");
  FrenchAnalyzer fa = new FrenchAnalyzer(
      CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
      "chist" });
  fa.close();

  fa = new FrenchAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
      "chist" });
  fa.close();
}
 
Example 10
Source File: TestCzechStemmer.java    From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("hole");
  final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  in.setReader(new StringReader("hole desek"));
  CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
      in, set));
  assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}
 
Example 11
Source File: TestBrazilianAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("Brasília");
  Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Brasília Brasilia"));
  BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), set));

  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
 
Example 12
Source File: TestLithuanianAnalyzer.java    From lucene-solr with Apache License 2.0
/** Test stemmer exceptions */
public void testStemExclusion() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("vaikų");
  Analyzer a = new LithuanianAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "vaikų", new String[] {"vaikų"});
  a.close();
}
 
Example 13
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testName() throws IOException {
  // Test a name that normalises to a number
  assertAnalyzesTo(analyzer, "田中京一",
      new String[]{"田中", "10000000000000001"}, // 京一 is normalized to a number
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // An analyzer that marks 京一 as a keyword
  Analyzer keywordMarkingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet set = new CharArraySet(1, false);
      set.add("京一");

      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
    }
  };

  assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一",
      new String[]{"田中", "京一"}, // 京一 is not normalized
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
 
Example 14
Source File: TestBulgarianStemmer.java    From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("строеве");
  MockTokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(new StringReader("строевете строеве"));

  BulgarianStemFilter filter = new BulgarianStemFilter(
      new SetKeywordMarkerFilter(tokenStream, set));
  assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
}
 
Example 15
Source File: TestRussianAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithStemExclusionSet() throws Exception {
  CharArraySet set = new CharArraySet(1, true);
  set.add("представление");
  Analyzer a = new RussianAnalyzer(RussianAnalyzer.getDefaultStopSet(), set);
  assertAnalyzesTo(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
      new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
  a.close();
}
 
Example 16
Source File: TestArabicStemFilter.java    From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("ساهدهات");
  MockTokenizer tokenStream = whitespaceMockTokenizer("ساهدهات");

  ArabicStemFilter filter = new ArabicStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
  assertTokenStreamContents(filter, new String[]{"ساهدهات"});
}
 
Example 17
Source File: TestDutchAnalyzer.java    From lucene-solr with Apache License 2.0
public void testExclusionTableViaCtor() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("lichamelijk");
  DutchAnalyzer a = new DutchAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
  a.close();

  a = new DutchAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
  a.close();
}
 
Example 18
Source File: TestGermanAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("fischen");
  final Tokenizer in = new LetterTokenizer();
  in.setReader(new StringReader("Fischen Trinken"));
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
 
Example 19
Source File: TestCzechAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithStemExclusionSet() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("hole");
  CzechAnalyzer cz = new CzechAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(cz, "hole desek", new String[] {"hole", "desk"});
  cz.close();
}
 
Example 20
Source File: FingerprintFilter.java    From lucene-solr with Apache License 2.0
/**
 * Gathers all tokens from the input, de-duplicates them, sorts them, then
 * concatenates them into a single output token.
 * 
 * @return false for end of stream; true otherwise
 */
private final boolean buildSingleOutputToken() throws IOException {
  inputEnded = false;

  char clonedLastTerm[] = null;
  uniqueTerms = new CharArraySet(8, false);
  int outputTokenSize = 0;
  while (input.incrementToken()) {
    if (outputTokenSize > maxOutputTokenSize) {
      continue;
    }

    final char term[] = termAttribute.buffer();
    final int length = termAttribute.length();

    if (!uniqueTerms.contains(term, 0, length)) {
      // clone the term, and add to the set of seen terms.
      clonedLastTerm = new char[length];
      System.arraycopy(term, 0, clonedLastTerm, 0, length);
      if (uniqueTerms.size() > 0) {
        outputTokenSize++; //Add 1 for the separator char we will output
      }
      uniqueTerms.add(clonedLastTerm);
      outputTokenSize += length;
    }
  }
  //Force end-of-stream operations to get the final state.
  input.end();
  inputEnded = true;

  //Gathering complete - now output exactly zero or one token:

  //Set the attributes for the single output token
  offsetAtt.setOffset(0, offsetAtt.endOffset());
  posLenAtt.setPositionLength(1);
  posIncrAtt.setPositionIncrement(1);
  typeAtt.setType("fingerprint");

  //No tokens gathered - no output
  if (uniqueTerms.size() < 1) {
    termAttribute.setEmpty();
    return false;
  }

  //Tokens gathered are too large - no output
  if (outputTokenSize > maxOutputTokenSize) {
    termAttribute.setEmpty();
    uniqueTerms.clear();
    return false;
  }

  // Special case - faster option when we have a single token
  if (uniqueTerms.size() == 1) {
    termAttribute.setEmpty().append(new String(clonedLastTerm));
    uniqueTerms.clear();
    return true;
  }

  // Sort the set of deduplicated tokens and combine 
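  // CharArraySet stores its entries internally as char[] keys, so the Object[]
  // returned by toArray() can safely be cast element-by-element to char[] below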
  Object[] items = uniqueTerms.toArray();

  Arrays.sort(items, new Comparator<Object>() {
    @Override
    public int compare(Object o1, Object o2) {
      char v1[] = (char[]) o1;
      char v2[] = (char[]) o2;
      int len1 = v1.length;
      int len2 = v2.length;
      int lim = Math.min(len1, len2);

      int k = 0;
      while (k < lim) {
        char c1 = v1[k];
        char c2 = v2[k];
        if (c1 != c2) {
          return c1 - c2;
        }
        k++;
      }
      return len1 - len2;
    }
  });

  //TODO lets append directly to termAttribute?
  StringBuilder sb = new StringBuilder();
  for (Object item : items) {
    if (sb.length() >= 1) {
      sb.append(separator);
    }
    sb.append((char[]) item);
  }
  termAttribute.setEmpty().append(sb);
  uniqueTerms.clear();
  return true;

}