org.apache.lucene.analysis.CharArraySet Java Examples

The following examples show how to use org.apache.lucene.analysis.CharArraySet. They are drawn from open source projects; the source file, originating project, and license are listed above each example.
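Before the examples, a minimal sketch of the core API may be useful (the words used here are illustrative, not from any project): a CharArraySet stores terms internally as char[], so analysis chains can test membership without allocating a String per token, and the ignoreCase flag is fixed at construction time.

import org.apache.lucene.analysis.CharArraySet;

import java.util.Arrays;

public class CharArraySetBasics {
  public static void main(String[] args) {
    // false = case-sensitive; true would fold entries and lookups to lower case
    CharArraySet set = new CharArraySet(Arrays.asList("foo", "bar"), false);
    set.add("baz");

    System.out.println(set.contains("foo"));  // true
    System.out.println(set.contains("FOO"));  // false: this set is case-sensitive

    // The char[] overload checks a slice of a token buffer without creating a String
    char[] buffer = "xxbarxx".toCharArray();
    System.out.println(set.contains(buffer, 2, 3));  // true ("bar")
  }
}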
Example #1
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
  CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");

  InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
      .getHyphenationTree(is);

  // the word basket will not be added due to the longest match option
  HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
      whitespaceMockTokenizer("basketballkurv"),
      hyphenator, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
  assertTokenStreamContents(tf, 
      new String[] { "basketballkurv", "basketball", "ball", "kurv" },
      new int[] { 1, 0, 0, 0 }
  );

}
 
Example #2
Source File: Stemmer.java    From lucene-solr with Apache License 2.0
/**
 * Find the unique stem(s) of the provided word
 * 
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
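The deduplication idiom above works for any list of terms. A small sketch with made-up inputs (not part of the Stemmer class) shows how an ignoreCase set, which is what dictionary.ignoreCase enables here, folds case variants into a single entry:

CharArraySet seen = new CharArraySet(8, true);  // true = ignore case
List<String> deduped = new ArrayList<>();
for (String s : Arrays.asList("Apple", "APPLE", "pear")) {
  if (!seen.contains(s)) {
    deduped.add(s);  // the first spelling wins; later case variants are skipped
    seen.add(s);
  }
}
// deduped now holds ["Apple", "pear"]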
 
Example #3
Source File: TestStopAnalyzer.java    From lucene-solr with Apache License 2.0
public void testStopList() throws IOException {
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
  try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
    assertNotNull(stream);
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  
    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));
    }
    stream.end();
  }
  newStop.close();
}
 
Example #4
Source File: TestSuggestStopFilter.java    From lucene-solr with Apache License 2.0
public void testEndIsStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
 
Example #5
Source File: IEX2LevAMAZON.java    From Clusion with GNU General Public License v3.0
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
	String line = value.toString();

	CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();
	// We use a standard analyzer that eliminates English stop words.
	// A stemming tokenizer, such as Porter, could be used instead.
	// The noise set removes words such as "the", "a", etc.
	Analyzer analyzer = new StandardAnalyzer(noise);
	List<String> token = Tokenizer.tokenizeString(analyzer, line);
	Iterator<String> it = token.iterator();
	while (it.hasNext()) {
		word.set(it.next());
		fileName.set(key);
		if (!mapTable.containsKey(fileName.toString() + word.toString())) {
			context.write(fileName, word);
			mapTable.put(fileName.toString() + word.toString(), new IntWritable(1));
		}
	}
}
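The default English stop set used above is immutable. If a corpus needs additional noise words, one possible approach (the added words below are illustrative) is to copy it into a mutable CharArraySet before passing it to the analyzer:

// CharArraySet.copy returns a mutable copy; the default set itself cannot be modified
CharArraySet noise = CharArraySet.copy(EnglishAnalyzer.getDefaultStopSet());
noise.add("etc");   // hypothetical domain-specific noise words
noise.add("misc");
Analyzer analyzer = new StandardAnalyzer(noise);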
 
Example #6
Source File: TestSuggestStopFilter.java    From lucene-solr with Apache License 2.0
public void testMidStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}
 
Example #7
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  
  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
 
Example #8
Source File: CompoundWordTokenFilterBase.java    From lucene-solr with Apache License 2.0
protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  super(input);
  this.tokens = new LinkedList<>();
  if (minWordSize < 0) {
    throw new IllegalArgumentException("minWordSize cannot be negative");
  }
  this.minWordSize = minWordSize;
  if (minSubwordSize < 0) {
    throw new IllegalArgumentException("minSubwordSize cannot be negative");
  }
  this.minSubwordSize = minSubwordSize;
  if (maxSubwordSize < 0) {
    throw new IllegalArgumentException("maxSubwordSize cannot be negative");
  }
  this.maxSubwordSize = maxSubwordSize;
  this.onlyLongestMatch = onlyLongestMatch;
  this.dictionary = dictionary;
}
 
Example #9
Source File: AutoPhrasingTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}
 
Example #10
Source File: TestKeepWordFilter.java    From lucene-solr with Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Set<String> words = new HashSet<>();
  words.add("a");
  words.add("b");
  
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet(words, true));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  
  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}
 
Example #11
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testWordComponentWithLessThanMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("abc", "d", "efg");

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdefg"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    tokenizer,
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

// since "d" is shorter than the minimum subword size, it should not be added to the token stream
  assertTokenStreamContents(tf,
    new String[] { "abcdefg", "abc", "efg" },
    new int[] { 0, 0, 0},
    new int[] { 7, 7, 7},
    new int[] { 1, 0, 0}
    );
}
 
Example #12
Source File: ArabicAnalyzerProvider.java    From crate with Apache License 2.0
ArabicAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    arabicAnalyzer = new ArabicAnalyzer(
        Analysis.parseStopWords(env, settings, ArabicAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    arabicAnalyzer.setVersion(version);
}
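CharArraySet.EMPTY_SET serves as the default stem-exclusion set when no setting is present. It is a shared unmodifiable instance, so it must never be mutated; calling add() on it throws UnsupportedOperationException. A sketch of the same pattern without the provider plumbing:

// EMPTY_SET is safe to share between analyzers precisely because it is unmodifiable
ArabicAnalyzer analyzer = new ArabicAnalyzer(
    ArabicAnalyzer.getDefaultStopSet(), CharArraySet.EMPTY_SET);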
 
Example #13
Source File: TestSuggestStopFilterFactory.java    From lucene-solr with Apache License 2.0
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  assertTrue("loader is null and it shouldn't be", loader != null);
  SuggestStopFilterFactory factory = createFactory(
      "words", "stop-1.txt",
      "ignoreCase", "true");
  CharArraySet words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = createFactory("words", "stop-1.txt, stop-2.txt",
      "ignoreCase", "true");
  words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = createFactory("words", "stop-snowball.txt",
      "format", "snowball",
      "ignoreCase", "true");
  words = factory.getStopWords();
  assertEquals(8, words.size());
  assertTrue(words.contains("he"));
  assertTrue(words.contains("him"));
  assertTrue(words.contains("his"));
  assertTrue(words.contains("himself"));
  assertTrue(words.contains("she"));
  assertTrue(words.contains("her"));
  assertTrue(words.contains("hers"));
  assertTrue(words.contains("herself"));

  // defaults
  factory = createFactory();
  assertEquals(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
  assertEquals(false, factory.isIgnoreCase());
}
 
Example #14
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testName() throws IOException {
  // Test a name that normalizes to a number
  assertAnalyzesTo(analyzer, "田中京一",
      new String[]{"田中", "10000000000000001"}, // 京一 is normalized to a number
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // An analyzer that marks 京一 as a keyword
  Analyzer keywordMarkingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet set = new CharArraySet(1, false);
      set.add("京一");

      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
    }
  };

  assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一",
      new String[]{"田中", "京一"}, // 京一 is not normalized
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
 
Example #15
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testDumbCompoundWordsSE() throws Exception {
  CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
      "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
      "Sko", "Vind", "Rute", "Torkare", "Blad");

  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      whitespaceMockTokenizer(
              "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
      dict);

  assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
      "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
      "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
      "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
      "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
      "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
      "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
      "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 0, 8, 8, 8, 17,
      17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, 69,
      69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137,
      137, 137, 137, 156 }, new int[] { 7, 7, 7, 16, 16, 16, 23, 23, 23, 32,
      32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, 83, 83, 83, 110,
      110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, 155,
      155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
      0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
      0, 0, 0, 1 });
}
 
Example #16
Source File: BengaliAnalyzerProvider.java    From crate with Apache License 2.0
BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BengaliAnalyzer(
        Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example #17
Source File: TestArabicAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithStemExclusionSet() throws IOException {
  CharArraySet set = new CharArraySet(asSet("ساهدهات"), false);
  ArabicAnalyzer a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
  a.close();
  
  a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
  a.close();
}
 
Example #18
Source File: TestCommonGramsQueryFilterFactory.java    From lucene-solr with Apache License 2.0
public void testCompleteGraph() throws Exception {
  CommonGramsQueryFilterFactory factory = (CommonGramsQueryFilterFactory) tokenFilterFactory("CommonGramsQuery");
  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue(words.contains("the"));
  Tokenizer tokenizer = whitespaceMockTokenizer("testing the factory works");
  TokenStream stream = factory.create(tokenizer);
  assertGraphStrings(stream, "testing_the the_factory factory works");
}
 
Example #19
Source File: QueryAutoStopWordAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(), 
      new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getSource(), stopFilter);
}
 
Example #20
Source File: FrenchAnalyzerProvider.java    From crate with Apache License 2.0
FrenchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new FrenchAnalyzer(
        Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example #21
Source File: TestFrenchAnalyzer.java    From lucene-solr with Apache License 2.0
public void testExclusionTableViaCtor() throws Exception {
  CharArraySet set = new CharArraySet(1, true);
  set.add("habitable");
  FrenchAnalyzer fa = new FrenchAnalyzer(
      CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
      "chist" });
  fa.close();

  fa = new FrenchAnalyzer( CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
      "chist" });
  fa.close();
}
 
Example #22
Source File: TestPortugueseLightStemFilter.java    From lucene-solr with Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
  a.close();
}
 
Example #23
Source File: TestGalicianAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("correspondente"), false);
  Analyzer a = new GalicianAnalyzer(
      GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "correspondente", "correspondente");
  checkOneTerm(a, "corresponderá", "correspond");
  a.close();
}
 
Example #24
Source File: TestCapitalizationFilter.java    From lucene-solr with Apache License 2.0
static void assertCapitalizesToKeyword(String input, String expected,
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  final MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
  tokenizer.setReader(new StringReader(input));
  assertCapitalizesTo(tokenizer,
      new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
      minWordLength, maxWordCount, maxTokenLength);    
}
 
Example #25
Source File: TestThaiAnalyzer.java    From lucene-solr with Apache License 2.0
public void testReusableTokenStream() throws Exception {
  ThaiAnalyzer analyzer = new ThaiAnalyzer(CharArraySet.EMPTY_SET);
  assertAnalyzesTo(analyzer, "", new String[] {});
  
  assertAnalyzesTo(
      analyzer,
      "การที่ได้ต้องแสดงว่างานดี",
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
  
  assertAnalyzesTo(
      analyzer,
      "บริษัทชื่อ XY&Z - คุยกับ [email protected]",
      new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
  analyzer.close();
}
 
Example #26
Source File: LithuanianAnalyzerProvider.java    From crate with Apache License 2.0
LithuanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new LithuanianAnalyzer(
        Analysis.parseStopWords(env, settings, LithuanianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example #27
Source File: TestCapitalizationFilter.java    From lucene-solr with Apache License 2.0
static void assertCapitalizesTo(Tokenizer tokenizer, String expected[],
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep, 
      forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
  assertTokenStreamContents(filter, expected);    
}
 
Example #28
Source File: GalicianAnalyzerProvider.java    From crate with Apache License 2.0
GalicianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new GalicianAnalyzer(
        Analysis.parseStopWords(env, settings, GalicianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example #29
Source File: TestBrazilianAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("Brasília");
  Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Brasília Brasilia"));
  BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), set));

  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
 
Example #30
Source File: TestWordDelimiterGraphFilter.java    From lucene-solr with Apache License 2.0
public void testProtectedWords() throws Exception {
  TokenStream tokens = new CannedTokenStream(new Token("foo17-bar", 0, 9),
                                             new Token("foo-bar", 0, 7));

  CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
  WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
  assertGraphStrings(wdf,
                     "foo17-bar foo bar",
                     "foo17-bar foo-bar",
                     "foo17-bar foobar");
}