Java Code Examples for org.apache.lucene.analysis.CharArraySet

The following examples show how to use org.apache.lucene.analysis.CharArraySet. These examples are extracted from open source projects; the originating project and source file are noted above each example.
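
Before the project examples, here is a minimal, self-contained sketch of the core CharArraySet API: constructing a set with a capacity hint and a case-sensitivity flag, adding entries, and probing it with Strings or char[] slices. The class name CharArraySetDemo and the sample entries are illustrative, not taken from any project below.

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetDemo {
  public static void main(String[] args) {
    // Initial capacity hint of 8 entries; 'true' makes matching case-insensitive.
    CharArraySet set = new CharArraySet(8, true);
    set.add("Lucene");
    set.add("Solr");

    // Lookups accept CharSequences as well as char[] slices, so analysis
    // code can probe token buffers without allocating a String per token.
    System.out.println(set.contains("lucene"));   // true (case-insensitive)
    char[] buf = "xxSolrxx".toCharArray();
    System.out.println(set.contains(buf, 2, 4));  // true
  }
}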
Example 1
Source Project: lucene-solr   Source File: Stemmer.java    License: Apache License 2.0
/**
 * Find the unique stem(s) of the provided word
 * 
 * @param word Word to find the stems for
 * @param length Length of the word
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char[] word, int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
 
Example 2
Source Project: lucene-solr   Source File: TestStopAnalyzer.java    License: Apache License 2.0
public void testStopList() throws IOException {
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
  try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
    assertNotNull(stream);
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  
    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));
    }
    stream.end();
  }
  newStop.close();
}
 
Example 3
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0
public void testEndIsStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
 
Example 4
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0
public void testMidStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}
 
Example 5
Source Project: lucene-solr   Source File: TestKeepWordFilter.java    License: Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Set<String> words = new HashSet<>();
  words.add( "a" );
  words.add( "b" );
  
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet( words, true));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  
  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}
 
Example 6
Source Project: lucene-solr   Source File: TestCompoundWordTokenFilter.java    License: Apache License 2.0
public void testWordComponentWithLessThanMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("abc", "d", "efg");

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdefg"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    tokenizer,
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

// since "d" is shorter than the minimum subword size, it should not be added to the token stream
  assertTokenStreamContents(tf,
    new String[] { "abcdefg", "abc", "efg" },
    new int[] { 0, 0, 0},
    new int[] { 7, 7, 7},
    new int[] { 1, 0, 0}
    );
}
 
Example 7
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}
 
Example 8
Source Project: lucene-solr   Source File: TestCompoundWordTokenFilter.java    License: Apache License 2.0
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
  CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");

  InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
      .getHyphenationTree(is);

  // the word basket will not be added due to the longest match option
  HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
      whitespaceMockTokenizer("basketballkurv"),
      hyphenator, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
  assertTokenStreamContents(tf, 
      new String[] { "basketballkurv", "basketball", "ball", "kurv" },
      new int[] { 1, 0, 0, 0 }
  );
}
 
Example 9
Source Project: lucene-solr   Source File: TestCompoundWordTokenFilter.java    License: Apache License 2.0
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  
  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
 
Example 10
Source Project: Clusion   Source File: IEX2LevAMAZON.java    License: GNU General Public License v3.0
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
	String line = value.toString();

	CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();
	// We are using a standard tokenizer that eliminates the stop words.
	// We could instead use a stemming tokenizer such as Porter.
	// A set of English noise keywords is used to eliminate
	// words such as "the", "a", etc.
	Analyzer analyzer = new StandardAnalyzer(noise);
	List<String> token = Tokenizer.tokenizeString(analyzer, line);
	Iterator<String> it = token.iterator();
	while (it.hasNext()) {
		word.set(it.next());
		fileName.set(key);
		if (!mapTable.containsKey(fileName.toString() + word.toString())) {
			context.write(fileName, word);
			mapTable.put(fileName.toString() + word.toString(), new IntWritable(1));
		}
	}
}
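
Note that the default stop set used above is unmodifiable (Lucene wraps it with CharArraySet.unmodifiableSet internally), so calling add() on it throws UnsupportedOperationException. If you need to extend it with custom noise words, one pattern is to copy it into a fresh set first; the extra words below are illustrative:

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;

public class ExtendedStopSet {
  public static CharArraySet defaultPlusCustom() {
    // CharArraySet.copy builds a fresh, mutable set from any Set<?>.
    CharArraySet noise = CharArraySet.copy(EnglishAnalyzer.getDefaultStopSet());
    noise.add("etc");
    noise.add("misc");
    return noise;
  }
}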
 
Example 11
Source Project: lucene-solr   Source File: CompoundWordTokenFilterBase.java    License: Apache License 2.0
protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  super(input);
  this.tokens=new LinkedList<>();
  if (minWordSize < 0) {
    throw new IllegalArgumentException("minWordSize cannot be negative");
  }
  this.minWordSize=minWordSize;
  if (minSubwordSize < 0) {
    throw new IllegalArgumentException("minSubwordSize cannot be negative");
  }
  this.minSubwordSize=minSubwordSize;
  if (maxSubwordSize < 0) {
    throw new IllegalArgumentException("maxSubwordSize cannot be negative");
  }
  this.maxSubwordSize=maxSubwordSize;
  this.onlyLongestMatch=onlyLongestMatch;
  this.dictionary = dictionary;
}
 
Example 12
Source Project: crate   Source File: Analysis.java    License: Apache License 2.0
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
    String value = settings.get("stem_exclusion");
    if ("_none_".equals(value)) {
        return CharArraySet.EMPTY_SET;
    }
    List<String> stemExclusion = settings.getAsList("stem_exclusion", null);
    if (stemExclusion != null) {
        // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
        return new CharArraySet(stemExclusion, false);
    } else {
        return defaultStemExclusion;
    }
}
 
Example 13
Source Project: lucene-solr   Source File: TestFilesystemResourceLoader.java    License: Apache License 2.0
private void assertClasspathDelegation(ResourceLoader rl) throws Exception {
  // try a stopwords file from classpath
  CharArraySet set = WordlistLoader.getSnowballWordSet(
    new InputStreamReader(rl.openResource("org/apache/lucene/analysis/snowball/english_stop.txt"), StandardCharsets.UTF_8)
  );
  assertTrue(set.contains("you"));
  // try to load a class; we use string comparison because classloader may be different...
  assertEquals("org.apache.lucene.analysis.util.RollingCharBuffer",
      rl.newInstance("org.apache.lucene.analysis.util.RollingCharBuffer", Object.class).getClass().getName());
}
 
Example 14
Source Project: crate   Source File: LatvianAnalyzerProvider.java    License: Apache License 2.0
LatvianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new LatvianAnalyzer(
        Analysis.parseStopWords(env, settings, LatvianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 15
Source Project: lucene-solr   Source File: TestArabicAnalyzer.java    License: Apache License 2.0
public void testWithStemExclusionSet() throws IOException {
  CharArraySet set = new CharArraySet(asSet("ساهدهات"), false);
  ArabicAnalyzer a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
  a.close();
  
  a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
  a.close();
}
 
Example 16
Source Project: crate   Source File: BengaliAnalyzerProvider.java    License: Apache License 2.0
BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BengaliAnalyzer(
        Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 17
Source Project: lucene-solr   Source File: TestDanishAnalyzer.java    License: Apache License 2.0
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet( asSet("undersøgelse"), false);
  Analyzer a = new DanishAnalyzer( 
      DanishAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "undersøgelse", "undersøgelse");
  checkOneTerm(a, "undersøg", "undersøg");
  a.close();
}
 
Example 18
Source Project: lucene-solr   Source File: TestFinnishLightStemFilter.java    License: Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("edeltäjistään"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "edeltäjistään", "edeltäjistään");
  a.close();
}
 
Example 19
Source Project: coreNlp   Source File: StopwordAnnotator.java    License: Apache License 2.0
public StopwordAnnotator(String annotatorClass, Properties props) {
    this.props = props;

    this.checkLemma = Boolean.parseBoolean(props.getProperty(CHECK_LEMMA, "false"));

    if (this.props.containsKey(STOPWORDS_LIST)) {
        String stopwordList = props.getProperty(STOPWORDS_LIST);
        boolean ignoreCase = Boolean.parseBoolean(props.getProperty(IGNORE_STOPWORD_CASE, "false"));
        this.stopwords = getStopWordList(Version.LUCENE_36, stopwordList, ignoreCase);
    } else {
        this.stopwords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
}
 
Example 20
Source Project: lucene-solr   Source File: TestCatalanAnalyzer.java    License: Apache License 2.0
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("llengües"), false);
  Analyzer a = new CatalanAnalyzer(CatalanAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "llengües", "llengües");
  checkOneTerm(a, "llengua", "llengu");
  a.close();
}
 
Example 21
Source Project: lucene-solr   Source File: TestSuggestStopFilterFactory.java    License: Apache License 2.0
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  assertTrue("loader is null and it shouldn't be", loader != null);
  SuggestStopFilterFactory factory = createFactory(
      "words", "stop-1.txt",
      "ignoreCase", "true");
  CharArraySet words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = createFactory("words", "stop-1.txt, stop-2.txt",
      "ignoreCase", "true");
  words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = createFactory("words", "stop-snowball.txt",
      "format", "snowball",
      "ignoreCase", "true");
  words = factory.getStopWords();
  assertEquals(8, words.size());
  assertTrue(words.contains("he"));
  assertTrue(words.contains("him"));
  assertTrue(words.contains("his"));
  assertTrue(words.contains("himself"));
  assertTrue(words.contains("she"));
  assertTrue(words.contains("her"));
  assertTrue(words.contains("hers"));
  assertTrue(words.contains("herself"));

  // defaults
  factory = createFactory();
  assertEquals(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
  assertEquals(false, factory.isIgnoreCase());
}
 
Example 22
Source Project: lucene-solr   Source File: TestJapaneseNumberFilter.java    License: Apache License 2.0
@Test
public void testName() throws IOException {
  // Test name that normalises to number
  assertAnalyzesTo(analyzer, "田中京一",
      new String[]{"田中", "10000000000000001"}, // 京一 is normalized to a number
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // An analyzer that marks 京一 as a keyword
  Analyzer keywordMarkingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet set = new CharArraySet(1, false);
      set.add("京一");

      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
    }
  };

  assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一",
      new String[]{"田中", "京一"}, // 京一 is not normalized
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
 
Example 23
Source Project: lucene-solr   Source File: TestCompoundWordTokenFilter.java    License: Apache License 2.0
public void testDumbCompoundWordsSE() throws Exception {
  CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
      "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
      "Sko", "Vind", "Rute", "Torkare", "Blad");

  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      whitespaceMockTokenizer(
              "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
      dict);

  assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
      "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
      "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
      "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
      "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
      "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
      "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
      "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 0, 8, 8, 8, 17,
      17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, 69,
      69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137,
      137, 137, 137, 156 }, new int[] { 7, 7, 7, 16, 16, 16, 23, 23, 23, 32,
      32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, 83, 83, 83, 110,
      110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, 155,
      155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
      0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
      0, 0, 0, 1 });
}
 
Example 24
public void testCompleteGraph() throws Exception {
  CommonGramsQueryFilterFactory factory = (CommonGramsQueryFilterFactory) tokenFilterFactory("CommonGramsQuery");
  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue(words.contains("the"));
  Tokenizer tokenizer = whitespaceMockTokenizer("testing the factory works");
  TokenStream stream = factory.create(tokenizer);
  assertGraphStrings(stream, "testing_the the_factory factory works");
}
 
Example 25
Source Project: crate   Source File: ArabicAnalyzerProvider.java    License: Apache License 2.0
ArabicAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    arabicAnalyzer = new ArabicAnalyzer(
        Analysis.parseStopWords(env, settings, ArabicAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    arabicAnalyzer.setVersion(version);
}
 
Example 26
Source Project: lucene-solr   Source File: QueryAutoStopWordAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(), 
      new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getSource(), stopFilter);
}
 
Example 27
Source Project: crate   Source File: FrenchAnalyzerProvider.java    License: Apache License 2.0
FrenchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new FrenchAnalyzer(
        Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 28
Source Project: lucene-solr   Source File: TestFrenchAnalyzer.java    License: Apache License 2.0
public void testExclusionTableViaCtor() throws Exception {
  CharArraySet set = new CharArraySet( 1, true);
  set.add("habitable");
  FrenchAnalyzer fa = new FrenchAnalyzer(
      CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
      "chist" });
  fa.close();

  fa = new FrenchAnalyzer( CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
      "chist" });
  fa.close();
}
 
Example 29
Source Project: lucene-solr   Source File: TestPortugueseLightStemFilter.java    License: Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
  a.close();
}
 
Example 30
Source Project: lucene-solr   Source File: TestGalicianAnalyzer.java    License: Apache License 2.0
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet( asSet("correspondente"), false);
  Analyzer a = new GalicianAnalyzer( 
      GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "correspondente", "correspondente");
  checkOneTerm(a, "corresponderá", "correspond");
  a.close();
}