Java Code Examples for org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter

The following examples show how to use org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter. These examples are extracted from open source projects. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source Project: lucene-solr   Source File: TestMorfologikAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
/** */
public final void testKeywordAttrTokens() throws IOException {
  Analyzer a = new MorfologikAnalyzer() {
    @Override
    protected TokenStreamComponents createComponents(String field) {
      final CharArraySet keywords = new CharArraySet(1, false);
      keywords.add("liście");

      final Tokenizer src = new StandardTokenizer();
      TokenStream result = new SetKeywordMarkerFilter(src, keywords);
      result = new MorfologikFilter(result); 

      return new TokenStreamComponents(src, result);
    }
  };

  assertAnalyzesTo(
    a,
    "liście danych",
    new String[] { "liście", "dany", "dana", "dane", "dać" },
    new int[] { 0, 7, 7, 7, 7 },
    new int[] { 6, 13, 13, 13, 13 },
    new int[] { 1, 1, 0, 0, 0 });
  a.close();
}
 
Example 2
Source Project: lucene-solr   Source File: TestJapaneseNumberFilter.java    License: Apache License 2.0 5 votes vote down vote up
@Test
public void testName() throws IOException {
  // Test name that normalises to number
  assertAnalyzesTo(analyzer, "田中京一",
      new String[]{"田中", "10000000000000001"}, // 京一 is normalized to a number
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // An analyzer that marks 京一 as a keyword
  Analyzer keywordMarkingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet set = new CharArraySet(1, false);
      set.add("京一");

      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
    }
  };

  assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一",
      new String[]{"田中", "京一"}, // 京一 is not normalized
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
 
Example 3
Source Project: lucene-solr   Source File: TestJapaneseBaseFormFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("あり"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.DEFAULT_MODE);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
    }
  };
  assertAnalyzesTo(a, "それはまだ実験段階にあります",
      new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます"  }
  );
  a.close();
}
 
Example 4
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("コーヒー"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new JapaneseKatakanaStemFilter(sink));
    }
  };
  checkOneTerm(a, "コーヒー", "コーヒー");
  a.close();
}
 
Example 5
Source Project: lucene-solr   Source File: TestKoreanNumberFilter.java    License: Apache License 2.0 5 votes vote down vote up
@Test
public void testName() throws IOException {
  // Test name that normalises to number
  assertAnalyzesTo(analyzer, "전중경일",
      new String[]{"전중", "10000000000000001"}, // 경일 is normalized to a number
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // An analyzer that marks 경일 as a keyword
  Analyzer keywordMarkingAnalyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet set = new CharArraySet(1, false);
      set.add("경일");
      UserDictionary userDictionary = readDict();
      Set<POS.Tag> stopTags = new HashSet<>();
      stopTags.add(POS.Tag.SP);
      Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary,
          KoreanTokenizer.DEFAULT_DECOMPOUND, false, false);
      TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
      return new TokenStreamComponents(tokenizer, new KoreanNumberFilter(new SetKeywordMarkerFilter(stream, set)));
    }
  };

  assertAnalyzesTo(keywordMarkingAnalyzer, "전중경일",
      new String[]{"전중", "경일"}, // 경일 is not normalized
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
 
Example 6
Source Project: lucene-solr   Source File: TestGermanStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("sängerinnen"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new GermanStemFilter(sink));
    }
  };
  checkOneTerm(a, "sängerinnen", "sängerinnen");
  a.close();
}
 
Example 7
Source Project: lucene-solr   Source File: TestGermanMinimalStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("sängerinnen"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new GermanMinimalStemFilter(sink));
    }
  };
  checkOneTerm(a, "sängerinnen", "sängerinnen");
  a.close();
}
 
Example 8
Source Project: lucene-solr   Source File: TestGermanAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet( 1, true);
  set.add("fischen");
  final Tokenizer in = new LetterTokenizer();
  in.setReader(new StringReader("Fischen Trinken"));
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
 
Example 9
Source Project: lucene-solr   Source File: TestGermanLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("sängerinnen"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new GermanLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "sängerinnen", "sängerinnen");
  a.close();
}
 
Example 10
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("sekretæren"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new NorwegianMinimalStemFilter(sink));
    }
  };
  checkOneTerm(a, "sekretæren", "sekretæren");
  a.close();
}
 
Example 11
Source Project: lucene-solr   Source File: TestNorwegianLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("sekretæren"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "sekretæren", "sekretæren");
  a.close();
}
 
Example 12
Source Project: lucene-solr   Source File: TestFinnishLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("edeltäjistään"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FinnishLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "edeltäjistään", "edeltäjistään");
  a.close();
}
 
Example 13
Source Project: lucene-solr   Source File: TestArabicStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("ساهدهات");
  MockTokenizer tokenStream  = whitespaceMockTokenizer("ساهدهات");

  ArabicStemFilter filter = new ArabicStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
  assertTokenStreamContents(filter, new String[]{"ساهدهات"});
}
 
Example 14
Source Project: lucene-solr   Source File: TestRussianLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("энергии"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "энергии", "энергии");
  a.close();
}
 
Example 15
Source Project: lucene-solr   Source File: TestSwedishLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("jaktkarlens"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new SwedishLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "jaktkarlens", "jaktkarlens");
  a.close();
}
 
Example 16
Source Project: lucene-solr   Source File: TestHungarianLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("babakocsi"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new HungarianLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "babakocsi", "babakocsi");
  a.close();
}
 
Example 17
Source Project: lucene-solr   Source File: TestBulgarianStemmer.java    License: Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("строеве");
  MockTokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(new StringReader("строевете строеве"));

  BulgarianStemFilter filter = new BulgarianStemFilter(
      new SetKeywordMarkerFilter(tokenStream, set));
  assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
}
 
Example 18
Source Project: lucene-solr   Source File: TestGalicianMinimalStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("elefantes"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new GalicianMinimalStemFilter(sink));
    }
  };
  checkOneTerm(a, "elefantes", "elefantes");
  a.close();
}
 
Example 19
Source Project: lucene-solr   Source File: TestBrazilianAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("Brasília");
  Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Brasília Brasilia"));
  BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), set));

  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
 
Example 20
Source Project: lucene-solr   Source File: TestFrenchLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "chevaux", "chevaux");
  a.close();
}
 
Example 21
Source Project: lucene-solr   Source File: TestFrenchMinimalStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer( MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FrenchMinimalStemFilter(sink));
    }
  };
  checkOneTerm(a, "chevaux", "chevaux");
  a.close();
}
 
Example 22
Source Project: lucene-solr   Source File: TestHunspellStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
/** Simple test for KeywordAttribute */
public void testKeywordAttribute() throws IOException {
  MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
  tokenizer.setEnableChecks(true);
  HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary);
  assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
  
  // assert with keyword marker
  tokenizer = whitespaceMockTokenizer("lucene is awesome");
  CharArraySet set = new CharArraySet( Arrays.asList("Lucene"), true);
  filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary);
  assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
 
Example 23
Source Project: lucene-solr   Source File: TestPorterStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet( 1, true);
  set.add("yourselves");
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("yourselves yours"));
  TokenStream filter = new PorterStemFilter(new SetKeywordMarkerFilter(tokenizer, set));   
  assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
}
 
Example 24
Source Project: lucene-solr   Source File: TestPortugueseLightStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
  a.close();
}
 
Example 25
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
  a.close();
}
 
Example 26
Source Project: lucene-solr   Source File: TestPortugueseStemFilter.java    License: Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
  a.close();
}
 
Example 27
Source Project: lucene-solr   Source File: TestCzechStemmer.java    License: Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("hole");
  final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  in.setReader(new StringReader("hole desek"));
  CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
      in, set));
  assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}
 
Example 28
Source Project: crate   Source File: KeywordMarkerTokenFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
@Override
public TokenStream create(TokenStream tokenStream) {
    if (keywordPattern != null) {
        return new PatternKeywordMarkerFilter(tokenStream, keywordPattern);
    } else {
        return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
    }
}
 
Example 29
@Override
public TokenStream create(TokenStream tokenStream) {
    return new BrazilianStemFilter(new SetKeywordMarkerFilter(tokenStream, exclusions));
}
 
Example 30
@Override
public TokenStream create(TokenStream tokenStream) {
    return new GermanStemFilter(new SetKeywordMarkerFilter(tokenStream, exclusions));
}