Java Code Examples for org.apache.lucene.analysis.Tokenizer

The following examples show how to use org.apache.lucene.analysis.Tokenizer. They are extracted from open source projects; the project, author, source file, and license for each example are noted above it.
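
Most of the examples below follow the same TokenStream contract: construct a Tokenizer, hand it input with setReader(), add the attributes you want to inspect, then call reset(), loop over incrementToken(), and finish with end() and close(). The following is a minimal sketch of that lifecycle; it is not taken from any of the projects below, and WhitespaceTokenizer and the sample input are illustrative choices.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenizerLifecycleSketch {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("hello tokenizer world"));
    // Attributes must be requested before reset().
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    tokenizer.reset();                      // required before the first incrementToken()
    while (tokenizer.incrementToken()) {
      // One token per iteration; the attributes reflect the current token.
      System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
    }
    tokenizer.end();                        // records the final offset state
    tokenizer.close();                      // releases the underlying Reader
  }
}

Most examples wrap this pattern either in an anonymous Analyzer's createComponents() or in test helpers such as assertTokenStreamContents(), which drive reset()/incrementToken()/end()/close() internally.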
Example #1
Source Project: lucene-solr   Author: apache   File: TestJapaneseTokenizer.java    License: Apache License 2.0
public void testEmptyUserDict() throws Exception {
  Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
  UserDictionary emptyDict = UserDictionary.open(emptyReader);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  assertAnalyzesTo(analyzer, "これは本ではない",
      new String[]{"これ", "は", "本", "で", "は", "ない"},
      new int[]{0, 2, 3, 4, 5, 6},
      new int[]{2, 3, 4, 5, 6, 8}
  );
  analyzer.close();
}
 
Example #2
Source Project: lucene-solr   Author: apache   File: TestSuggestStopFilter.java    License: Apache License 2.0
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}
 
Example #3
Source Project: lucene-solr   Author: apache   File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0
public void testMassiveLigature() throws IOException {
  String input = "\uFDFA";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"صلى", "الله", "عليه", "وسلم"},
    new int[]{0, 0, 0, 0},
    new int[]{0, 0, 0, 1},
    input.length()
  );
}
 
Example #4
public void testTransformTraditionalSimplified() throws Exception {
    String source = "簡化字";
    String[] expected =  new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_ch");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #5
Source Project: lucene-solr   Author: apache   File: TestCompoundWordTokenFilter.java    License: Apache License 2.0
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("ab", "cd", "ef");

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdef"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    tokenizer,
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  assertTokenStreamContents(tf,
    new String[] { "abcdef", "ab", "cd", "ef" },
    new int[] { 0, 0, 0, 0},
    new int[] { 6, 6, 6, 6},
    new int[] { 1, 0, 0, 0}
    );
}
 
Example #6
Source Project: lucene-solr   Author: apache   File: EdgeNGramTokenFilterTest.java    License: Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  for (int i = 0; i < 10; i++) {
    final int min = TestUtil.nextInt(random(), 2, 10);
    final int max = TestUtil.nextInt(random(), min, 20);
    final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
  
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, 
          new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
      }    
    };
    checkRandomData(random(), a, 10*RANDOM_MULTIPLIER);
    a.close();
  }
}
 
Example #7
Source Project: lucene-solr   Author: apache   File: TestWordDelimiterFilter.java    License: Apache License 2.0
/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
  final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;    

  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };
  
  assertAnalyzesTo(a, "abc-def-123-456", 
      new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" }, 
      new int[] { 0, 0, 0, 0, 4, 8, 8, 12 }, 
      new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
      null,
      new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
      null,
      false);
  a.close();
}
 
Example #8
Source Project: lucene-solr   Author: apache   File: TestBeiderMorseFilter.java    License: Apache License 2.0
public void testCustomAttribute() throws IOException {
  TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
  ((Tokenizer)stream).setReader(new StringReader("D'Angelo"));
  stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  int i = 0;
  while(stream.incrementToken()) {
    assertTrue(keyAtt.isKeyword());
    i++;
  }
  assertEquals(12, i);
  stream.end();
  stream.close();
}
 
Example #9
Source Project: query-autofiltering-component   Author: lucidworks   File: QueryAutoFilteringComponent.java    License: Apache License 2.0
private ArrayList<char[]> tokenize( String input ) throws IOException {
      
  Log.debug( "tokenize '" + input + "'" );
  ArrayList<char[]> tokens = new ArrayList<char[]>( );
  Tokenizer tk = getTokenizerImpl( input );
  
  CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
  tk.reset( );
  while (tk.incrementToken( ) ) {
    int bufLen = term.length();
    char[] copy = new char[ bufLen ];
    System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
    tokens.add( copy );
  }
  // Finish the stream per the TokenStream contract and release the tokenizer.
  tk.end( );
  tk.close( );
  return tokens;
}
 
Example #10
Source Project: lucene-solr   Author: apache   File: TestSynonymMapFilter.java    License: Apache License 2.0
public void testRecursion4() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = true;
  add("zoo zoo", "zoo", keepOrig);
  add("zoo", "zoo zoo", keepOrig);
  final SynonymMap map = b.build();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
    }
  };
  
  assertAnalyzesTo(a, "zoo zoo $ zoo",
      new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
      new int[] { 1, 0, 1, 1, 1, 0, 1 });
  a.close();
}
 
Example #11
Source Project: crate   Author: crate   File: CharGroupTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create() {
    return new CharTokenizer() {
        @Override
        protected boolean isTokenChar(int c) {
            if (tokenizeOnSpace && Character.isWhitespace(c)) {
                return false;
            }
            if (tokenizeOnLetter && Character.isLetter(c)) {
                return false;
            }
            if (tokenizeOnDigit && Character.isDigit(c)) {
                return false;
            }
            if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                return false;
            }
            if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                return false;
            }
            return !tokenizeOnChars.contains(c);
        }
    };
}
 
Example #12
Source Project: lucene-solr   Author: apache   File: TestSoraniStemFilter.java    License: Apache License 2.0
/** test against a basic vocabulary file */
public void testVocabulary() throws Exception {
  // top 8k words or so: freq > 1000
  
  // just normalization+stem, we are testing that the stemming doesn't break.
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      TokenStream stream = new SoraniNormalizationFilter(tokenizer);
      stream = new SoraniStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  assertVocabulary(a, getDataPath("ckbtestdata.zip"), "testdata.txt");
  a.close();
}
 
Example #13
Source Project: lucene-solr   Author: apache   File: TestSynonymMapFilter.java    License: Apache License 2.0
public void testEmptyTerm() throws IOException {
  Random random = random();
  final int numIters = atLeast(10);
  for (int i = 0; i < numIters; i++) {
    b = new SynonymMap.Builder(random.nextBoolean());
    final int numEntries = atLeast(10);
    for (int j = 0; j < numEntries; j++) {
      add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
    }
    final SynonymMap map = b.build();
    final boolean ignoreCase = random.nextBoolean();
    
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
      }
    };

    checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
    analyzer.close();
  }
}
 
Example #14
Source Project: lucene-solr   Author: apache   File: TestDelimitedPayloadTokenFilterFactory.java    License: Apache License 2.0
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example #15
Source Project: lucene-solr   Author: apache   File: TestPackedTokenAttributeImpl.java    License: Apache License 2.0
public void testPackedTokenAttributeFactory() throws Exception {
  TokenStream ts = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  ((Tokenizer)ts).setReader(new StringReader("foo bar"));
  
  assertTrue("CharTermAttribute is not implemented by Token",
    ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("OffsetAttribute is not implemented by Token",
    ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("PositionIncrementAttribute is not implemented by Token", 
    ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("TypeAttribute is not implemented by Token",
    ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);

  assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
      ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);  
}
 
Example #16
Source Project: lucene-solr   Author: apache   File: TestFrenchLightStemFilter.java    License: Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "chevaux", "chevaux");
  a.close();
}
 
Example #17
Source Project: SciGraph   Author: SciGraph   File: EntityAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new WhitespaceTokenizer();
  TokenStream result =
      new PatternReplaceFilter(tokenizer,
          Pattern.compile("^([\\.!\\?,:;\"'\\(\\)]*)(.*?)([\\.!\\?,:;\"'\\(\\)]*)$"), "$2", true);
  result = new PatternReplaceFilter(result, Pattern.compile("'s"), "s", true);
  return new TokenStreamComponents(tokenizer, result);
}
 
Example #18
Source Project: lucene-solr   Author: apache   File: TestGermanAnalyzer.java    License: Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet( 1, true);
  set.add("fischen");
  final Tokenizer in = new LetterTokenizer();
  in.setReader(new StringReader("Fischen Trinken"));
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
 
Example #19
Source Project: lucene-solr   Author: apache   File: TestArabicStemFilter.java    License: Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
 
Example #20
Source Project: lucene-solr   Author: apache   File: PathHierarchyTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory) {
  if (reverse) {
    return new ReversePathHierarchyTokenizer(factory, delimiter, replacement, skip);
  }
  return new PathHierarchyTokenizer(factory, delimiter, replacement, skip);
}
 
Example #21
Source Project: mecab-ko-lucene-analyzer   Author: bibreen   File: MeCabKoStandardTokenizerTest.java    License: Apache License 2.0
@Test
public void testUnknownSurface() throws Exception {
  Tokenizer tokenizer = createTokenizer(
      new StringReader("걀꿀 없는 단어"),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  assertEquals(
      "걀꿀:UNKNOWN:UNKNOWN:null:1:1:0:2,없는:EOJEOL:VA+ETM:null:1:1:3:5,"
      + "단어:N:NNG:null:1:1:6:8,",
      tokenizerToString(tokenizer));
  tokenizer.close();
}
 
Example #22
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, false));
            return new TokenStreamComponents(tokenizer);
        }
    };
}
 
Example #23
Source Project: lucene-solr   Author: apache   File: TestNorwegianLightStemFilter.java    License: Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("sekretæren"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "sekretæren", "sekretæren");
  a.close();
}
 
Example #24
public void testIdentifierNonBreak() throws Exception {
    String source = "ISBN 3-428-84350-9";
    String[] expected = {"ISBN", "3-428-84350-9"};
    String resource = "icu_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenizer, expected);
}
 
Example #25
Source Project: lucene-solr   Author: apache   File: TestUAX29URLEmailTokenizerFactory.java    License: Apache License 2.0
public void testArabic() throws Exception {
  Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
  Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
  stream.setReader(reader);
  assertTokenStreamContents(stream, 
      new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
      "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"  });
}
 
Example #26
Source Project: lucene-solr   Author: apache   File: TestHungarianLightStemFilter.java    License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new HungarianLightStemFilter(source));
    }
  };
}
 
Example #27
Source Project: mecab-ko-lucene-analyzer   Author: bibreen   File: StandardQueryTokenizerFactory.java    License: Apache License 2.0
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  return new MeCabKoTokenizer(
      factory,
      input,
      mecabDicDir,
      new StandardPosAppender(),
      TokenGenerator.NO_DECOMPOUND);
}
 
Example #28
Source Project: lucene-solr   Author: apache   File: TestFrenchMinimalStemFilter.java    License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new FrenchMinimalStemFilter(source));
    }
  };
}
 
Example #29
Source Project: lucene-solr   Author: apache   File: TestICUNormalizer2Filter.java    License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
    }
  };
}
 
Example #30
Source Project: lucene-solr   Author: apache   File: TestHyphenatedWordsFilter.java    License: Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
    }
  };
  
  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}