Java Code Examples for org.apache.lucene.analysis.Tokenizer

The following examples show how to use org.apache.lucene.analysis.Tokenizer. These examples are extracted from open source projects; you can go to the original project or source file by following the links above each example.
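
Every Tokenizer below follows the same lifecycle: construct it, attach input with setReader(), call reset() before the first incrementToken(), and finish with end() and close(). As a reference point for the examples, here is a minimal, self-contained sketch of that contract (the class name TokenizerLifecycle is illustrative; WhitespaceTokenizer is one of Lucene's stock tokenizers):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerLifecycle {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("hello tokenizer world"));
    // grab the term attribute up front; its buffer is reused for each token
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString()); // hello, tokenizer, world
    }
    tokenizer.end();   // finalizes offset state
    tokenizer.close(); // releases the underlying reader
  }
}
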
Example 1
Source Project: lucene-solr   Source File: TestJapaneseTokenizer.java    License: Apache License 2.0
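// An empty user dictionary must not break JapaneseTokenizer's normal segmentation.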
public void testEmptyUserDict() throws Exception {
  Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
  UserDictionary emptyDict = UserDictionary.open(emptyReader);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  assertAnalyzesTo(analyzer, "これは本ではない",
      new String[]{"これ", "は", "本", "で", "は", "ない"},
      new int[]{0, 2, 3, 4, 5, 6},
      new int[]{2, 3, 4, 5, 6, 8}
  );
  analyzer.close();
}
 
Example 2
Source Project: lucene-solr   Source File: TestSuggestStopFilter.java    License: Apache License 2.0
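// The trailing stop word "to" is kept (and flagged as a keyword in the boolean[]
// argument) because the user may still be typing it.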
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}
 
Example 3
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0
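// NFKC_CF normalization expands the single ligature U+FDFA into four Arabic words;
// all offsets map back into the one-character input.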
public void testMassiveLigature() throws IOException {
  String input = "\uFDFA";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"صلى", "الله", "عليه", "وسلم"},
    new int[]{0, 0, 0, 0},
    new int[]{0, 0, 0, 1},
    input.length()
  );
}
 
Example 4
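// An ICU transform token filter converts Traditional Chinese to Simplified Chinese.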
public void testTransformTraditionalSimplified() throws Exception {
    String source = "簡化字";
    String[] expected =  new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_ch");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 5
Source Project: lucene-solr   Source File: TestCompoundWordTokenFilter.java    License: Apache License 2.0
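// "abcdef" is decompounded into the dictionary subwords "ab", "cd", "ef",
// with the original token kept in first position.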
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("ab", "cd", "ef");

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdef"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    tokenizer,
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  assertTokenStreamContents(tf,
    new String[] { "abcdef", "ab", "cd", "ef" },
    new int[] { 0, 0, 0, 0},
    new int[] { 6, 6, 6, 6},
    new int[] { 1, 0, 0, 0}
    );
}
 
Example 6
Source Project: lucene-solr   Source File: EdgeNGramTokenFilterTest.java    License: Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  for (int i = 0; i < 10; i++) {
    final int min = TestUtil.nextInt(random(), 2, 10);
    final int max = TestUtil.nextInt(random(), min, 20);
    final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
  
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, 
          new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
      }    
    };
    checkRandomData(random(), a, 10*RANDOM_MULTIPLIER);
    a.close();
  }
}
 
Example 7
Source Project: lucene-solr   Source File: TestWordDelimiterFilter.java    License: Apache License 2.0
/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
  final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;    

  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };
  
  assertAnalyzesTo(a, "abc-def-123-456", 
      new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" }, 
      new int[] { 0, 0, 0, 0, 4, 8, 8, 12 }, 
      new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
      null,
      new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
      null,
      false);
  a.close();
}
 
Example 8
Source Project: lucene-solr   Source File: TestBeiderMorseFilter.java    License: Apache License 2.0
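// PatternKeywordMarkerFilter marks every token as a keyword; the flag must survive
// BeiderMorseFilter's phonetic expansion into 12 output tokens.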
public void testCustomAttribute() throws IOException {
  TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
  ((Tokenizer)stream).setReader(new StringReader("D'Angelo"));
  stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  int i = 0;
  while(stream.incrementToken()) {
    assertTrue(keyAtt.isKeyword());
    i++;
  }
  assertEquals(12, i);
  stream.end();
  stream.close();
}
 
Example 9
private ArrayList<char[]> tokenize( String input ) throws IOException {
      
  Log.debug( "tokenize '" + input + "'" );
  ArrayList<char[]> tokens = new ArrayList<char[]>( );
  Tokenizer tk = getTokenizerImpl( input );
  
  CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
  tk.reset( );
  while (tk.incrementToken( ) ) {
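    // copy the term buffer: the attribute reuses it on the next incrementToken()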
    int bufLen = term.length();
    char[] copy = new char[ bufLen ];
    System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
    tokens.add( copy );
  }
  tk.end( );
  tk.close( );

  return tokens;
}
 
Example 10
Source Project: lucene-solr   Source File: TestSynonymMapFilter.java    License: Apache License 2.0
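// Recursive synonym rules ("zoo zoo" <-> "zoo") with originals kept;
// the int[] checks the resulting position increments.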
public void testRecursion4() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = true;
  add("zoo zoo", "zoo", keepOrig);
  add("zoo", "zoo zoo", keepOrig);
  final SynonymMap map = b.build();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
    }
  };
  
  assertAnalyzesTo(a, "zoo zoo $ zoo",
      new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
      new int[] { 1, 0, 1, 1, 1, 0, 1 });
  a.close();
}
 
Example 11
Source Project: crate   Source File: CharGroupTokenizerFactory.java    License: Apache License 2.0
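// Builds a CharTokenizer that breaks tokens on any configured character class
// (whitespace, letter, digit, punctuation, symbol) or explicitly listed character.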
@Override
public Tokenizer create() {
    return new CharTokenizer() {
        @Override
        protected boolean isTokenChar(int c) {
            if (tokenizeOnSpace && Character.isWhitespace(c)) {
                return false;
            }
            if (tokenizeOnLetter && Character.isLetter(c)) {
                return false;
            }
            if (tokenizeOnDigit && Character.isDigit(c)) {
                return false;
            }
            if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                return false;
            }
            if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                return false;
            }
            return !tokenizeOnChars.contains(c);
        }
    };
}
 
Example 12
Source Project: lucene-solr   Source File: TestSoraniStemFilter.java    License: Apache License 2.0
/** test against a basic vocabulary file */
public void testVocabulary() throws Exception {
  // top 8k words or so: freq > 1000
  
  // just normalization+stem, we are testing that the stemming doesn't break.
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      TokenStream stream = new SoraniNormalizationFilter(tokenizer);
      stream = new SoraniStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  assertVocabulary(a, getDataPath("ckbtestdata.zip"), "testdata.txt");
  a.close();
}
 
Example 13
Source Project: lucene-solr   Source File: TestSynonymMapFilter.java    License: Apache License 2.0
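// Feeds the empty string through randomly built SynonymMaps to check analysis consistency.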
public void testEmptyTerm() throws IOException {
  Random random = random();
  final int numIters = atLeast(10);
  for (int i = 0; i < numIters; i++) {
    b = new SynonymMap.Builder(random.nextBoolean());
    final int numEntries = atLeast(10);
    for (int j = 0; j < numEntries; j++) {
      add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
    }
    final SynonymMap map = b.build();
    final boolean ignoreCase = random.nextBoolean();
    
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
      }
    };

    checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
    analyzer.close();
  }
}
 
Example 14
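// Each token carries a '*'-delimited float payload, which is decoded and checked to equal 0.1.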
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example 15
Source Project: lucene-solr   Source File: TestPackedTokenAttributeImpl.java    License: Apache License 2.0
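// The default token attribute factory backs the four core attributes with a single
// PackedTokenAttributeImpl, while FlagsAttribute gets its own implementation.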
public void testPackedTokenAttributeFactory() throws Exception {
  TokenStream ts = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  ((Tokenizer)ts).setReader(new StringReader("foo bar"));
  
  assertTrue("CharTermAttribute is not implemented by Token",
    ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("OffsetAttribute is not implemented by Token",
    ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("PositionIncrementAttribute is not implemented by Token", 
    ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("TypeAttribute is not implemented by Token",
    ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);

  assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
      ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);  
}
 
Example 16
Source Project: lucene-solr   Source File: TestFrenchLightStemFilter.java    License: Apache License 2.0
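// "chevaux" is in the exclusion set, so SetKeywordMarkerFilter protects it from the stemmer.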
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "chevaux", "chevaux");
  a.close();
}
 
Example 17
Source Project: SciGraph   Source File: EntityAnalyzer.java    License: Apache License 2.0
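// Strips leading/trailing punctuation from each whitespace token, then rewrites "'s" to "s".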
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new WhitespaceTokenizer();
  TokenStream result =
      new PatternReplaceFilter(tokenizer,
          Pattern.compile("^([\\.!\\?,:;\"'\\(\\)]*)(.*?)([\\.!\\?,:;\"'\\(\\)]*)$"), "$2", true);
  result = new PatternReplaceFilter(result, Pattern.compile("'s"), "s", true);
  return new TokenStreamComponents(tokenizer, result);
}
 
Example 18
Source Project: lucene-solr   Source File: TestGermanAnalyzer.java    License: Apache License 2.0
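// The keyword-marked "fischen" passes through GermanStemFilter unchanged,
// while "Trinken" is lowercased and stemmed to "trink".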
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet( 1, true);
  set.add("fischen");
  final Tokenizer in = new LetterTokenizer();
  in.setReader(new StringReader("Fischen Trinken"));
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
 
Example 19
Source Project: lucene-solr   Source File: TestArabicStemFilter.java    License: Apache License 2.0
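// KeywordTokenizer lets the empty string through; ArabicStemFilter must handle the zero-length term.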
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new ArabicStemFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
 
Example 20
Source Project: lucene-solr   Source File: PathHierarchyTokenizerFactory.java    License: Apache License 2.0
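// Chooses between forward and reverse path hierarchy tokenization based on the factory's reverse setting.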
@Override
public Tokenizer create(AttributeFactory factory) {
  if (reverse) {
    return new ReversePathHierarchyTokenizer(factory, delimiter, replacement, skip);
  }
  return new PathHierarchyTokenizer(factory, delimiter, replacement, skip);
}
 
Example 21
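// The unknown surface "걀꿀" is emitted as an UNKNOWN token; the remaining words are analyzed eojeols.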
@Test
public void testUnknownSurface() throws Exception {
  Tokenizer tokenizer = createTokenizer(
      new StringReader("걀꿀 없는 단어"),
      TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
  assertEquals(
      "걀꿀:UNKNOWN:UNKNOWN:null:1:1:0:2,없는:EOJEOL:VA+ETM:null:1:1:3:5,"
      + "단어:N:NNG:null:1:1:6:8,",
      tokenizerToString(tokenizer));
  tokenizer.close();
}
 
Example 22
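// Wraps IcuTokenizer in a single-component analyzer using the default ICU tokenizer config.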
private static Analyzer createAnalyzer() {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                    new DefaultIcuTokenizerConfig(false, false));
            return new TokenStreamComponents(tokenizer);
        }
    };
}
 
Example 23
Source Project: lucene-solr   Source File: TestNorwegianLightStemFilter.java    License: Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("sekretæren"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "sekretæren", "sekretæren");
  a.close();
}
 
Example 24
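// The hyphenated identifier "3-428-84350-9" must be kept as a single token.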
public void testIdentifierNonBreak() throws Exception {
    String source = "ISBN 3-428-84350-9";
    String[] expected = {"ISBN", "3-428-84350-9"};
    String resource = "icu_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenizer, expected);
}
 
Example 25
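// UAX29URLEmailTokenizer segments Arabic text, keeping embedded Latin words and digits as tokens.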
public void testArabic() throws Exception {
  Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.");
  Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
  stream.setReader(reader);
  assertTokenStreamContents(stream, 
      new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
      "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008"  });
}
 
Example 26
Source Project: lucene-solr   Source File: TestHungarianLightStemFilter.java    License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new HungarianLightStemFilter(source));
    }
  };
}
 
Example 27
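// Creates a MeCabKoTokenizer with a standard POS appender and compound-noun decomposition disabled.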
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  return new MeCabKoTokenizer(
      factory,
      input,
      mecabDicDir,
      new StandardPosAppender(),
      TokenGenerator.NO_DECOMPOUND);
}
 
Example 28
Source Project: lucene-solr   Source File: TestFrenchMinimalStemFilter.java    License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new FrenchMinimalStemFilter(source));
    }
  };
}
 
Example 29
Source Project: lucene-solr   Source File: TestICUNormalizer2Filter.java    License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
    }
  };
}
 
Example 30
Source Project: lucene-solr   Source File: TestHyphenatedWordsFilter.java    License: Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new HyphenatedWordsFilter(tokenizer));
    }
  };
  
  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}