org.apache.lucene.analysis.Tokenizer Java Examples

The following examples show how to use org.apache.lucene.analysis.Tokenizer. Each example is taken from an open-source project; the source file, originating project, and license are noted above the code.
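Most of the examples share the same consume cycle: attach input with setReader(), call reset(), pull tokens with incrementToken() while reading attributes, then call end() and close(). Below is a minimal sketch of that lifecycle; it assumes Lucene's core analysis module is on the classpath and uses WhitespaceTokenizer as one arbitrary concrete Tokenizer (any subclass follows the same contract).

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerLifecycle {
  public static void main(String[] args) throws Exception {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("hello token stream"));
    // attributes must be requested before consuming the stream
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();                    // mandatory before the first incrementToken()
    while (tokenizer.incrementToken()) {  // returns false once the input is exhausted
      System.out.println(term.toString());
    }
    tokenizer.end();                      // records end-of-stream state (final offset)
    tokenizer.close();                    // releases the underlying Reader
  }
}

MockTokenizer, used throughout the Lucene tests below, enforces exactly this reset/incrementToken/end/close contract and fails the test if a stage is skipped.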
Example #1
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformTraditionalSimplified() throws Exception {
    String source = "簡化字";
    String[] expected =  new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_ch");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #2
Source File: TestDelimitedPayloadTokenFilterFactory.java    From lucene-solr with Apache License 2.0
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
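  // every input token carries a "*<float>" suffix; the filter strips it and stores the float as the token's payload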
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example #3
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Random random = random();
  final int numIters = atLeast(10);
  for (int i = 0; i < numIters; i++) {
    b = new SynonymMap.Builder(random.nextBoolean());
    final int numEntries = atLeast(10);
    for (int j = 0; j < numEntries; j++) {
      add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
    }
    final SynonymMap map = b.build();
    final boolean ignoreCase = random.nextBoolean();
    
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
      }
    };

    checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
    analyzer.close();
  }
}
 
Example #4
Source File: TestPackedTokenAttributeImpl.java    From lucene-solr with Apache License 2.0
public void testPackedTokenAttributeFactory() throws Exception {
  TokenStream ts = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  ((Tokenizer)ts).setReader(new StringReader("foo bar"));
  
  assertTrue("CharTermAttribute is not implemented by Token",
    ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("OffsetAttribute is not implemented by Token",
    ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("PositionIncrementAttribute is not implemented by Token", 
    ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("TypeAttribute is not implemented by Token",
    ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);

  assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
      ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);  
}
 
Example #5
Source File: TestSuggestStopFilter.java    From lucene-solr with Apache License 2.0
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
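  // the trailing stopword "to" is kept because the user may still be typing a longer word;
  // the boolean[] below checks the KeywordAttribute set on the kept token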
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}
 
Example #6
Source File: TestJapaneseTokenizer.java    From lucene-solr with Apache License 2.0
public void testEmptyUserDict() throws Exception {
  Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
  UserDictionary emptyDict = UserDictionary.open(emptyReader);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  assertAnalyzesTo(analyzer, "これは本ではない",
      new String[]{"これ", "は", "本", "で", "は", "ない"},
      new int[]{0, 2, 3, 4, 5, 6},
      new int[]{2, 3, 4, 5, 6, 8}
  );
  analyzer.close();
}
 
Example #7
Source File: TestSoraniStemFilter.java    From lucene-solr with Apache License 2.0
/** test against a basic vocabulary file */
public void testVocabulary() throws Exception {
  // top 8k words or so: freq > 1000
  
  // just normalization+stem, we are testing that the stemming doesn't break.
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      TokenStream stream = new SoraniNormalizationFilter(tokenizer);
      stream = new SoraniStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  assertVocabulary(a, getDataPath("ckbtestdata.zip"), "testdata.txt");
  a.close();
}
 
Example #8
Source File: CharGroupTokenizerFactory.java    From crate with Apache License 2.0
@Override
public Tokenizer create() {
    return new CharTokenizer() {
        @Override
        protected boolean isTokenChar(int c) {
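            // returning false marks c as a delimiter to split on; returning true keeps c inside the current token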
            if (tokenizeOnSpace && Character.isWhitespace(c)) {
                return false;
            }
            if (tokenizeOnLetter && Character.isLetter(c)) {
                return false;
            }
            if (tokenizeOnDigit && Character.isDigit(c)) {
                return false;
            }
            if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
                return false;
            }
            if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
                return false;
            }
            return !tokenizeOnChars.contains(c);
        }
    };
}
 
Example #9
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0
public void testRecursion4() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = true;
  add("zoo zoo", "zoo", keepOrig);
  add("zoo", "zoo zoo", keepOrig);
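  // the two rules reference each other; the test verifies expansion terminates instead of recursing forever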
  final SynonymMap map = b.build();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
    }
  };
  
  assertAnalyzesTo(a, "zoo zoo $ zoo",
      new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
      new int[] { 1, 0, 1, 1, 1, 0, 1 });
  a.close();
}
 
Example #10
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0
private ArrayList<char[]> tokenize( String input ) throws IOException {
      
  Log.debug( "tokenize '" + input + "'" );
  ArrayList<char[]> tokens = new ArrayList<char[]>( );
  Tokenizer tk = getTokenizerImpl( input );
  
  CharTermAttribute term = tk.addAttribute( CharTermAttribute.class );
  tk.reset( );
  while (tk.incrementToken( ) ) {
    int bufLen = term.length();
    char[] copy = new char[ bufLen ];
    System.arraycopy(term.buffer( ), 0, copy, 0, bufLen );
    tokens.add( copy );
  }
  tk.end();   // finish the stream once all tokens are consumed
  tk.close(); // release the tokenizer and its resources

  return tokens;
}
 
Example #11
Source File: TestBeiderMorseFilter.java    From lucene-solr with Apache License 2.0
public void testCustomAttribute() throws IOException {
  TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
  ((Tokenizer)stream).setReader(new StringReader("D'Angelo"));
  stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  int i = 0;
  while(stream.incrementToken()) {
    assertTrue(keyAtt.isKeyword());
    i++;
  }
  assertEquals(12, i);
  stream.end();
  stream.close();
}
 
Example #12
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0
public void testMassiveLigature() throws IOException {
  String input = "\uFDFA";
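  // U+FDFA (ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM) expands under NFKC case folding into four words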

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"صلى", "الله", "عليه", "وسلم"},
    new int[]{0, 0, 0, 0},
    new int[]{0, 0, 0, 1},
    input.length()
  );
}
 
Example #13
Source File: TestWordDelimiterFilter.java    From lucene-solr with Apache License 2.0
/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
  final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;    

  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };
  
  assertAnalyzesTo(a, "abc-def-123-456", 
      new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" }, 
      new int[] { 0, 0, 0, 0, 4, 8, 8, 12 }, 
      new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
      null,
      new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
      null,
      false);
  a.close();
}
 
Example #14
Source File: EdgeNGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  for (int i = 0; i < 10; i++) {
    final int min = TestUtil.nextInt(random(), 2, 10);
    final int max = TestUtil.nextInt(random(), min, 20);
    final boolean preserveOriginal = TestUtil.nextInt(random(), 0, 1) % 2 == 0;
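    // preserveOriginal additionally emits the unmodified term when its length falls outside [min, max]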
  
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, 
          new EdgeNGramTokenFilter(tokenizer, min, max, preserveOriginal));
      }    
    };
    checkRandomData(random(), a, 10*RANDOM_MULTIPLIER);
    a.close();
  }
}
 
Example #15
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("ab", "cd", "ef");

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdef"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    tokenizer,
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  assertTokenStreamContents(tf,
    new String[] { "abcdef", "ab", "cd", "ef" },
    new int[] { 0, 0, 0, 0},
    new int[] { 6, 6, 6, 6},
    new int[] { 1, 0, 0, 0}
    );
}
 
Example #16
Source File: HanLPTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0
public static HanLPTokenizerFactory createSpeed(IndexSettings indexSettings,
                                                Environment environment,
                                                String name,
                                                Settings settings) {
    return new HanLPTokenizerFactory(indexSettings, environment, name, settings) {
        @Override
        public Tokenizer create() {
            return new HanLPTokenizer(SpeedTokenizer.SEGMENT, defaultStopWordDictionary, enablePorterStemming);
        }
    };
}
 
Example #17
Source File: TestGermanStemFilter.java    From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new GermanStemFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
 
Example #18
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, false, JapaneseTokenizer.Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(tokenizer));
    }
  };
}
 
Example #19
Source File: TestICUTokenizerFactory.java    From lucene-solr with Apache License 2.0
public void testMixedText() throws Exception {
  Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี  This is a test ກວ່າດອກ");
  ICUTokenizerFactory factory = new ICUTokenizerFactory(new HashMap<String,String>());
  factory.inform(new ClasspathResourceLoader(getClass()));
  Tokenizer stream = factory.create(newAttributeFactory());
  stream.setReader(reader);
  assertTokenStreamContents(stream,
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
      "This", "is", "a", "test", "ກວ່າ", "ດອກ"});
}
 
Example #20
Source File: HanLPTokenizerFactory.java    From elasticsearch-analysis-hanlp with Apache License 2.0
public static HanLPTokenizerFactory createCRF(IndexSettings indexSettings,
                                              Environment environment,
                                              String name,
                                              Settings settings) {
    return new HanLPTokenizerFactory(indexSettings, environment, name, settings) {
        @Override
        public Tokenizer create() {
            Segment seg = new CRFSegment().enablePartOfSpeechTagging(true);
            return new HanLPTokenizer(seg, defaultStopWordDictionary, enablePorterStemming);
        }
    };
}
 
Example #21
Source File: TestDaitchMokotoffSoundexFilter.java    From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, random().nextBoolean()));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
 
Example #22
Source File: TestCapitalizationFilter.java    From lucene-solr with Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
    }
  };
  
  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}
 
Example #23
Source File: TestGermanMinimalStemFilterFactory.java    From lucene-solr with Apache License 2.0
public void testStemming() throws Exception {
  Reader reader = new StringReader("bilder");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("GermanMinimalStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "bild" });
}
 
Example #24
Source File: TestHindiStemmer.java    From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new HindiStemFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
 
Example #25
Source File: TestCJKAnalyzer.java    From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
 
Example #26
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testEndLookahead() throws Exception {
  Tokenizer t = new SimplePatternTokenizer("(ab)+");
  t.setReader(new StringReader("aba"));
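  // the trailing "a" could begin another "ab" match that never completes, so only one token is emitted and the final offset is 3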
  assertTokenStreamContents(t,
      new String[] { "ab" },
      new int[] { 0 },
      new int[] { 2 },
      3);
}
 
Example #27
Source File: CustomAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
  TokenStream ts = tk;
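  // wrap the tokenizer with each configured filter, building the analysis chain in order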
  for (final TokenFilterFactory filter : tokenFilters) {
    ts = filter.create(ts);
  }
  return new TokenStreamComponents(tk, ts);
}
 
Example #28
Source File: TestFrenchLightStemFilter.java    From lucene-solr with Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "chevaux", "chevaux");
  a.close();
}
 
Example #29
Source File: TestICUTokenizer.java    From lucene-solr with Apache License 2.0
/** test for bugs like http://bugs.icu-project.org/trac/ticket/10767 */
public void testICUConcurrency() throws Exception {
  int numThreads = 8;
  final CountDownLatch startingGun = new CountDownLatch(1);
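  // all worker threads block on this latch so tokenization starts simultaneously in every thread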
  Thread threads[] = new Thread[numThreads];
  for (int i = 0; i < threads.length; i++) {
    threads[i] = new Thread() {
      @Override
      public void run() {
        try {
          startingGun.await();
          long tokenCount = 0;
          final String contents = "英 เบียร์ ビール ເບຍ abc";
          for (int i = 0; i < 1000; i++) {
            try (Tokenizer tokenizer = new ICUTokenizer()) {
              tokenizer.setReader(new StringReader(contents));
              tokenizer.reset();
              while (tokenizer.incrementToken()) {
                tokenCount++;
              }
              tokenizer.end();
            }
          }
          if (VERBOSE) {
            System.out.println(tokenCount);
          }
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      } 
    };
    threads[i].start();
  }
  startingGun.countDown();
  for (int i = 0; i < threads.length; i++) {
    threads[i].join();
  }
}
 
Example #30
Source File: TestSerbianNormalizationFilter.java    From lucene-solr with Apache License 2.0
public void testEmptyTerm() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new SerbianNormalizationFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}