org.apache.lucene.analysis.TokenStream Java Examples

The following examples show how to use org.apache.lucene.analysis.TokenStream. They are drawn from open source projects; the original project and source file for each example are noted in the line above it.
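All of the examples follow the same consumer contract: register the attributes you want to read, call reset() once, loop on incrementToken() until it returns false, then call end() and close(). As a quick orientation, here is a minimal, self-contained sketch of that workflow; StandardAnalyzer and the field name "field" are illustrative choices, not requirements:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamBasics {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("field", "Hello TokenStream world")) {
            // 1. Register the attributes you want to read, before consuming the stream.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            // 2. reset() is mandatory before the first incrementToken() call.
            ts.reset();
            // 3. Each incrementToken() advances the attributes to the next token.
            while (ts.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + ", " + offset.endOffset() + "]");
            }
            // 4. end() finalizes offset state; close() runs via try-with-resources.
            ts.end();
        }
    }
}

Most of the examples below are variations on this loop, differing mainly in which analyzer or tokenizer builds the stream and which attributes are inspected.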
Example #1
Source File: TestProtectedTermFilter.java    From lucene-solr with Apache License 2.0
public void testBasic() throws IOException {

    CannedTokenStream cts = new CannedTokenStream(
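        // each Token is (text, positionIncrement, startOffset, endOffset)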
        new Token("Alice", 1, 0, 5),
        new Token("Bob", 1, 6, 9),
        new Token("Clara", 1, 10, 15),
        new Token("David", 1, 16, 21)
    );

    CharArraySet protectedTerms = new CharArraySet(5, true);
    protectedTerms.add("bob");

    TokenStream ts = new ProtectedTermFilter(protectedTerms, cts, LowerCaseFilter::new);
    assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });

  }
 
Example #2
Source File: HighlighterTest.java    From lucene-solr with Apache License 2.0
public void testHighlightingSynonymQuery() throws Exception {
  searcher = newSearcher(reader);
  Query query = new SynonymQuery.Builder(FIELD_NAME)
      .addTerm(new Term(FIELD_NAME, "jfk"))
      .addTerm(new Term(FIELD_NAME, "kennedy"))
      .build();
  QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
  Highlighter highlighter = new Highlighter(scorer);
  TokenStream stream = getAnyTokenStream(FIELD_NAME, 2);
  Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
  highlighter.setTextFragmenter(fragmenter);
  String storedField = searcher.doc(2).get(FIELD_NAME);
  String fragment = highlighter.getBestFragment(stream, storedField);
  assertEquals("<B>JFK</B> has been shot", fragment);

  stream = getAnyTokenStream(FIELD_NAME, 3);
  storedField = searcher.doc(3).get(FIELD_NAME);
  fragment = highlighter.getBestFragment(stream, storedField);
  assertEquals("John <B>Kennedy</B> has been shot", fragment);
}
 
Example #3
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichStrExpand2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
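  // expected tokens, one per '/', apparently formatted as term,startOffset,endOffset,posInc; a posInc of 0 stacks a synonym on the previous token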
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/a,2,3,1/aa,2,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bb,1,3,0/b,2,3,0/a,3,4,1/aa,3,4,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1/d,3,4,0/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1/de,3,5,1/e,4,5,0/a,5,6,1/aa,5,6,0");
}
 
Example #4
Source File: NGramSynonymTokenizerTest.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testPrevStrSingleSynonym3() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ba"));
  stream.reset();
  assertTokenStream(stream, "b,0,1,1/a,1,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("bba"));
  stream.reset();
  assertTokenStream(stream, "bb,0,2,1/a,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("dcba"));
  stream.reset();
  assertTokenStream(stream, "dcb,0,3,1/a,3,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("edcba"));
  stream.reset();
  assertTokenStream(stream, "edc,0,3,1/dcb,1,4,1/a,4,5,1");
}
 
Example #5
Source File: TestSuggestStopFilter.java    From lucene-solr with Apache License 2.0
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},   // terms
                            new int[] {0, 3},            // start offsets
                            new int[] {2, 5},            // end offsets
                            null,                        // types (not checked)
                            new int[] {1, 1},            // position increments
                            null,                        // position lengths (not checked)
                            5,                           // final offset
                            new boolean[] {false, true}, // keyword flags: the trailing stopword "to" is kept and marked
                            true);                       // verify offset consistency
}
 
Example #6
Source File: TestPackedTokenAttributeImpl.java    From lucene-solr with Apache License 2.0
public void testPackedTokenAttributeFactory() throws Exception {
  TokenStream ts = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  ((Tokenizer)ts).setReader(new StringReader("foo bar"));
  
  assertTrue("CharTermAttribute is not implemented by Token",
    ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("OffsetAttribute is not implemented by Token",
    ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("PositionIncrementAttribute is not implemented by Token", 
    ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("TypeAttribute is not implemented by Token",
    ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);

  assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
      ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);  
}
 
Example #7
Source File: AnalyzersTest.java    From russianmorphology with Apache License 2.0
@Test
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
    Analyzer morphologyAnalyzer = new RussianAnalyzer();
    // Use an explicit charset; the platform default could mangle the Cyrillic input.
    InputStreamReader reader = new InputStreamReader(
            new ByteArrayInputStream("принеси мне вина на новый год".getBytes("UTF-8")), "UTF-8");

    TokenStream tokenStream = morphologyAnalyzer.tokenStream(null, reader);
    tokenStream.reset();
    Set<String> formsOfWine = new HashSet<String>();
    formsOfWine.add("вина");
    formsOfWine.add("вино");
    boolean wordSeen = false;
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // Morphological variants of the same word should be stacked at the same position.
        if (formsOfWine.contains(charTerm.toString()) && wordSeen) {
            assertThat(position.getPositionIncrement(), equalTo(0));
        }
        if (formsOfWine.contains(charTerm.toString())) {
            wordSeen = true;
        }
    }
}
 
Example #8
Source File: TestFlattenGraphFilter.java    From lucene-solr with Apache License 2.0
public void testStrangelyNumberedNodes() throws Exception {

    // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
    TokenStream in = new CannedTokenStream(0, 27, new Token[] {
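        // token(term, posInc, posLength, startOffset, endOffset) is a local test helper;
        // CannedTokenStream(finalPosInc, finalOffset, tokens) replays the fixed tokens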
        token("dog", 1, 3, 0, 5),
        token("puppy", 0, 3, 0, 5),
        token("flies", 3, 1, 6, 11),
      });

    TokenStream out = new FlattenGraphFilter(in);

    assertTokenStreamContents(out,
                              new String[] {"dog", "puppy", "flies"}, // terms
                              new int[] {0, 0, 6},                    // start offsets
                              new int[] {5, 5, 11},                   // end offsets
                              new int[] {1, 0, 1},                    // position increments
                              new int[] {1, 1, 1},                    // position lengths
                              27);                                    // final offset
  }
 
Example #9
Source File: TestDelimitedPayloadTokenFilterFactory.java    From lucene-solr with Apache License 2.0
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example #10
Source File: ShingleAnalyzerWrapperTest.java    From lucene-solr with Apache License 2.0
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
    int j = -1;
  
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  
    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      builder.add(new Term("content", termText), j);
    }
    ts.end();
  }

  PhraseQuery q = builder.build();
  ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
  int[] ranks = new int[] { 0 };
  compareRanks(hits, ranks);
}
 
Example #11
Source File: TestKeepWordFilter.java    From lucene-solr with Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Set<String> words = new HashSet<>();
  words.add( "a" );
  words.add( "b" );
  
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet(words, true));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  
  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}
 
Example #12
Source File: AnalysisImpl.java    From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 * @param result list that receives a simplified {@code Token} view of each token
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
  final List<AttributeSource> tokens = new ArrayList<>();
  try {
    tokenStream.reset();
    CharTermAttribute charAtt = tokenStream.getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      tokens.add(tokenStream.cloneAttributes());
      List<TokenAttribute> attributes = copyAttributes(tokenStream, charAtt);
      result.add(new Token(charAtt.toString(), attributes));
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
 
Example #13
Source File: DefaultQueryBuilder.java    From modernmt with Apache License 2.0
private static void loadTerms(String fieldName, Sentence sentence, Analyzer analyzer, BooleanQuery output) {
    final int maxClauseCount = BooleanQuery.getMaxClauseCount();
    String text = TokensOutputStream.serialize(sentence, false, true);

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(fieldName, text);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        int count = 0;

        stream.reset();
        while (stream.incrementToken() && (count + 1) < maxClauseCount) {
            Term term = new Term(fieldName, charTermAttribute.toString());
            output.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            count++;
        }
    } catch (IOException e) {
        throw new Error("This should never happen", e);
    } finally {
        closeQuietly(stream);
    }
}
 
Example #14
Source File: TestNGramFilters.java    From lucene-solr with Apache License 2.0
/**
 * Test NGramFilterFactory on tokens with payloads
 */
public void testNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
 
Example #15
Source File: TestFlattenGraphFilter.java    From lucene-solr with Apache License 2.0
public void testHoleUnderSyn() throws Exception {
  // Tests a StopFilter after SynFilter where a stopword in a syn is removed
  //
  //   wizard of oz -> woz syn, but then "of" becomes a hole

  TokenStream in = new CannedTokenStream(0, 12, new Token[] {
      token("wizard", 1, 1, 0, 6),
      token("woz", 0, 3, 0, 12),
      token("oz", 2, 1, 10, 12),
    });


  TokenStream out = new FlattenGraphFilter(in);

  assertTokenStreamContents(out,
                            new String[] {"wizard", "woz", "oz"},
                            new int[] {0, 0, 10},
                            new int[] {6, 12, 12},
                            new int[] {1, 0, 2},
                            new int[] {1, 3, 1},
                            12);
}
 
Example #16
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("Tokenization failed: " + e.getMessage());
    }
}
 
Example #17
Source File: Tagger.java    From SolrTextTagger with Apache License 2.0
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
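  // TokenStream contract: attributes are registered first, and the stream must be
  // reset before the first incrementToken() call.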
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}
 
Example #18
Source File: TokenSources.java    From lucene-solr with Apache License 2.0
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(Document doc, String field,
    Analyzer analyzer) {
  String contents = doc.get(field);
  if (contents == null) {
    throw new IllegalArgumentException("Field " + field
        + " in document is not stored and cannot be analyzed");
  }
  return getTokenStream(field, contents, analyzer);
}
 
Example #19
Source File: SmartcnUDF.java    From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
        throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}
 
Example #20
Source File: LowerCaseTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    if (lang == null) {
        return new LowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("greek")) {
        return new GreekLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("irish")) {
        return new IrishLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("turkish")) {
        return new TurkishLowerCaseFilter(tokenStream);
    } else {
        throw new IllegalArgumentException("language [" + lang + "] is not supported for lower case");
    }
}
 
Example #21
Source File: TestCapitalizationFilterFactory.java    From lucene-solr with Apache License 2.0
public void testCapitalization5() throws Exception {
  Reader reader = new StringReader("big");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("Capitalization",
      "keep", "and the it BIG",
      "onlyFirstWord", "true",
      "forceFirstLetter", "true").create(stream);
  assertTokenStreamContents(stream, new String[] { "Big" });
}
 
Example #22
Source File: StandardnumberAnalyzer.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = tokenizerFactory.create();
    TokenStream tokenStream = tokenizer;
    for (TokenFilterFactory tokenFilter : Collections.singletonList(stdnumTokenFilterFactory)) {
        tokenStream = tokenFilter.create(tokenStream);
    }
    return new TokenStreamComponents(tokenizer, tokenStream);
}
 
Example #23
Source File: TestICUTokenizer.java    From lucene-solr with Apache License 2.0
public void testTokenAttributes() throws Exception {
  try (TokenStream ts = a.tokenStream("dummy", "This is a test")) {
    ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      assertEquals(UScript.LATIN, scriptAtt.getCode());
      assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
      assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
      assertTrue(ts.reflectAsString(false).contains("script=Latin"));
    }
    ts.end();
  }
}
 
Example #24
Source File: XMoreLikeThis.java    From Elasticsearch with Apache License 2.0
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without " +
                "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            if (isSkipTerm(fieldName, word)) {
                continue;
            }

            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    }
}
 
Example #25
Source File: QueryFactory.java    From airsonic-advanced with GNU General Public License v3.0
/**
 * Query generation expression extracted from
 * {@link org.airsonic.player.service.SearchService#getRandomSongs(RandomSearchCriteria)}.
 */
public Query getRandomSongs(RandomSearchCriteria criteria) throws IOException {

    BooleanQuery.Builder query = new BooleanQuery.Builder();

    Analyzer analyzer = analyzerFactory.getQueryAnalyzer();

    // Unanalyzed field
    query.add(new TermQuery(new Term(FieldNames.MEDIA_TYPE, MediaType.MUSIC.name())), Occur.MUST);

    if (!isEmpty(criteria.getGenre())) {

        // Unanalyzed field, but performs filtering according to id3 tag parser.
        try (TokenStream stream = analyzer.tokenStream(FieldNames.GENRE, criteria.getGenre())) {
            stream.reset();
            if (stream.incrementToken()) {
                String token = stream.getAttribute(CharTermAttribute.class).toString();
                query.add(new TermQuery(new Term(FieldNames.GENRE, token)), Occur.MUST);
            }
        }
    }

    if (!(isEmpty(criteria.getFromYear()) && isEmpty(criteria.getToYear()))) {
        query.add(toYearRangeQuery.apply(criteria.getFromYear(), criteria.getToYear()), Occur.MUST);
    }

    query.add(toFolderQuery.apply(false, criteria.getMusicFolders()), Occur.MUST);

    return query.build();
}
 
Example #26
Source File: EnglishBaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        assertTrue(i < expected.length);
        assertEquals(expected[i++], termAttr.toString());
    }
    assertEquals(expected.length, i);
    stream.close();
}
 
Example #27
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testSeparatorWithStopWords() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);

  assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] { 1 });
}
 
Example #28
Source File: TestKoreanTokenizer.java    From lucene-solr with Apache License 2.0
private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
    ts.reset();
    for(String reading : readings) {
      assertTrue(ts.incrementToken());
      assertEquals(reading, readingAtt.getReading());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }
}
 
Example #29
Source File: TestPatternReplaceFilter.java    From lucene-solr with Apache License 2.0
public void testReplaceFirst() throws Exception {
  String input = "aabfooaabfooabfoob ab caaaaaaaaab";
  TokenStream ts = new PatternReplaceFilter(
          whitespaceMockTokenizer(input),
          Pattern.compile("a*b"),
          "-", false);
  assertTokenStreamContents(ts, 
      new String[] { "-fooaabfooabfoob", "-", "c-" });
}
 
Example #30
Source File: PatternAnalyzerImpl.java    From database with GNU General Public License v2.0
@Override
protected TokenStreamComponents createComponents(final String field) {
	// Use default grouping
	final Tokenizer tokenizer = new PatternTokenizer(pattern, -1);
	final TokenStream filter = new LowerCaseFilter(tokenizer);
	return new TokenStreamComponents(tokenizer, filter);
}