org.apache.lucene.analysis.TokenStream Java Examples
The following examples show how to use org.apache.lucene.analysis.TokenStream.
Each example notes the project it was taken from and the license of the original source file.
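Before the examples, here is a minimal sketch of the consumption contract that nearly every snippet below follows: obtain a TokenStream (typically from an Analyzer), register the attributes you want to read, call reset(), loop over incrementToken(), then finish with end() and close(). This sketch is not taken from any of the projects below; the StandardAnalyzer, the field name "body", and the sample text are assumptions chosen purely for illustration.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamBasics {
  public static void main(String[] args) throws IOException {
    // Illustrative analyzer, field name and text; any Analyzer is consumed the same way.
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "The quick brown fox")) {
      // Attributes are obtained before reset(); incrementToken() fills them in per token.
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();                       // mandatory before the first incrementToken()
      while (ts.incrementToken()) {     // returns false once the stream is exhausted
        System.out.println(termAtt + " [" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "]");
      }
      ts.end();                         // records the final offset state
    }                                   // try-with-resources closes the stream
  }
}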
Example #1
Source File: TestProtectedTermFilter.java From lucene-solr with Apache License 2.0
public void testBasic() throws IOException {
  CannedTokenStream cts = new CannedTokenStream(
      new Token("Alice", 1, 0, 5), new Token("Bob", 1, 6, 9),
      new Token("Clara", 1, 10, 15), new Token("David", 1, 16, 21)
  );
  CharArraySet protectedTerms = new CharArraySet(5, true);
  protectedTerms.add("bob");
  TokenStream ts = new ProtectedTermFilter(protectedTerms, cts, LowerCaseFilter::new);
  assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });
}
Example #2
Source File: HighlighterTest.java From lucene-solr with Apache License 2.0
public void testHighlightingSynonymQuery() throws Exception {
  searcher = newSearcher(reader);
  Query query = new SynonymQuery.Builder(FIELD_NAME)
      .addTerm(new Term(FIELD_NAME, "jfk"))
      .addTerm(new Term(FIELD_NAME, "kennedy"))
      .build();
  QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
  Highlighter highlighter = new Highlighter(scorer);
  TokenStream stream = getAnyTokenStream(FIELD_NAME, 2);
  Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
  highlighter.setTextFragmenter(fragmenter);
  String storedField = searcher.doc(2).get(FIELD_NAME);
  String fragment = highlighter.getBestFragment(stream, storedField);
  assertEquals("<B>JFK</B> has been shot", fragment);

  stream = getAnyTokenStream(FIELD_NAME, 3);
  storedField = searcher.doc(3).get(FIELD_NAME);
  fragment = highlighter.getBestFragment(stream, storedField);
  assertEquals("John <B>Kennedy</B> has been shot", fragment);
}
Example #3
Source File: NGramSynonymTokenizerTest.java From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testSandwichStrExpand2() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("aba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/a,2,3,1/aa,2,3,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abba"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bb,1,3,0/b,2,3,0/a,3,4,1/aa,3,4,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcda"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1/d,3,4,0/a,4,5,1/aa,4,5,0");

  a = new NGramSynonymTokenizerTestAnalyzer(2, true, "a,aa");
  stream = a.tokenStream("f", new StringReader("abcdea"));
  stream.reset();
  assertTokenStream(stream, "a,0,1,1/aa,0,1,0/b,1,2,1/bc,1,3,0/cd,2,4,1/de,3,5,1/e,4,5,0/a,5,6,1/aa,5,6,0");
}
Example #4
Source File: NGramSynonymTokenizerTest.java From elasticsearch-analysis-synonym with Apache License 2.0
@Test
public void testPrevStrSingleSynonym3() throws Exception {
  Analyzer a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  TokenStream stream = a.tokenStream("f", new StringReader("ba"));
  stream.reset();
  assertTokenStream(stream, "b,0,1,1/a,1,2,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("bba"));
  stream.reset();
  assertTokenStream(stream, "bb,0,2,1/a,2,3,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("dcba"));
  stream.reset();
  assertTokenStream(stream, "dcb,0,3,1/a,3,4,1");

  a = new NGramSynonymTokenizerTestAnalyzer(3, false, "a,aa");
  stream = a.tokenStream("f", new StringReader("edcba"));
  stream.reset();
  assertTokenStream(stream, "edc,0,3,1/dcb,1,4,1/a,4,5,1");
}
Example #5
Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
      new String[] {"go", "to"},
      new int[] {0, 3},
      new int[] {2, 5},
      null,
      new int[] {1, 1},
      null,
      5,
      new boolean[] {false, true},
      true);
}
Example #6
Source File: TestPackedTokenAttributeImpl.java From lucene-solr with Apache License 2.0
public void testPackedTokenAttributeFactory() throws Exception {
  TokenStream ts = new MockTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
      MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  ((Tokenizer) ts).setReader(new StringReader("foo bar"));
  assertTrue("CharTermAttribute is not implemented by Token",
      ts.addAttribute(CharTermAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("OffsetAttribute is not implemented by Token",
      ts.addAttribute(OffsetAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("PositionIncrementAttribute is not implemented by Token",
      ts.addAttribute(PositionIncrementAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("TypeAttribute is not implemented by Token",
      ts.addAttribute(TypeAttribute.class) instanceof PackedTokenAttributeImpl);
  assertTrue("FlagsAttribute is not implemented by FlagsAttributeImpl",
      ts.addAttribute(FlagsAttribute.class) instanceof FlagsAttributeImpl);
}
Example #7
Source File: AnalyzersTest.java From russianmorphology with Apache License 2.0
@Test
public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
  Analyzer morphlogyAnalyzer = new RussianAnalyzer();
  // "принеси мне вина на новый год" = "bring me wine for the New Year"
  InputStreamReader reader = new InputStreamReader(
      new ByteArrayInputStream("принеси мне вина на новый год".getBytes()), "UTF-8");
  TokenStream tokenStream = morphlogyAnalyzer.tokenStream(null, reader);
  tokenStream.reset();
  // both entries are forms of the word "wine"
  Set<String> foromsOfWine = new HashSet<String>();
  foromsOfWine.add("вина");
  foromsOfWine.add("винo");
  boolean wordSeen = false;
  while (tokenStream.incrementToken()) {
    CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute position = tokenStream.getAttribute(PositionIncrementAttribute.class);
    if (foromsOfWine.contains(charTerm.toString()) && wordSeen) {
      assertThat(position.getPositionIncrement(), equalTo(0));
    }
    if (foromsOfWine.contains(charTerm.toString())) {
      wordSeen = true;
    }
  }
}
Example #8
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testStrangelyNumberedNodes() throws Exception {
  // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
  TokenStream in = new CannedTokenStream(0, 27, new Token[] {
      token("dog", 1, 3, 0, 5),
      token("puppy", 0, 3, 0, 5),
      token("flies", 3, 1, 6, 11),
    });

  TokenStream out = new FlattenGraphFilter(in);

  assertTokenStreamContents(out,
      new String[] {"dog", "puppy", "flies"},
      new int[] {0, 0, 6},
      new int[] {5, 5, 11},
      new int[] {1, 0, 1},
      new int[] {1, 1, 1},
      27);
}
Example #9
Source File: TestDelimitedPayloadTokenFilterFactory.java From lucene-solr with Apache License 2.0
public void testDelim() throws Exception {
  Reader reader = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer) stream).setReader(reader);
  stream = tokenFilterFactory("DelimitedPayload",
      "encoder", "float",
      "delimiter", "*").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    byte[] payData = payAttr.getPayload().bytes;
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
Example #10
Source File: ShingleAnalyzerWrapperTest.java From lucene-solr with Apache License 2.0
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
    int j = -1;

    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      builder.add(new Term("content", termText), j);
    }
    ts.end();
  }

  PhraseQuery q = builder.build();
  ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
  int[] ranks = new int[] { 0 };
  compareRanks(hits, ranks);
}
Example #11
Source File: TestKeepWordFilter.java From lucene-solr with Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Set<String> words = new HashSet<>();
  words.add("a");
  words.add("b");

  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet(words, true));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}
Example #12
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
  final List<AttributeSource> tokens = new ArrayList<>();
  try {
    tokenStream.reset();
    CharTermAttribute charAtt = tokenStream.getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      tokens.add(tokenStream.cloneAttributes());
      List<TokenAttribute> attributes = copyAttributes(tokenStream, charAtt);
      result.add(new Token(charAtt.toString(), attributes));
    }
    tokenStream.end();
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
Example #13
Source File: DefaultQueryBuilder.java From modernmt with Apache License 2.0
private static void loadTerms(String fieldName, Sentence sentence, Analyzer analyzer, BooleanQuery output) {
  final int maxClauseCount = BooleanQuery.getMaxClauseCount();

  String text = TokensOutputStream.serialize(sentence, false, true);

  TokenStream stream = null;
  try {
    stream = analyzer.tokenStream(fieldName, text);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    int count = 0;
    stream.reset();
    while (stream.incrementToken() && (count + 1) < maxClauseCount) {
      Term term = new Term(fieldName, charTermAttribute.toString());
      output.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
      count++;
    }
  } catch (IOException e) {
    throw new Error("This should never happen", e);
  } finally {
    closeQuietly(stream);
  }
}
Example #14
Source File: TestNGramFilters.java From lucene-solr with Apache License 2.0
/**
 * Test NGramFilterFactory on tokens with payloads
 */
public void testNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);

  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
Example #15
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testHoleUnderSyn() throws Exception {
  // Tests a StopFilter after SynFilter where a stopword in a syn is removed
  //
  //   wizard of oz -> woz syn, but then "of" becomes a hole

  TokenStream in = new CannedTokenStream(0, 12, new Token[] {
      token("wizard", 1, 1, 0, 6),
      token("woz", 0, 3, 0, 12),
      token("oz", 2, 1, 10, 12),
    });

  TokenStream out = new FlattenGraphFilter(in);

  assertTokenStreamContents(out,
      new String[] {"wizard", "woz", "oz"},
      new int[] {0, 0, 10},
      new int[] {6, 12, 12},
      new int[] {1, 0, 2},
      new int[] {1, 3, 1},
      12);
}
Example #16
Source File: ChineseWordAnalyzerTest.java From word with Apache License 2.0
@Test
public void test1() {
  try {
    Analyzer analyzer = new ChineseWordAnalyzer();
    // "杨尚川是APDPlat应用级产品开发平台的作者" =
    // "Yang Shangchuan is the author of the APDPlat application-level product development platform"
    TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
    List<String> words = new ArrayList<>();
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
      words.add(charTermAttribute.toString());
    }
    tokenStream.close();
    String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
    if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
      expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
    }
    assertEquals(expResult, words.toString());
  } catch (IOException e) {
    fail("分词出错" + e.getMessage()); // "tokenization error"
  }
}
Example #17
Source File: Tagger.java From SolrTextTagger with Apache License 2.0
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
              TagClusterReducer tagClusterReducer, boolean skipAltTokens,
              boolean ignoreStopWords) throws IOException {
  this.terms = terms;
  this.liveDocs = liveDocs;
  this.tokenStream = tokenStream;
  this.skipAltTokens = skipAltTokens;
  this.ignoreStopWords = ignoreStopWords;
  byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
  posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
  taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
  tokenStream.reset();

  this.tagClusterReducer = tagClusterReducer;
}
Example #18
Source File: TokenSources.java From lucene-solr with Apache License 2.0
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(Document doc, String field, Analyzer analyzer) {
  String contents = doc.get(field);
  if (contents == null) {
    throw new IllegalArgumentException("Field " + field + " in document is not stored and cannot be analyzed");
  }
  return getTokenStream(field, contents, analyzer);
}
Example #19
Source File: SmartcnUDF.java From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
        throws IOException {
  // instantiate an attribute placeholder once
  CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
  stream.reset();

  while (stream.incrementToken()) {
    String term = termAttr.toString();
    results.add(new Text(term));
  }
}
Example #20
Source File: LowerCaseTokenFilterFactory.java From Elasticsearch with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
  if (lang == null) {
    return new LowerCaseFilter(tokenStream);
  } else if (lang.equalsIgnoreCase("greek")) {
    return new GreekLowerCaseFilter(tokenStream);
  } else if (lang.equalsIgnoreCase("irish")) {
    return new IrishLowerCaseFilter(tokenStream);
  } else if (lang.equalsIgnoreCase("turkish")) {
    return new TurkishLowerCaseFilter(tokenStream);
  } else {
    throw new IllegalArgumentException("language [" + lang + "] not support for lower case");
  }
}
Example #21
Source File: TestCapitalizationFilterFactory.java From lucene-solr with Apache License 2.0
public void testCapitalization5() throws Exception {
  Reader reader = new StringReader("big");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("Capitalization",
      "keep", "and the it BIG",
      "onlyFirstWord", "true",
      "forceFirstLetter", "true").create(stream);
  assertTokenStreamContents(stream, new String[] { "Big" });
}
Example #22
Source File: StandardnumberAnalyzer.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = tokenizerFactory.create();
  TokenStream tokenStream = tokenizer;
  for (TokenFilterFactory tokenFilter : Collections.singletonList(stdnumTokenFilterFactory)) {
    tokenStream = tokenFilter.create(tokenStream);
  }
  return new TokenStreamComponents(tokenizer, tokenStream);
}
Example #23
Source File: TestICUTokenizer.java From lucene-solr with Apache License 2.0
public void testTokenAttributes() throws Exception {
  try (TokenStream ts = a.tokenStream("dummy", "This is a test")) {
    ScriptAttribute scriptAtt = ts.addAttribute(ScriptAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      assertEquals(UScript.LATIN, scriptAtt.getCode());
      assertEquals(UScript.getName(UScript.LATIN), scriptAtt.getName());
      assertEquals(UScript.getShortName(UScript.LATIN), scriptAtt.getShortName());
      assertTrue(ts.reflectAsString(false).contains("script=Latin"));
    }
    ts.end();
  }
}
Example #24
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
        throws IOException {
  if (analyzer == null) {
    throw new UnsupportedOperationException("To use MoreLikeThis without " +
        "term vectors, you must provide an Analyzer");
  }
  try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
    int tokenCount = 0;
    // for every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      tokenCount++;
      if (tokenCount > maxNumTokensParsed) {
        break;
      }
      if (isNoiseWord(word)) {
        continue;
      }
      if (isSkipTerm(fieldName, word)) {
        continue;
      }
      // increment frequency
      Int cnt = termFreqMap.get(word);
      if (cnt == null) {
        termFreqMap.put(word, new Int());
      } else {
        cnt.x++;
      }
    }
    ts.end();
  }
}
Example #25
Source File: QueryFactory.java From airsonic-advanced with GNU General Public License v3.0
/**
 * Query generation expression extracted from
 * {@link org.airsonic.player.service.SearchService#getRandomSongs(RandomSearchCriteria)}.
 */
public Query getRandomSongs(RandomSearchCriteria criteria) throws IOException {
  BooleanQuery.Builder query = new BooleanQuery.Builder();

  Analyzer analyzer = analyzerFactory.getQueryAnalyzer();

  // Unanalyzed field
  query.add(new TermQuery(new Term(FieldNames.MEDIA_TYPE, MediaType.MUSIC.name())), Occur.MUST);

  if (!isEmpty(criteria.getGenre())) {
    // Unanalyzed field, but performs filtering according to id3 tag parser.
    try (TokenStream stream = analyzer.tokenStream(FieldNames.GENRE, criteria.getGenre())) {
      stream.reset();
      if (stream.incrementToken()) {
        String token = stream.getAttribute(CharTermAttribute.class).toString();
        query.add(new TermQuery(new Term(FieldNames.GENRE, token)), Occur.MUST);
      }
    }
  }

  if (!(isEmpty(criteria.getFromYear()) && isEmpty(criteria.getToYear()))) {
    query.add(toYearRangeQuery.apply(criteria.getFromYear(), criteria.getToYear()), Occur.MUST);
  }

  query.add(toFolderQuery.apply(false, criteria.getMusicFolders()), Occur.MUST);

  return query.build();
}
Example #26
Source File: EnglishBaseformTokenFilterTests.java From elasticsearch-analysis-baseform with Apache License 2.0
private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
  stream.reset();
  CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
  assertNotNull(termAttr);
  int i = 0;
  while (stream.incrementToken()) {
    assertTrue(i < expected.length);
    assertEquals(expected[i++], termAttr.toString());
  }
  assertEquals(i, expected.length);
  stream.close();
}
Example #27
Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0
@Test
public void testSeparatorWithStopWords() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);
  assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] { 1 });
}
Example #28
Source File: TestKoreanTokenizer.java From lucene-solr with Apache License 2.0
private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
    ts.reset();
    for (String reading : readings) {
      assertTrue(ts.incrementToken());
      assertEquals(reading, readingAtt.getReading());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }
}
Example #29
Source File: TestPatternReplaceFilter.java From lucene-solr with Apache License 2.0
public void testReplaceFirst() throws Exception {
  String input = "aabfooaabfooabfoob ab caaaaaaaaab";
  TokenStream ts = new PatternReplaceFilter(whitespaceMockTokenizer(input),
      Pattern.compile("a*b"), "-", false);
  assertTokenStreamContents(ts, new String[] { "-fooaabfooabfoob", "-", "c-" });
}
Example #30
Source File: PatternAnalyzerImpl.java From database with GNU General Public License v2.0
@Override
protected TokenStreamComponents createComponents(final String field) {
  // Use default grouping
  final Tokenizer tokenizer = new PatternTokenizer(pattern, -1);
  final TokenStream filter = new LowerCaseFilter(tokenizer);
  return new TokenStreamComponents(tokenizer, filter);
}