org.apache.lucene.analysis.TokenFilter Java Examples
The following examples show how to use
org.apache.lucene.analysis.TokenFilter.
You can go to the original project or source file by following the links above each example.
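Before the examples, a minimal sketch of the pattern most of them follow may help: a TokenFilter wraps an upstream TokenStream, and incrementToken() advances the wrapped stream and rewrites its attributes in place. The filter below is an illustrative invention (its name and upper-casing behavior are not drawn from any project on this page); it shows only the core contract.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical example filter: upper-cases each token passed through it.
public final class UpperCaseDemoFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public UpperCaseDemoFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false; // the upstream stream is exhausted
    }
    // Rewrite the shared term attribute in place.
    final char[] buffer = termAtt.buffer();
    for (int i = 0; i < termAtt.length(); i++) {
      buffer[i] = Character.toUpperCase(buffer[i]);
    }
    return true;
  }
}

Filters chain by passing one filter as the input of the next, and an Analyzer ties the chain to a Tokenizer in createComponents(), as many of the examples below demonstrate.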
Example #1
Source File: NGramTokenFilterTest.java From lucene-solr with Apache License 2.0
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new NGramTokenFilter(filters, 2, 2, false);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      new int[]    { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
      new int[]    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
  analyzer.close();
}
Example #2
Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0
@Test
public void testWithStopword() throws Exception {
  for (boolean preservePosInc : new boolean[]{true, false}) {
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "a mykeyword a keyword"; // LUCENE-8344 add "a"
    tokenStream.setReader(new StringReader(input));
    TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a"));
    ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10);
    CharsRefBuilder builder = new CharsRefBuilder();
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("mykeyword");
    builder.append(SEP_LABEL);
    if (preservePosInc) {
      builder.append(SEP_LABEL);
    }
    builder.append("keyword");
    // if (preservePosInc) { LUCENE-8344 uncomment
    //   builder.append(SEP_LABEL);
    // }
    assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()});
  }
}
Example #3
Source File: ThrowingMockTokenFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenStream create(TokenStream input) {
  return new TokenFilter(input) {
    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        try {
          throw exceptionClass.getConstructor().newInstance();
        } catch (IllegalAccessException | InstantiationException
            | InvocationTargetException | NoSuchMethodException iae) {
          throw new RuntimeException(iae);
        }
      }
      return false;
    }
  };
}
Example #4
Source File: TestElision.java From lucene-solr with Apache License 2.0
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tas = filter(filter);
  assertEquals("embrouille", tas.get(4));
  assertEquals("O'brian", tas.get(6));
  assertEquals("enfin", tas.get(7));
}
Example #5
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
/**
 * Test that CommonGramsFilter works correctly in case-sensitive mode
 */
public void testCaseSensitive() throws Exception {
  final String input = "How The s a brown s cow d like A B thing?";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  TokenFilter cgf = new CommonGramsFilter(wt, commonWords);
  assertTokenStreamContents(cgf, new String[] { "How", "The", "The_s", "s", "s_a", "a",
      "a_brown", "brown", "brown_s", "s", "s_cow", "cow", "cow_d", "d", "d_like",
      "like", "A", "B", "thing?" });
}
Example #6
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case that the last word is a stopword
 */
public void testLastWordisStopWord() throws Exception {
  final String input = "dog the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "dog_the" });
}
Example #7
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case that the first word is a stopword
 */
public void testFirstWordisStopWord() throws Exception {
  final String input = "the dog";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_dog" });
}
Example #8
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case of a single (stop)word query
 */
public void testOneWordQueryStopWord() throws Exception {
  final String input = "the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the" });
}
Example #9
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case of a single word query
 */
public void testOneWordQuery() throws Exception {
  final String input = "monster";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "monster" });
}
Example #10
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter when first and last words are stopwords.
 */
public void testFirstAndLastStopWord() throws Exception {
  final String input = "the of";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_of" });
}
Example #11
Source File: TestDocInverterPerFieldErrorInfo.java From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new MockTokenizer();
  if (fieldName.equals("distinctiveFieldName")) {
    TokenFilter tosser = new TokenFilter(tokenizer) {
      @Override
      public boolean incrementToken() throws IOException {
        throw new BadNews("Something is icky.");
      }
    };
    return new TokenStreamComponents(tokenizer, tosser);
  } else {
    return new TokenStreamComponents(tokenizer);
  }
}
Example #12
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull SpellChecker spellChecker) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      SpellCheckerTokenFilter spellCheckFilter =
          new SpellCheckerTokenFilter(defaultTokenFilter(source), spellChecker);
      TokenFilter concatenatingFilter = new ConcatenatingFilter(spellCheckFilter, ' ');
      return new TokenStreamComponents(source, concatenatingFilter);
    }
  };
}
Example #13
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer concatenatingAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      TokenFilter concatenatingFilter = new ConcatenatingFilter(defaultTokenFilter(source), ' ');
      return new TokenStreamComponents(source, concatenatingFilter);
    }
  };
}
Example #14
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private static Analyzer createAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
          new DefaultIcuTokenizerConfig(false, true));
      TokenFilter filter = new IcuNormalizerFilter(tokenizer,
          Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
}
Example #15
Source File: PatternCaptureGroupTokenFilterFactory.java From Elasticsearch with Apache License 2.0
@Override
public TokenFilter create(TokenStream tokenStream) {
  return new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
Example #16
Source File: TestMoreLikeThis.java From lucene-solr with Apache License 2.0
public void testCustomFrequency() throws IOException {
  // define an analyzer with delimited term frequency, e.g. "foo|2 bar|3"
  Analyzer analyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false, 100);
      MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
      return new TokenStreamComponents(tokenizer, addCustomTokenFilter(filt));
    }

    TokenStream addCustomTokenFilter(TokenStream input) {
      return new TokenFilter(input) {
        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);

        @Override
        public boolean incrementToken() throws IOException {
          if (input.incrementToken()) {
            final char[] buffer = termAtt.buffer();
            final int length = termAtt.length();
            for (int i = 0; i < length; i++) {
              if (buffer[i] == '|') {
                // split "term|freq": keep the term, parse the trailing frequency
                termAtt.setLength(i);
                i++;
                tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i));
                return true;
              }
            }
            return true;
          }
          return false;
        }
      };
    }
  };
  mlt.setAnalyzer(analyzer);
  mlt.setFieldNames(new String[] {"text"});
  mlt.setBoost(true);
  final double boost10 = ((BooleanQuery) mlt.like("text", new StringReader("lucene|10 release|1")))
      .clauses()
      .stream()
      .map(BooleanClause::getQuery)
      .map(BoostQuery.class::cast)
      .filter(x -> ((TermQuery) x.getQuery()).getTerm().text().equals("lucene"))
      .mapToDouble(BoostQuery::getBoost)
      .sum();
  final double boost1 = ((BooleanQuery) mlt.like("text", new StringReader("lucene|1 release|1")))
      .clauses()
      .stream()
      .map(BooleanClause::getQuery)
      .map(BoostQuery.class::cast)
      .filter(x -> ((TermQuery) x.getQuery()).getTerm().text().equals("lucene"))
      .mapToDouble(BoostQuery::getBoost)
      .sum();
  // mlt should use the custom frequencies provided by the analyzer,
  // so "lucene|10" should be boosted more than "lucene|1"
  assertTrue(String.format(Locale.ROOT, "%s should be greater than %s", boost10, boost1),
      boost10 > boost1);
}
Example #17
Source File: TestTermAutomatonQuery.java From lucene-solr with Apache License 2.0
public RandomSynonymFilter(TokenFilter in) {
  super(in);
}
Example #18
Source File: NGramFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new NGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
Example #19
Source File: EdgeNGramFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new EdgeNGramTokenFilter(input, minGramSize, maxGramSize, preserveOriginal);
}
Example #20
Source File: HyphenationCompoundWordTokenFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary,
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
Example #21
Source File: ClassicFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new ClassicFilter(input);
}
Example #22
Source File: KStemFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new KStemFilter(input);
}
Example #23
Source File: CommonGramsFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  CommonGramsFilter commonGrams = new CommonGramsFilter(input, commonWords);
  return commonGrams;
}
Example #24
Source File: CommonGramsQueryFilterFactory.java From lucene-solr with Apache License 2.0
/**
 * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
 */
@Override
public TokenFilter create(TokenStream input) {
  CommonGramsFilter commonGrams = (CommonGramsFilter) super.create(input);
  return new CommonGramsQueryFilter(commonGrams);
}
Example #25
Source File: WordDelimiterFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new WordDelimiterFilter(input,
      typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
      flags, protectedWords);
}
Example #26
Source File: WordDelimiterGraphFilterFactory.java From lucene-solr with Apache License 2.0
@Override
public TokenFilter create(TokenStream input) {
  return new WordDelimiterGraphFilter(input, adjustOffsets,
      typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
      flags, protectedWords);
}
Example #27
Source File: TestIndicNormalizer.java From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenFilter tf = new IndicNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
Example #28
Source File: TestHindiStemmer.java From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new HindiStemFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
Example #29
Source File: TestHindiNormalizer.java From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new HindiNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
Example #30
Source File: TestBengaliStemmer.java From lucene-solr with Apache License 2.0
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new BengaliStemFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}