org.elasticsearch.index.analysis.TokenFilterFactory Java Examples

The following examples show how to use org.elasticsearch.index.analysis.TokenFilterFactory. They are drawn from open source projects; the source file, project, and license are noted above each example.
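Nearly every example below reduces to the same small contract: a TokenFilterFactory names itself and wraps an incoming TokenStream with a Lucene filter. A minimal sketch, assuming the interface shape used by the anonymous implementations in Examples #18, #20, and #21 (the class and filter names here are hypothetical):

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.index.analysis.TokenFilterFactory;

// Hypothetical factory for illustration: it names itself and wraps the
// incoming stream with Lucene's LowerCaseFilter, mirroring the pattern
// used by the real plugin code in Example #18.
public class LowercaseOnlyTokenFilterFactory implements TokenFilterFactory {

    @Override
    public String name() {
        return "lowercase_only";
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new LowerCaseFilter(tokenStream);
    }
}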
Example #1
Source File: IcuAnalysisTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testDefaultsIcuAnalysis() throws IOException {

    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));

    CharFilterFactory charFilterFactory = analysis.charFilter.get("icu_normalizer");
    assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));

    TokenizerFactory tf = analysis.tokenizer.get("icu_tokenizer");
    assertThat(tf, instanceOf(IcuTokenizerFactory.class));

    TokenFilterFactory filterFactory = analysis.tokenFilter.get("icu_normalizer");
    assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));

    filterFactory = analysis.tokenFilter.get("icu_folding");
    assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));

    filterFactory = analysis.tokenFilter.get("icu_transform");
    assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));

    Analyzer analyzer = analysis.indexAnalyzers.get("icu_collation");
    assertThat(analyzer, instanceOf(NamedAnalyzer.class));
}
 
Example #2
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testFive() throws Exception {

    String source = "978-1-4493-5854-9";

    String[] expected = {
            "978-1-4493-5854-9"
    };

    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example #3
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTwo() throws Exception {

    String source = "Das E-Book muss dringend zum Buchbinder.";

    String[] expected = {
            "Das",
            "E-Book",
            "EBook",
            "Book",
            "muss",
            "dringend",
            "zum",
            "Buchbinder"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example #4
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOne() throws Exception {

    String source = "Das ist ein Bindestrich-Wort.";

    String[] expected = {
            "Das",
            "ist",
            "ein",
            "Bindestrich-Wort",
            "BindestrichWort",
            "Wort",
            "Bindestrich"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #5
Source File: BaseformTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testThree() throws Exception {

    String source = "wurde zum tollen gemacht";

    String[] expected = {
            "wurde",
            "werden",
            "zum",
            "zum",
            "tollen",
            "tollen",
            "gemacht",
            "machen"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example #6
Source File: BaseformTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTwo() throws Exception {

    String source = "Das sind Autos, die Nudeln transportieren.";

    String[] expected = {
            "Das",
            "Das",
            "sind",
            "sind",
            "Autos",
            "Auto",
            "die",
            "der",
            "Nudeln",
            "Nudel",
            "transportieren",
            "transportieren"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example #7
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSix() throws Exception {

    String source = "E-Book";

    String[] expected = {
            "E-Book",
            "EBook",
            "Book"
    };

    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example #8
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformRules() throws Exception {
    String source = "abacadaba";
    String[] expected = new String[] { "bcbcbdbcb" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_rules").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_rules");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
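The icu_transform.json resource is likewise not shown. Judging from the expected output of testTransformNFD (Example #9 below), each filter configures an ICU transliterator by ID; a sketch of one such filter definition, assuming the same type and id settings keys as the standard analysis-icu plugin (the actual resource may differ), could be:

// Hypothetical settings for the NFD transform filter used in Example #9.
// The transliterator ID "NFD; [:Nonspacing Mark:] Remove" strips combining
// accents, which matches the expected "Alphabetikos"/"Katalogos" output.
Settings settings = Settings.builder()
        .put("index.analysis.filter.my_icu_transformer_nfd.type", "icu_transform")
        .put("index.analysis.filter.my_icu_transformer_nfd.id", "NFD; [:Nonspacing Mark:] Remove")
        .build();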
 
Example #9
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformNFD() throws Exception {
    String source = "Alphabētikós Katálogos";
    String[] expected = new String[] { "Alphabetikos", "Katalogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_nfd").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_nfd");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #10
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformAnyLatin() throws Exception {
    String source = "Αλφαβητικός Κατάλογος";
    String[] expected = new String[] { "Alphabētikós", "Katálogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_any_latin").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_any_latin");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #11
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformCyrillicLatinReverse() throws Exception {
    String source = "Rossijskaâ Federaciâ";
    String[] expected = new String[] { "Российская", "Федерация"};
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_cyr_reverse");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #12
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformCyrillicLatin() throws Exception {
    String source = "Российская Федерация";
    String[] expected = new String[] { "Rossijskaâ", "Federaciâ" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_cyr");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #13
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformKatakanaHiragana() throws Exception {
    String source = "ヒラガナ";
    String[] expected =  new String[] { "ひらがな" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_katakana").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_katakana");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #14
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformHanLatin() throws Exception {
    String source = "中国";
    String[] expected =  new String[] { "zhōng guó" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_han").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_han");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #15
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformTraditionalSimplified() throws Exception {
    String source = "簡化字";
    String[] expected =  new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_ch");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example #16
Source File: AnalysisModule.java    From crate with Apache License 2.0
public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) throws IOException {
    NamedRegistry<AnalysisProvider<CharFilterFactory>> charFilters = setupCharFilters(plugins);
    NamedRegistry<org.apache.lucene.analysis.hunspell.Dictionary> hunspellDictionaries = setupHunspellDictionaries(plugins);
    HunspellService hunspellService = new HunspellService(environment.settings(), environment, hunspellDictionaries.getRegistry());
    NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = setupTokenFilters(plugins, hunspellService);
    NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
    NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
    NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers();

    Map<String, PreConfiguredCharFilter> preConfiguredCharFilters = setupPreConfiguredCharFilters(plugins);
    Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
    Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins);
    Map<String, PreBuiltAnalyzerProviderFactory> preConfiguredAnalyzers = setupPreBuiltAnalyzerProviderFactories(plugins);

    analysisRegistry = new AnalysisRegistry(environment,
            charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers.getRegistry(),
            analyzers.getRegistry(), normalizers.getRegistry(),
            preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers, preConfiguredAnalyzers);
}
 
Example #17
Source File: SuggestUtils.java    From Elasticsearch with Apache License 2.0
public static ShingleTokenFilterFactory.Factory getShingleFilterFactory(Analyzer analyzer) {
    if (analyzer instanceof NamedAnalyzer) {
        analyzer = ((NamedAnalyzer)analyzer).analyzer();
    }
    if (analyzer instanceof CustomAnalyzer) {
        final CustomAnalyzer a = (CustomAnalyzer) analyzer;
        final TokenFilterFactory[] tokenFilters = a.tokenFilters();
        for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
            if (tokenFilterFactory instanceof ShingleTokenFilterFactory) {
                return ((ShingleTokenFilterFactory)tokenFilterFactory).getInnerFactory();
            } else if (tokenFilterFactory instanceof ShingleTokenFilterFactory.Factory) {
                return (ShingleTokenFilterFactory.Factory) tokenFilterFactory;
            }
        }
    }
    return null;
}
 
Example #18
Source File: CommonAnalysisPlugin.java    From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return tokenizers;
}
 
Example #19
Source File: SynonymPlugin.java    From elasticsearch-analysis-synonym with Apache License 2.0
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
    final Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
    extra.put("synonym_filter", new AnalysisProvider<TokenFilterFactory>() {

        @Override
        public TokenFilterFactory get(final IndexSettings indexSettings, final Environment environment, final String name, final Settings settings)
                throws IOException {
            return new SynonymTokenFilterFactory(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry());
        }

        @Override
        public boolean requiresAnalysisSettings() {
            return true;
        }
    });
    return extra;
}
 
Example #20
Source File: MultiplexerTokenFilterFactory.java    From crate with Apache License 2.0
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
    return new TokenFilterFactory() {
        @Override
        public String name() {
            return name;
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            for (TokenFilterFactory tff : filters) {
                tokenStream = tff.create(tokenStream);
            }
            return tokenStream;
        }
    };
}
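A factory chained this way applies the wrapped factories in list order. A hypothetical usage, where lowercaseFactory and foldingFactory are assumed to be TokenFilterFactory instances already resolved from the analysis registry:

// Order matters: tokens are lowercased first, then folded.
// lowercaseFactory and foldingFactory are assumed for illustration.
TokenFilterFactory chained = chainFilters("lowercase_then_fold",
        List.of(lowercaseFactory, foldingFactory));
TokenStream filtered = chained.create(tokenizer);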
 
Example #21
Source File: ASCIIFoldingTokenFilterFactory.java    From crate with Apache License 2.0
@Override
public Object getMultiTermComponent() {
    if (preserveOriginal == false) {
        return this;
    } else {
        // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
        return new TokenFilterFactory() {

            @Override
            public String name() {
                return ASCIIFoldingTokenFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
}
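(The reasoning in LUCENE-7536, in short: multi-term queries such as prefix and fuzzy expect analysis to yield exactly one token, so the multi-term variant must fold unconditionally instead of also preserving the original token.)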
 
Example #22
Source File: BaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
@Test
public void testThree() throws IOException {

    String source = "wurde zum tollen gemacht";

    String[] expected = {
            "wurde",
            "werden",
            "zum",
            "zum",
            "tollen",
            "tollen",
            "gemacht",
            "machen"
    };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Example #23
Source File: SymbolnameTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSimple() throws Exception {

    String source = "Programmieren mit C++";

    String[] expected = {
            "Programmieren",
            "mit",
            "C++",
            "C __PLUSSIGN__ __PLUSSIGN__",
            "C",
            "__PLUSSIGN__",
            "__PLUSSIGN__"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("symbolname");
    Tokenizer tokenizer = analysis.tokenizer.get("whitespace").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example #24
Source File: BaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
@Test
public void testTwo() throws IOException {

    String source = "Das sind Autos, die Nudeln transportieren.";

    String[] expected = {
            "Das",
            "Das",
            "sind",
            "sind",
            "Autos",
            "Auto",
            "die",
            "der",
            "Nudeln",
            "Nudel",
            "transportieren",
            "transportieren"
    };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Example #25
Source File: AnnotationIndicesAnalysis.java    From elasticsearch-analysis-annotation with Apache License 2.0
@Inject
public AnnotationIndicesAnalysis(Settings settings,
		IndicesAnalysisService indicesAnalysisService) {
	super(settings);
	indicesAnalysisService.analyzerProviderFactories().put(
			"default",
			new PreBuiltAnalyzerProviderFactory("default",
					AnalyzerScope.INDICES, new AnnotationAnalyzer(
							Lucene.ANALYZER_VERSION)));

	indicesAnalysisService.tokenFilterFactories().put("annotation_filter",
			new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
				@Override
				public String name() {
					return "annotation_filter";
				}

				@Override
				public TokenStream create(TokenStream tokenStream) {
					return new InlineAnnotationFilter(tokenStream);
				}
			}));
}
 
Example #26
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testNine() throws Exception {

    String source = "Das ist ein Punkt. Und noch ein Punkt für U.S.A. Oder? Nicht doch.";

    String[] expected = {
            "Das",
            "ist",
            "ein",
            "Punkt",
            "Und",
            "noch",
            "ein",
            "Punkt",
            "für",
            "U.S.A",
            "Oder",
            "Nicht",
            "doch"
    };
    String resource = "hyphen_tokenizer_without_subwords.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_hyphen_tokenfilter");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example #27
Source File: AnalysisModule.java    From crate with Apache License 2.0
private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(List<AnalysisPlugin> plugins, HunspellService
    hunspellService) {
    NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
    tokenFilters.register("stop", StopTokenFilterFactory::new);
    tokenFilters.register("standard", (indexSettings, environment, name, settings) -> {
        DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_deprecation",
            "The [standard] token filter name is deprecated and will be removed in a future version.");
        return new AbstractTokenFilterFactory(indexSettings, name, settings) {
            @Override
            public TokenStream create(TokenStream tokenStream) {
                return tokenStream;
            }
        };
    });
    tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
    tokenFilters.register(
        "hunspell",
        requiresAnalysisSettings((indexSettings, env, name, settings) ->
            new HunspellTokenFilterFactory(
                indexSettings,
                name,
                settings,
                hunspellService
            )
        )
    );

    tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
    return tokenFilters;
}
 
Example #28
Source File: ESTestCase.java    From crate with Apache License 2.0
public TestAnalysis(IndexAnalyzers indexAnalyzers,
                    Map<String, TokenFilterFactory> tokenFilter,
                    Map<String, TokenizerFactory> tokenizer,
                    Map<String, CharFilterFactory> charFilter) {
    this.indexAnalyzers = indexAnalyzers;
    this.tokenFilter = tokenFilter;
    this.tokenizer = tokenizer;
    this.charFilter = charFilter;
}
 
Example #29
Source File: BaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
@Test
public void testOne() throws IOException {

    String source = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";

    String[] expected = {
            "Die",
            "Die",
            "Jahresfeier",
            "Jahresfeier",
            "der",
            "der",
            "Rechtsanwaltskanzleien",
            "Rechtsanwaltskanzlei",
            "auf",
            "auf",
            "dem",
            "der",
            "Donaudampfschiff",
            "Donaudampfschiff",
            "hat",
            "haben",
            "viel",
            "viel",
            "Ökosteuer",
            "Ökosteuer",
            "gekostet",
            "kosten"
    };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Example #30
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSeven() throws Exception {
    String source = "Procter & Gamble ist Procter&Gamble. Schwarz - weiss ist schwarz-weiss";

    String[] expected = {
            "Procter",
            "Gamble",
            "ist",
            "Procter&Gamble",
            "Schwarz",
            "weiss",
            "ist",
            "schwarz-weiss",
            "schwarzweiss",
            "weiss",
            "schwarz"
    };

    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}