Java Code Examples for org.elasticsearch.index.analysis.TokenFilterFactory

The following examples show how to use org.elasticsearch.index.analysis.TokenFilterFactory. These examples are extracted from open source projects; where available, the original project, source file, and license are noted above the example.
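Before the examples, a quick orientation: the contract behind every snippet below is small. A TokenFilterFactory exposes a name() and wraps an incoming TokenStream via create(TokenStream), as the anonymous implementations in Examples 19, 23, and 24 show. The following is a minimal, hypothetical sketch of that contract; the class and filter names are invented, and the LowerCaseFilter import assumes a recent Lucene (older versions keep it in org.apache.lucene.analysis.core).

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.index.analysis.TokenFilterFactory;

// Minimal sketch of the TokenFilterFactory contract: give the filter a name
// and wrap the incoming stream with any Lucene TokenFilter.
public class LowercaseDemoTokenFilterFactory implements TokenFilterFactory {

    @Override
    public String name() {
        // The name under which this filter is known to the analysis chain.
        return "lowercase_demo";
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Wrap rather than consume: factories compose into filter chains.
        return new LowerCaseFilter(tokenStream);
    }
}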
Example 1
Source Project: Elasticsearch    Source File: SuggestUtils.java    License: Apache License 2.0
public static ShingleTokenFilterFactory.Factory getShingleFilterFactory(Analyzer analyzer) {
    if (analyzer instanceof NamedAnalyzer) {
        analyzer = ((NamedAnalyzer)analyzer).analyzer();
    }
    if (analyzer instanceof CustomAnalyzer) {
        final CustomAnalyzer a = (CustomAnalyzer) analyzer;
        final TokenFilterFactory[] tokenFilters = a.tokenFilters();
        for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
            if (tokenFilterFactory instanceof ShingleTokenFilterFactory) {
                return ((ShingleTokenFilterFactory)tokenFilterFactory).getInnerFactory();
            } else if (tokenFilterFactory instanceof ShingleTokenFilterFactory.Factory) {
                return (ShingleTokenFilterFactory.Factory) tokenFilterFactory;
            }
        }
    }
    return null;
}
 
Example 2
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
    final Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
    extra.put("synonym_filter", new AnalysisProvider<TokenFilterFactory>() {

        @Override
        public TokenFilterFactory get(final IndexSettings indexSettings, final Environment environment, final String name, final Settings settings)
                throws IOException {
            return new SynonymTokenFilterFactory(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry());
        }

        @Override
        public boolean requiresAnalysisSettings() {
            return true;
        }
    });
    return extra;
}
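Once a plugin registers a filter this way, index settings can reference it by its registered name. The snippet below is a usage sketch, not part of the original source: the analyzer name is invented, and putList assumes a recent Elasticsearch Settings API (older releases call the same method putArray).

// Hypothetical index settings wiring the plugin-provided "synonym_filter"
// into a custom analyzer, alongside the built-in lowercase filter.
Settings indexSettings = Settings.builder()
        .put("index.analysis.analyzer.my_synonyms.type", "custom")
        .put("index.analysis.analyzer.my_synonyms.tokenizer", "standard")
        .putList("index.analysis.analyzer.my_synonyms.filter", "lowercase", "synonym_filter")
        .build();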
 
Example 3
public void testSimple() throws Exception {
    String source = "Programmieren mit C++";

    String[] expected = {
            "Programmieren",
            "mit",
            "C++",
            "C __PLUSSIGN__ __PLUSSIGN__",
            "C",
            "__PLUSSIGN__",
            "__PLUSSIGN__"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("symbolname");
    Tokenizer tokenizer = analysis.tokenizer.get("whitespace").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 4
public void testDefaultsIcuAnalysis() throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));

    CharFilterFactory charFilterFactory = analysis.charFilter.get("icu_normalizer");
    assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));

    TokenizerFactory tf = analysis.tokenizer.get("icu_tokenizer");
    assertThat(tf, instanceOf(IcuTokenizerFactory.class));

    TokenFilterFactory filterFactory = analysis.tokenFilter.get("icu_normalizer");
    assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));

    filterFactory = analysis.tokenFilter.get("icu_folding");
    assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));

    filterFactory = analysis.tokenFilter.get("icu_transform");
    assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));

    Analyzer analyzer = analysis.indexAnalyzers.get("icu_collation");
    assertThat(analyzer, instanceOf(NamedAnalyzer.class));
}
 
Example 5
public void testTransformTraditionalSimplified() throws Exception {
    String source = "簡化字";
    String[] expected = new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_ch");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 6
public void testTransformHanLatin() throws Exception {
    String source = "中国";
    String[] expected = new String[] { "zhōng guó" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_han").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_han");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 7
public void testTransformKatakanaHiragana() throws Exception {
    String source = "ヒラガナ";
    String[] expected = new String[] { "ひらがな" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_katakana").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_katakana");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 8
public void testTransformCyrillicLatin() throws Exception {
    String source = "Российская Федерация";
    String[] expected = new String[] { "Rossijskaâ", "Federaciâ" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_cyr");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 9
public void testTransformCyrillicLatinReverse() throws Exception {
    String source = "Rossijskaâ Federaciâ";
    String[] expected = new String[] { "Российская", "Федерация" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_cyr_reverse");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 10
public void testTransformAnyLatin() throws Exception {
    String source = "Αλφαβητικός Κατάλογος";
    String[] expected = new String[] { "Alphabētikós", "Katálogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_any_latin").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_any_latin");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 11
public void testTransformNFD() throws Exception {
    String source = "Alphabētikós Katálogos";
    String[] expected = new String[] { "Alphabetikos", "Katalogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_nfd").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_nfd");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 12
public void testTransformRules() throws Exception {
    String source = "abacadaba";
    String[] expected = new String[] { "bcbcbdbcb" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_rules").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_rules");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 13
public void testTwo() throws Exception {
    String source = "Das sind Autos, die Nudeln transportieren.";

    String[] expected = {
            "Das",
            "Das",
            "sind",
            "sind",
            "Autos",
            "Auto",
            "die",
            "der",
            "Nudeln",
            "Nudel",
            "transportieren",
            "transportieren"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 14
public void testThree() throws Exception {
    String source = "wurde zum tollen gemacht";

    String[] expected = {
            "wurde",
            "werden",
            "zum",
            "zum",
            "tollen",
            "tollen",
            "gemacht",
            "machen"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 15
public void testOne() throws Exception {
    String source = "Das ist ein Bindestrich-Wort.";

    String[] expected = {
            "Das",
            "ist",
            "ein",
            "Bindestrich-Wort",
            "BindestrichWort",
            "Wort",
            "Bindestrich"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 16
public void testTwo() throws Exception {
    String source = "Das E-Book muss dringend zum Buchbinder.";

    String[] expected = {
            "Das",
            "E-Book",
            "EBook",
            "Book",
            "muss",
            "dringend",
            "zum",
            "Buchbinder"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 17
public void testFive() throws Exception {
    String source = "978-1-4493-5854-9";

    String[] expected = {
            "978-1-4493-5854-9"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 18
public void testSix() throws Exception {
    String source = "E-Book";

    String[] expected = {
            "E-Book",
            "EBook",
            "Book"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 19
@Inject
public AnnotationIndicesAnalysis(Settings settings,
                                 IndicesAnalysisService indicesAnalysisService) {
    super(settings);
    indicesAnalysisService.analyzerProviderFactories().put(
            "default",
            new PreBuiltAnalyzerProviderFactory("default",
                    AnalyzerScope.INDICES, new AnnotationAnalyzer(
                            Lucene.ANALYZER_VERSION)));

    indicesAnalysisService.tokenFilterFactories().put("annotation_filter",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "annotation_filter";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new InlineAnnotationFilter(tokenStream);
                }
            }));
}
 
Example 20
@Test
public void testTwo() throws IOException {

    String source = "Das sind Autos, die Nudeln transportieren.";

    String[] expected = {
            "Das",
            "Das",
            "sind",
            "sind",
            "Autos",
            "Auto",
            "die",
            "der",
            "Nudeln",
            "Nudel",
            "transportieren",
            "transportieren"
    };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Example 21
@Test
public void testThree() throws IOException {

    String source = "wurde zum tollen gemacht";

    String[] expected = {
            "wurde",
            "werden",
            "zum",
            "zum",
            "tollen",
            "tollen",
            "gemacht",
            "machen"
    };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Example 22
Source Project: crate    Source File: AnalysisModule.java    License: Apache License 2.0
public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) throws IOException {
    NamedRegistry<AnalysisProvider<CharFilterFactory>> charFilters = setupCharFilters(plugins);
    NamedRegistry<org.apache.lucene.analysis.hunspell.Dictionary> hunspellDictionaries = setupHunspellDictionaries(plugins);
    HunspellService hunspellService = new HunspellService(environment.settings(), environment, hunspellDictionaries.getRegistry());
    NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = setupTokenFilters(plugins, hunspellService);
    NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
    NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
    NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers();

    Map<String, PreConfiguredCharFilter> preConfiguredCharFilters = setupPreConfiguredCharFilters(plugins);
    Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
    Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins);
    Map<String, PreBuiltAnalyzerProviderFactory> preConfiguredAnalyzers = setupPreBuiltAnalyzerProviderFactories(plugins);

    analysisRegistry = new AnalysisRegistry(environment,
            charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers.getRegistry(),
            analyzers.getRegistry(), normalizers.getRegistry(),
            preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers, preConfiguredAnalyzers);
}
 
Example 23
Source Project: crate    Source File: ASCIIFoldingTokenFilterFactory.java    License: Apache License 2.0
@Override
public Object getMultiTermComponent() {
    if (preserveOriginal == false) {
        return this;
    } else {
        // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
        return new TokenFilterFactory() {

            @Override
            public String name() {
                return ASCIIFoldingTokenFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
}
 
Example 24
Source Project: crate    Source File: MultiplexerTokenFilterFactory.java    License: Apache License 2.0
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
    return new TokenFilterFactory() {
        @Override
        public String name() {
            return name;
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            for (TokenFilterFactory tff : filters) {
                tokenStream = tff.create(tokenStream);
            }
            return tokenStream;
        }
    };
}
 
Example 25
@Override
public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
    Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> tokenFilters = new HashMap<>();

    tokenFilters.put(PLUGIN_NAME, requiresAnalysisSettings((is, env, name, settings) -> new DynamicSynonymTokenFilterFactory(is, env, name, settings)));

    return tokenFilters;
}
 
Example 26
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = tokenizerFactory.create();
    TokenStream tokenStream = tokenizer;
    for (TokenFilterFactory tokenFilter : Collections.singletonList(stdnumTokenFilterFactory)) {
        tokenStream = tokenFilter.create(tokenStream);
    }
    return new TokenStreamComponents(tokenizer, tokenStream);
}
 
Example 27
@Override
public void build(final Map<String, TokenizerFactory> tokenizers,
                  final Map<String, CharFilterFactory> charFilters,
                  final Map<String, TokenFilterFactory> tokenFilters) {
    List<CharFilterFactory> myCharFilters = new ArrayList<>();
    List<String> charFilterNames = analyzerSettings.getAsList("char_filter");
    for (String charFilterName : charFilterNames) {
        CharFilterFactory charFilter = charFilters.get(charFilterName);
        if (charFilter == null) {
            throw new IllegalArgumentException("hyphen analyzer [" + name()
                    + "] failed to find char_filter under name [" + charFilterName + "]");
        }
        myCharFilters.add(charFilter);
    }
    List<TokenFilterFactory> myTokenFilters = new ArrayList<>();
    myTokenFilters.add(tokenFilterFactory);
    List<String> tokenFilterNames = analyzerSettings.getAsList("filter");
    for (String tokenFilterName : tokenFilterNames) {
        TokenFilterFactory tokenFilter = tokenFilters.get(tokenFilterName);
        if (tokenFilter == null) {
            throw new IllegalArgumentException("hyphen analyzer [" + name()
                    + "] failed to find filter under name [" + tokenFilterName + "]");
        }
        myTokenFilters.add(tokenFilter);
    }
    int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", 0);
    int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
    this.customAnalyzer = new CustomAnalyzer(name(), tokenizerFactory,
            myCharFilters.toArray(new CharFilterFactory[myCharFilters.size()]),
            myTokenFilters.toArray(new TokenFilterFactory[myTokenFilters.size()]),
            positionOffsetGap,
            offsetGap
    );
}
 
Example 28
@Override
public void build(final Map<String, TokenizerFactory> tokenizers,
                  final Map<String, CharFilterFactory> charFilters,
                  final Map<String, TokenFilterFactory> tokenFilters) {
    List<CharFilterFactory> myCharFilters = new ArrayList<>();
    List<String> charFilterNames = analyzerSettings.getAsList("char_filter");
    for (String charFilterName : charFilterNames) {
        CharFilterFactory charFilter = charFilters.get(charFilterName);
        if (charFilter == null) {
            throw new IllegalArgumentException("Sortform Analyzer [" + name() +
                    "] failed to find char_filter under name [" + charFilterName + "]");
        }
        myCharFilters.add(charFilter);
    }
    List<TokenFilterFactory> myTokenFilters = new ArrayList<>();
    List<String> tokenFilterNames = analyzerSettings.getAsList("filter");
    for (String tokenFilterName : tokenFilterNames) {
        TokenFilterFactory tokenFilter = tokenFilters.get(tokenFilterName);
        if (tokenFilter == null) {
            throw new IllegalArgumentException("Sortform Analyzer [" + name() +
                    "] failed to find filter under name [" + tokenFilterName + "]");
        }
        myTokenFilters.add(tokenFilter);
    }
    int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", 0);
    int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
    this.customAnalyzer = new CustomAnalyzer(name(), tokenizerFactory,
            myCharFilters.toArray(new CharFilterFactory[myCharFilters.size()]),
            myTokenFilters.toArray(new TokenFilterFactory[myTokenFilters.size()]),
            positionOffsetGap,
            offsetGap
    );
}
 
Example 29
public void testPunctuation() throws Exception {
    String source = "Programmieren mit C++ Version 2.0";

    String[] expected = {
            "Programmieren",
            "mit",
            "C++",
            "C __PLUSSIGN__ __PLUSSIGN__",
            "C",
            "__PLUSSIGN__",
            "__PLUSSIGN__",
            "Version",
            "2.0",
            "__DIGITTWO__ __FULLSTOP__ __DIGITZERO__",
            "__DIGITTWO__",
            "__FULLSTOP__",
            "__DIGITZERO__"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("symbolname");
    Tokenizer tokenizer = analysis.tokenizer.get("whitespace").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 30
public void testSingleSymbols() throws Exception {
    String source = "Programmieren mit + und - ist toll, oder?";

    String[] expected = {
            "Programmieren",
            "mit",
            "+",
            "__PLUSSIGN__",
            "und",
            "-",
            "__HYPHENMINUS__",
            "ist",
            "toll,",
            "toll __COMMA__",
            "toll",
            "__COMMA__",
            "oder?",
            "oder __QUESTIONMARK__",
            "oder",
            "__QUESTIONMARK__"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("symbolname");
    Tokenizer tokenizer = analysis.tokenizer.get("whitespace").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}