org.apache.lucene.analysis.fr.FrenchAnalyzer Java Examples

The following examples show how to use org.apache.lucene.analysis.fr.FrenchAnalyzer. You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. You can also check out related API usage in the sidebar.
Example #1
Source Project: Elasticsearch   Author: baidu   File: FrenchAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public FrenchAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new FrenchAnalyzer(Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #2
Source Project: lucene-solr   Author: apache   File: ElisionFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the set of articles used for elision.
 *
 * <p>When no articles file has been configured, {@link FrenchAnalyzer#DEFAULT_ARTICLES}
 * is used; otherwise the word set is loaded from the configured file via the given loader.
 *
 * @param loader resource loader used to read the articles file, if one is configured
 * @throws IOException declared for the word-set loading path
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
  articles = (articlesFile != null)
      ? getWordSet(loader, articlesFile, ignoreCase)
      : FrenchAnalyzer.DEFAULT_ARTICLES;
}
 
Example #3
Source Project: lucene-solr   Author: apache   File: TestElision.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code checkOneTerm} with an empty input and an empty expected term against an
 * analyzer made of a {@link KeywordTokenizer} followed by an {@link ElisionFilter}
 * configured with {@link FrenchAnalyzer#DEFAULT_ARTICLES}.
 *
 * @throws IOException declared for the token-stream check
 */
public void testEmptyTerm() throws IOException {
  // try-with-resources closes the analyzer even when the check below throws,
  // instead of only closing it on the success path.
  try (Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES));
    }
  }) {
    checkOneTerm(a, "", "");
  }
}
 
Example #4
Source Project: crate   Author: crate   File: CommonAnalysisPlugin.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the list of pre-configured token filters exposed by this plugin.
 *
 * <p>Each entry is created with {@code PreConfiguredTokenFilter.singleton(name, flag, factory)},
 * where {@code factory} wraps an incoming token stream in the corresponding Lucene filter.
 * NOTE(review): the boolean flag's meaning is not visible here; it presumably marks whether
 * the filter is also applied to multi-term queries. Confirm against
 * {@code PreConfiguredTokenFilter.singleton}.
 *
 * @return a new mutable list containing one entry per registered filter name, in the order added below
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    // common_grams is registered with an empty common-words set.
    filters.add(PreConfiguredTokenFilter.singleton(
        "common_grams",
        false,
        input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    // "delimited_payload_filter" and "delimited_payload" register the same factory under two names.
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
    // NOTE(review): SIDE_FRONT and SIDE_BACK are passed as the 2nd and 3rd constructor arguments
    // of EdgeNGramTokenFilter; confirm these constants are the intended values for those parameters.
    filters.add(PreConfiguredTokenFilter.singleton(
        "edge_ngram",
        false,
        input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    // elision uses the French analyzer's default article set.
    filters.add(PreConfiguredTokenFilter.singleton(
        "elision",
        true,
        input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))
    );
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    // The length bounds are 0 and Integer.MAX_VALUE (see the TODO on the next statement).
    filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
        new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton(
        "limit",
        false,
        input -> new LimitTokenCountFilter(
            input,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)
        )
    );
    // ngram is constructed with the fixed arguments (1, 2, false).
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    // The pre-configured "snowball" filter uses the "English" stemmer name.
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    // "stemmer" is registered with the same PorterStemFilter constructor as "porter_stem".
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    // truncate is constructed with the fixed argument 10.
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    // Both word-delimiter variants below OR together the same five flags and pass null as the last argument.
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
            new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
            new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
 
Example #5
Source Project: crate   Author: crate   File: FrenchAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
FrenchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new FrenchAnalyzer(
        Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example #6
Source Project: freehealth-connector   Author: taktik   File: DrugsDAOImpl.java    License: GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * {@code "fr"} (French), {@code "nl"} (Dutch) and {@code "en"} (standard).
 * All analyzers are created for the same Lucene version.
 */
private void buildAnalyzerMap() {
    final Version luceneVersion = Version.LUCENE_47;
    analyzers = new HashMap<>();
    analyzers.put("fr", new FrenchAnalyzer(luceneVersion));
    analyzers.put("nl", new DutchAnalyzer(luceneVersion));
    analyzers.put("en", new StandardAnalyzer(luceneVersion));
}
 
Example #7
Source Project: freehealth-connector   Author: taktik   File: IndexCreator.java    License: GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * {@code "fr"} (French), {@code "nl"} (Dutch) and {@code "en"} (standard).
 * All analyzers are created for the same Lucene version.
 */
private void buildAnalyzerMap() {
	final Version luceneVersion = Version.LUCENE_47;
	analyzers = new HashMap<String, Analyzer>();
	analyzers.put("fr", new FrenchAnalyzer(luceneVersion));
	analyzers.put("nl", new DutchAnalyzer(luceneVersion));
	analyzers.put("en", new StandardAnalyzer(luceneVersion));
}
 
Example #8
Source Project: icure-backend   Author: taktik   File: DrugsDAOImpl.java    License: GNU General Public License v2.0 4 votes vote down vote up
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * {@code "fr"} (French), {@code "nl"} (Dutch) and {@code "en"} (standard).
 * All analyzers are created for the same Lucene version.
 */
private void buildAnalyzerMap() {
	final Version luceneVersion = Version.LUCENE_47;
	analyzers = new HashMap<>();
	analyzers.put("fr", new FrenchAnalyzer(luceneVersion));
	analyzers.put("nl", new DutchAnalyzer(luceneVersion));
	analyzers.put("en", new StandardAnalyzer(luceneVersion));
}
 
Example #9
Source Project: icure-backend   Author: taktik   File: IndexCreator.java    License: GNU General Public License v2.0 4 votes vote down vote up
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * {@code "fr"} (French), {@code "nl"} (Dutch) and {@code "en"} (standard).
 * All analyzers are created for the same Lucene version.
 */
private void buildAnalyzerMap() {
	final Version luceneVersion = Version.LUCENE_47;
	analyzers = new HashMap<String, Analyzer>();
	analyzers.put("fr", new FrenchAnalyzer(luceneVersion));
	analyzers.put("nl", new DutchAnalyzer(luceneVersion));
	analyzers.put("en", new StandardAnalyzer(luceneVersion));
}
 
Example #10
Source Project: Elasticsearch   Author: baidu   File: FrenchAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the analyzer held by this provider.
 *
 * @return the {@link FrenchAnalyzer} stored in the {@code analyzer} field
 */
@Override
public FrenchAnalyzer get() {
    return analyzer;
}
 
Example #11
Source Project: yes-cart   Author: inspire-software   File: LuceneSearchUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Registers one {@code Analysis} holder per supported language key in
 * {@code LANGUAGE_SPECIFIC}.
 *
 * <p>Each key gets its own holder instance whose {@code initialValue()} creates a new
 * analyzer. The keys {@code "ru"} and {@code "uk"} both produce a {@link RussianAnalyzer};
 * {@code "en"} and {@code "default"} both produce a {@link StandardAnalyzer} built with
 * the English stop-word set.
 */
private static void initAnalysis() {
    final Analysis russian = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new RussianAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("ru", russian);

    // Ukrainian gets a separate holder that also yields a RussianAnalyzer.
    final Analysis ukrainian = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new RussianAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("uk", ukrainian);

    final Analysis german = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new GermanAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("de", german);

    final Analysis french = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new FrenchAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("fr", french);

    final Analysis italian = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new ItalianAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("it", italian);

    final Analysis english = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
    };
    LANGUAGE_SPECIFIC.put("en", english);

    // Entry registered under the "default" key; same analyzer configuration as "en".
    final Analysis fallback = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
    };
    LANGUAGE_SPECIFIC.put("default", fallback);
}
 
Example #12
Source Project: stratio-cassandra   Author: Stratio   File: SnowballAnalyzerBuilder.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the default stopwords set used by Lucene language analyzer for the specified language.
 *
 * <p>The lookup is case-sensitive and matches the capitalized English language name exactly;
 * any unrecognized name yields an empty set.
 *
 * @param language The language for which the stopwords are. The supported languages are English, French, Spanish,
 *                 Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish,
 *                 Irish, Hungarian, Turkish, Armenian, Basque and Catalan.
 * @return The default stopwords set used by Lucene language analyzers, or {@code CharArraySet.EMPTY_SET}
 *         if the language is not one of the supported names.
 * @throws NullPointerException if {@code language} is {@code null} (a string {@code switch} rejects null).
 */
private static CharArraySet getDefaultStopwords(String language) {
    switch (language) {
        case "English":
            return EnglishAnalyzer.getDefaultStopSet();
        case "French":
            return FrenchAnalyzer.getDefaultStopSet();
        case "Spanish":
            return SpanishAnalyzer.getDefaultStopSet();
        case "Portuguese":
            return PortugueseAnalyzer.getDefaultStopSet();
        case "Italian":
            return ItalianAnalyzer.getDefaultStopSet();
        case "Romanian":
            return RomanianAnalyzer.getDefaultStopSet();
        case "German":
            return GermanAnalyzer.getDefaultStopSet();
        case "Dutch":
            return DutchAnalyzer.getDefaultStopSet();
        case "Swedish":
            return SwedishAnalyzer.getDefaultStopSet();
        case "Norwegian":
            return NorwegianAnalyzer.getDefaultStopSet();
        case "Danish":
            return DanishAnalyzer.getDefaultStopSet();
        case "Russian":
            return RussianAnalyzer.getDefaultStopSet();
        case "Finnish":
            return FinnishAnalyzer.getDefaultStopSet();
        case "Irish":
            return IrishAnalyzer.getDefaultStopSet();
        case "Hungarian":
            return HungarianAnalyzer.getDefaultStopSet();
        case "Turkish":
            // Previously returned the Spanish stop set (copy-paste error). Fully qualified
            // so the fix does not depend on an import that may be missing from the file.
            return org.apache.lucene.analysis.tr.TurkishAnalyzer.getDefaultStopSet();
        case "Armenian":
            // Previously returned the Spanish stop set (copy-paste error). Fully qualified
            // so the fix does not depend on an import that may be missing from the file.
            return org.apache.lucene.analysis.hy.ArmenianAnalyzer.getDefaultStopSet();
        case "Basque":
            return BasqueAnalyzer.getDefaultStopSet();
        case "Catalan":
            return CatalanAnalyzer.getDefaultStopSet();
        default:
            return CharArraySet.EMPTY_SET;
    }
}
 
Example #13
Source Project: crate   Author: crate   File: CommonAnalysisPlugin.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds the list of pre-built analyzer provider factories exposed by this plugin.
 *
 * <p>Each entry pairs an analyzer name with a {@code CachingStrategy} and a supplier that
 * creates the analyzer. Three analyzers with explicit constructor arguments come first,
 * followed by the per-language analyzers, which are created through their no-argument
 * constructors.
 *
 * @return a new mutable list containing one factory per registered analyzer name, in the order added below
 */
@Override
public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactories() {
    List<PreBuiltAnalyzerProviderFactory> analyzers = new ArrayList<>();
    // The first two analyzers are built with an empty stop-word set and use CachingStrategy.ELASTICSEARCH.
    analyzers.add(new PreBuiltAnalyzerProviderFactory("standard_html_strip", CachingStrategy.ELASTICSEARCH,
        () -> new StandardHtmlStripAnalyzer(CharArraySet.EMPTY_SET)));
    // The pattern analyzer uses the regex \W+ with null flags and passes true as its second argument.
    analyzers.add(new PreBuiltAnalyzerProviderFactory("pattern", CachingStrategy.ELASTICSEARCH,
        () -> new PatternAnalyzer(Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true,
        CharArraySet.EMPTY_SET)));
    // The snowball analyzer is built for "English" with the English stop-word set.
    analyzers.add(new PreBuiltAnalyzerProviderFactory("snowball", CachingStrategy.LUCENE,
        () -> new SnowballAnalyzer("English", EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));

    // Language analyzers:
    analyzers.add(new PreBuiltAnalyzerProviderFactory("arabic", CachingStrategy.LUCENE, ArabicAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("armenian", CachingStrategy.LUCENE, ArmenianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("basque", CachingStrategy.LUCENE, BasqueAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("bengali", CachingStrategy.LUCENE, BengaliAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("brazilian", CachingStrategy.LUCENE, BrazilianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("bulgarian", CachingStrategy.LUCENE, BulgarianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("catalan", CachingStrategy.LUCENE, CatalanAnalyzer::new));
    // chinese analyzer: only for old indices, best effort
    // (it is backed by StandardAnalyzer and is the only entry using CachingStrategy.ONE)
    analyzers.add(new PreBuiltAnalyzerProviderFactory("chinese", CachingStrategy.ONE, StandardAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("cjk", CachingStrategy.LUCENE, CJKAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("czech", CachingStrategy.LUCENE, CzechAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("dutch", CachingStrategy.LUCENE, DutchAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("english", CachingStrategy.LUCENE, EnglishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("finnish", CachingStrategy.LUCENE, FinnishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("german", CachingStrategy.LUCENE, GermanAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("greek", CachingStrategy.LUCENE, GreekAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("hindi", CachingStrategy.LUCENE, HindiAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("hungarian", CachingStrategy.LUCENE, HungarianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("indonesian", CachingStrategy.LUCENE, IndonesianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("irish", CachingStrategy.LUCENE, IrishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("italian", CachingStrategy.LUCENE, ItalianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("latvian", CachingStrategy.LUCENE, LatvianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("lithuanian", CachingStrategy.LUCENE, LithuanianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("norwegian", CachingStrategy.LUCENE, NorwegianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("persian", CachingStrategy.LUCENE, PersianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("portuguese", CachingStrategy.LUCENE, PortugueseAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("romanian", CachingStrategy.LUCENE, RomanianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("russian", CachingStrategy.LUCENE, RussianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("sorani", CachingStrategy.LUCENE, SoraniAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("spanish", CachingStrategy.LUCENE, SpanishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("swedish", CachingStrategy.LUCENE, SwedishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("turkish", CachingStrategy.LUCENE, TurkishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("thai", CachingStrategy.LUCENE, ThaiAnalyzer::new));
    return analyzers;
}
 
Example #14
Source Project: crate   Author: crate   File: FrenchAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the analyzer held by this provider.
 *
 * @return the {@link FrenchAnalyzer} stored in the {@code analyzer} field
 */
@Override
public FrenchAnalyzer get() {
    return analyzer;
}