Java Code Examples for org.apache.lucene.analysis.fr.FrenchAnalyzer

The following examples show how to use org.apache.lucene.analysis.fr.FrenchAnalyzer. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Elasticsearch   Source File: FrenchAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public FrenchAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new FrenchAnalyzer(Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 2
Source Project: lucene-solr   Source File: ElisionFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Selects the article set used by this factory: the word set loaded from
 * {@code articlesFile} when one is configured, otherwise
 * {@link FrenchAnalyzer#DEFAULT_ARTICLES}.
 *
 * @param loader resource loader used to read the articles file
 * @throws IOException declared by this method; presumably raised when the file cannot be read
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
  articles = (articlesFile != null)
      ? getWordSet(loader, articlesFile, ignoreCase)
      : FrenchAnalyzer.DEFAULT_ARTICLES;
}
 
Example 3
Source Project: lucene-solr   Source File: TestElision.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that an empty input term passes through a keyword tokenizer followed by an
 * {@link ElisionFilter} (using the default French articles) and comes out as an empty term.
 *
 * <p>The analyzer is opened in a try-with-resources statement so that it is closed even
 * when {@code checkOneTerm} fails; previously a failing check skipped {@code close()}.
 *
 * @throws IOException if the term check fails with an I/O error
 */
public void testEmptyTerm() throws IOException {
  try (Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES));
    }
  }) {
    checkOneTerm(a, "", "");
  }
}
 
Example 4
Source Project: crate   Source File: CommonAnalysisPlugin.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the list of pre-configured token filters exposed by this plugin.
 *
 * <p>Each entry is created with {@code PreConfiguredTokenFilter.singleton(name, flag, factory)}
 * and appended in the order written below. The boolean flag's meaning is defined by
 * {@code PreConfiguredTokenFilter.singleton} and is not visible here
 * (NOTE(review): presumably whether the filter is also applied to multi-term queries; confirm).
 *
 * @return a new mutable list holding every registered filter
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    // Pre-configured common_grams is built with an empty common-words set.
    filters.add(PreConfiguredTokenFilter.singleton(
        "common_grams",
        false,
        input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    // "delimited_payload_filter" and "delimited_payload" register identically configured filters
    // under two names.
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
    // NOTE(review): the second and third constructor arguments are the factory's SIDE_FRONT and
    // SIDE_BACK constants; confirm these are the intended values for those parameters.
    filters.add(PreConfiguredTokenFilter.singleton(
        "edge_ngram",
        false,
        input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    // Elision uses the default French article set.
    filters.add(PreConfiguredTokenFilter.singleton(
        "elision",
        true,
        input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))
    );
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    // Length bounds of 0..Integer.MAX_VALUE accept every token length.
    filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
        new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton(
        "limit",
        false,
        input -> new LimitTokenCountFilter(
            input,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)
        )
    );
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /*
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    // The pre-configured "snowball" filter uses the English stemmer.
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    // "stemmer" is registered with the same PorterStemFilter factory as "porter_stem" above.
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    // Truncation length is fixed at 10 for the pre-configured variant.
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    // Both word-delimiter variants enable the same five flags and pass null as the last argument.
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
            new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
            new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
 
Example 5
Source Project: crate   Source File: FrenchAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
FrenchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new FrenchAnalyzer(
        Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 6
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * "fr" (French), "nl" (Dutch) and "en" (standard), all built for Lucene 4.7.
 */
private void buildAnalyzerMap() {
    analyzers = new HashMap<>();
    // All three analyzers are created against the same Lucene compatibility version.
    final Version matchVersion = Version.LUCENE_47;
    analyzers.put("fr", new FrenchAnalyzer(matchVersion));
    analyzers.put("nl", new DutchAnalyzer(matchVersion));
    analyzers.put("en", new StandardAnalyzer(matchVersion));
}
 
Example 7
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * "fr" (French), "nl" (Dutch) and "en" (standard), all built for Lucene 4.7.
 */
private void buildAnalyzerMap() {
	analyzers = new HashMap<String, Analyzer>();
	// All three analyzers are created against the same Lucene compatibility version.
	final Version matchVersion = Version.LUCENE_47;
	analyzers.put("fr", new FrenchAnalyzer(matchVersion));
	analyzers.put("nl", new DutchAnalyzer(matchVersion));
	analyzers.put("en", new StandardAnalyzer(matchVersion));
}
 
Example 8
Source Project: icure-backend   Source File: DrugsDAOImpl.java    License: GNU General Public License v2.0 4 votes vote down vote up
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * "fr" (French), "nl" (Dutch) and "en" (standard), all built for Lucene 4.7.
 */
private void buildAnalyzerMap() {
	analyzers = new HashMap<>();
	// All three analyzers are created against the same Lucene compatibility version.
	final Version matchVersion = Version.LUCENE_47;
	analyzers.put("fr", new FrenchAnalyzer(matchVersion));
	analyzers.put("nl", new DutchAnalyzer(matchVersion));
	analyzers.put("en", new StandardAnalyzer(matchVersion));
}
 
Example 9
Source Project: icure-backend   Source File: IndexCreator.java    License: GNU General Public License v2.0 4 votes vote down vote up
/**
 * Replaces {@code analyzers} with a fresh map from language code to analyzer:
 * "fr" (French), "nl" (Dutch) and "en" (standard), all built for Lucene 4.7.
 */
private void buildAnalyzerMap() {
	analyzers = new HashMap<String, Analyzer>();
	// All three analyzers are created against the same Lucene compatibility version.
	final Version matchVersion = Version.LUCENE_47;
	analyzers.put("fr", new FrenchAnalyzer(matchVersion));
	analyzers.put("nl", new DutchAnalyzer(matchVersion));
	analyzers.put("en", new StandardAnalyzer(matchVersion));
}
 
Example 10
Source Project: Elasticsearch   Source File: FrenchAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the analyzer held by this provider.
 *
 * @return the value of the {@code analyzer} field
 */
@Override
public FrenchAnalyzer get() {
    return analyzer;
}
 
Example 11
Source Project: yes-cart   Source File: LuceneSearchUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Fills {@code LANGUAGE_SPECIFIC} with one {@code Analysis} holder per language key.
 *
 * <p>Each holder overrides {@code initialValue()} to create a new analyzer on demand.
 * "ru" and "uk" both produce a {@link RussianAnalyzer}; "en" and "default" both produce a
 * {@link StandardAnalyzer} over the English stop-word set. Every key gets its own holder
 * instance.
 */
private static void initAnalysis() {
    final Analysis russian = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new RussianAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("ru", russian);

    // Ukrainian is served by the Russian analyzer as well.
    final Analysis ukrainian = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new RussianAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("uk", ukrainian);

    final Analysis german = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new GermanAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("de", german);

    final Analysis french = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new FrenchAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("fr", french);

    final Analysis italian = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new ItalianAnalyzer();
        }
    };
    LANGUAGE_SPECIFIC.put("it", italian);

    final Analysis english = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
    };
    LANGUAGE_SPECIFIC.put("en", english);

    // Fallback entry registered under the "default" key; configured like the English one.
    final Analysis fallback = new Analysis() {
        @Override
        protected Analyzer initialValue() {
            return new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        }
    };
    LANGUAGE_SPECIFIC.put("default", fallback);
}
 
Example 12
Source Project: stratio-cassandra   Source File: SnowballAnalyzerBuilder.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the default stopwords set used by Lucene language analyzer for the specified language.
 *
 * <p>Language names are matched exactly (case-sensitive). Any name not listed below yields
 * {@code CharArraySet.EMPTY_SET}. A {@code null} language is not handled and fails with a
 * {@link NullPointerException} from the {@code switch}.
 *
 * @param language The language for which the stopwords are. The supported languages are English, French, Spanish,
 *                 Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish,
 *                 Irish, Hungarian, Turkish, Armenian, Basque and Catalan.
 * @return The default stopwords set used by Lucene language analyzers.
 */
private static CharArraySet getDefaultStopwords(String language) {
    switch (language) {
        case "English":
            return EnglishAnalyzer.getDefaultStopSet();
        case "French":
            return FrenchAnalyzer.getDefaultStopSet();
        case "Spanish":
            return SpanishAnalyzer.getDefaultStopSet();
        case "Portuguese":
            return PortugueseAnalyzer.getDefaultStopSet();
        case "Italian":
            return ItalianAnalyzer.getDefaultStopSet();
        case "Romanian":
            return RomanianAnalyzer.getDefaultStopSet();
        case "German":
            return GermanAnalyzer.getDefaultStopSet();
        case "Dutch":
            return DutchAnalyzer.getDefaultStopSet();
        case "Swedish":
            return SwedishAnalyzer.getDefaultStopSet();
        case "Norwegian":
            return NorwegianAnalyzer.getDefaultStopSet();
        case "Danish":
            return DanishAnalyzer.getDefaultStopSet();
        case "Russian":
            return RussianAnalyzer.getDefaultStopSet();
        case "Finnish":
            return FinnishAnalyzer.getDefaultStopSet();
        case "Irish":
            return IrishAnalyzer.getDefaultStopSet();
        case "Hungarian":
            return HungarianAnalyzer.getDefaultStopSet();
        case "Turkish":
            // Previously returned the Spanish stop set (copy-paste error). Fully qualified so the
            // fix does not depend on the file's import list.
            return org.apache.lucene.analysis.tr.TurkishAnalyzer.getDefaultStopSet();
        case "Armenian":
            // Previously returned the Spanish stop set (copy-paste error). Fully qualified so the
            // fix does not depend on the file's import list.
            return org.apache.lucene.analysis.hy.ArmenianAnalyzer.getDefaultStopSet();
        case "Basque":
            return BasqueAnalyzer.getDefaultStopSet();
        case "Catalan":
            return CatalanAnalyzer.getDefaultStopSet();
        default:
            return CharArraySet.EMPTY_SET;
    }
}
 
Example 13
Source Project: crate   Source File: CommonAnalysisPlugin.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds the list of pre-built analyzer provider factories exposed by this plugin.
 *
 * <p>Each entry pairs an analyzer name with a {@code CachingStrategy} and a supplier that
 * creates the analyzer. Entries are appended in the order written below. The semantics of
 * the individual caching strategies are defined by {@code CachingStrategy} and are not
 * visible here.
 *
 * @return a new mutable list holding every registered factory
 */
@Override
public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactories() {
    List<PreBuiltAnalyzerProviderFactory> analyzers = new ArrayList<>();
    // Non-language analyzers. The first two use CachingStrategy.ELASTICSEARCH; "snowball" uses LUCENE.
    analyzers.add(new PreBuiltAnalyzerProviderFactory("standard_html_strip", CachingStrategy.ELASTICSEARCH,
        () -> new StandardHtmlStripAnalyzer(CharArraySet.EMPTY_SET)));
    // The pattern analyzer splits on the regex \W+, lower-cases (second argument true),
    // and uses an empty stop-word set.
    // NOTE(review): "lower-cases" is inferred from the argument position; confirm against PatternAnalyzer.
    analyzers.add(new PreBuiltAnalyzerProviderFactory("pattern", CachingStrategy.ELASTICSEARCH,
        () -> new PatternAnalyzer(Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true,
        CharArraySet.EMPTY_SET)));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("snowball", CachingStrategy.LUCENE,
        () -> new SnowballAnalyzer("English", EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));

    // Language analyzers:
    analyzers.add(new PreBuiltAnalyzerProviderFactory("arabic", CachingStrategy.LUCENE, ArabicAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("armenian", CachingStrategy.LUCENE, ArmenianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("basque", CachingStrategy.LUCENE, BasqueAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("bengali", CachingStrategy.LUCENE, BengaliAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("brazilian", CachingStrategy.LUCENE, BrazilianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("bulgarian", CachingStrategy.LUCENE, BulgarianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("catalan", CachingStrategy.LUCENE, CatalanAnalyzer::new));
    // chinese analyzer: only for old indices, best effort
    // (registered with StandardAnalyzer and CachingStrategy.ONE, unlike the entries around it)
    analyzers.add(new PreBuiltAnalyzerProviderFactory("chinese", CachingStrategy.ONE, StandardAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("cjk", CachingStrategy.LUCENE, CJKAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("czech", CachingStrategy.LUCENE, CzechAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("dutch", CachingStrategy.LUCENE, DutchAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("english", CachingStrategy.LUCENE, EnglishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("finnish", CachingStrategy.LUCENE, FinnishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("german", CachingStrategy.LUCENE, GermanAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("greek", CachingStrategy.LUCENE, GreekAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("hindi", CachingStrategy.LUCENE, HindiAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("hungarian", CachingStrategy.LUCENE, HungarianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("indonesian", CachingStrategy.LUCENE, IndonesianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("irish", CachingStrategy.LUCENE, IrishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("italian", CachingStrategy.LUCENE, ItalianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("latvian", CachingStrategy.LUCENE, LatvianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("lithuanian", CachingStrategy.LUCENE, LithuanianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("norwegian", CachingStrategy.LUCENE, NorwegianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("persian", CachingStrategy.LUCENE, PersianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("portuguese", CachingStrategy.LUCENE, PortugueseAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("romanian", CachingStrategy.LUCENE, RomanianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("russian", CachingStrategy.LUCENE, RussianAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("sorani", CachingStrategy.LUCENE, SoraniAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("spanish", CachingStrategy.LUCENE, SpanishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("swedish", CachingStrategy.LUCENE, SwedishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("turkish", CachingStrategy.LUCENE, TurkishAnalyzer::new));
    analyzers.add(new PreBuiltAnalyzerProviderFactory("thai", CachingStrategy.LUCENE, ThaiAnalyzer::new));
    return analyzers;
}
 
Example 14
Source Project: crate   Source File: FrenchAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the analyzer held by this provider.
 *
 * @return the value of the {@code analyzer} field
 */
@Override
public FrenchAnalyzer get() {
    return analyzer;
}