Java Code Examples for org.elasticsearch.index.analysis.Analysis

The following examples show how to use org.elasticsearch.index.analysis.Analysis. They are extracted from open source projects; the source project, file, and license are listed above each example where available.
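Most of the examples below follow the same pattern: an analyzer provider or token filter factory reads its configuration through the static helpers on Analysis (parseStopWords, parseStemExclusion, getWordList, getWordSet) and hands the results to a Lucene analyzer or filter. The sketch below distills that pattern for a German analyzer. It is illustrative rather than taken from any of the projects above: the class name is made up, the setting keys mentioned in the comments (stopwords, stopwords_path, stem_exclusion) are the ones these helpers are commonly documented to read, and the import locations assume a recent Lucene/Elasticsearch (older Lucene versions keep CharArraySet under org.apache.lucene.analysis.util).

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.Analysis;

// Hypothetical helper, not part of Elasticsearch: shows the recurring Analysis usage pattern.
final class GermanAnalyzerSketch {

    static GermanAnalyzer build(Environment env, Settings analyzerSettings) {
        // Custom stop words from `stopwords`/`stopwords_path` if configured (assumed setting keys),
        // otherwise the analyzer's bundled defaults.
        CharArraySet stopWords =
            Analysis.parseStopWords(env, analyzerSettings, GermanAnalyzer.getDefaultStopSet());
        // Words excluded from stemming, read from `stem_exclusion` (assumed setting key);
        // empty when the setting is absent.
        CharArraySet stemExclusion =
            Analysis.parseStemExclusion(analyzerSettings, CharArraySet.EMPTY_SET);
        return new GermanAnalyzer(stopWords, stemExclusion);
    }
}

Each language-specific provider in the examples (Arabic, Basque, German, and so on) is essentially this pattern with a different Lucene analyzer class.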
Example 1
Source Project: Elasticsearch   Source File: MoreLikeThisQueryParser.java    License: Apache License 2.0
private static List<String> removeUnsupportedFields(List<String> moreLikeFields, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
    for (Iterator<String> it = moreLikeFields.iterator(); it.hasNext(); ) {
        final String fieldName = it.next();
        if (!Analysis.generatesCharacterTokenStream(analyzer, fieldName)) {
            if (failOnUnsupportedField) {
                throw new IllegalArgumentException("more_like_this doesn't support binary/numeric fields: [" + fieldName + "]");
            } else {
                it.remove();
            }
        }
    }
    return moreLikeFields;
}
 
Example 2
public AbstractCompoundWordTokenFilterFactory(Index index, Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);

    minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
    minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
    maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
    onlyLongestMatch = settings.getAsBoolean("only_longest_match", false);
    wordList = Analysis.getWordSet(env, settings, "word_list");
    if (wordList == null) {
        throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
    }
}
 
Example 3
public WordDelimiterFilterFactory(IndexSettings indexSettings, Environment environment, String name,
                                  Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(environment, settings, "type_table");
    if (charTypeTableValues == null) {
        this.typeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.typeTable = parseTypes(charTypeTableValues);
    }

    // If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If 1, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, this is the set of tokens to protect from being delimited
    List<String> protoWords = Analysis.getWordList(environment, settings, "protected_words");
    protectedWords = protoWords == null ? null : new HashSet<>(protoWords);
}
 
Example 4
Source Project: crate   Source File: RomanianAnalyzerProvider.java    License: Apache License 2.0
RomanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new RomanianAnalyzer(
        Analysis.parseStopWords(env, settings, RomanianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 5
Source Project: crate   Source File: BasqueAnalyzerProvider.java    License: Apache License 2.0
BasqueAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BasqueAnalyzer(
        Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 6
Source Project: crate   Source File: StandardHtmlStripAnalyzerProvider.java    License: Apache License 2.0
StandardHtmlStripAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    analyzer = new StandardHtmlStripAnalyzer(stopWords);
    analyzer.setVersion(version);
}
 
Example 7
Source Project: crate   Source File: IndonesianAnalyzerProvider.java    License: Apache License 2.0
IndonesianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new IndonesianAnalyzer(
        Analysis.parseStopWords(env, settings, IndonesianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 8
Source Project: crate   Source File: ArabicAnalyzerProvider.java    License: Apache License 2.0
ArabicAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    arabicAnalyzer = new ArabicAnalyzer(
        Analysis.parseStopWords(env, settings, ArabicAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    arabicAnalyzer.setVersion(version);
}
 
Example 9
Source Project: crate   Source File: SnowballAnalyzerProvider.java    License: Apache License 2.0
SnowballAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    String language = settings.get("language", settings.get("name", "English"));
    CharArraySet defaultStopwords = DEFAULT_LANGUAGE_STOPWORDS.getOrDefault(language, CharArraySet.EMPTY_SET);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

    analyzer = new SnowballAnalyzer(language, stopWords);
    analyzer.setVersion(version);
}
 
Example 10
Source Project: crate   Source File: SwedishAnalyzerProvider.java    License: Apache License 2.0
SwedishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new SwedishAnalyzer(
        Analysis.parseStopWords(env, settings, SwedishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 11
Source Project: crate   Source File: StemmerOverrideTokenFilterFactory.java    License: Apache License 2.0
StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
    super(indexSettings, name, settings);

    List<String> rules = Analysis.getWordList(env, settings, "rules");
    if (rules == null) {
        throw new IllegalArgumentException("stemmer override filter requires either `rules` or `rules_path` to be configured");
    }

    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    parseRules(rules, builder, "=>");
    overrideMap = builder.build();

}
 
Example 12
Source Project: crate   Source File: SpanishAnalyzerProvider.java    License: Apache License 2.0
SpanishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new SpanishAnalyzer(
        Analysis.parseStopWords(env, settings, SpanishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 13
Source Project: crate   Source File: CommonGramsTokenFilterFactory.java    License: Apache License 2.0
CommonGramsTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.queryMode = settings.getAsBoolean("query_mode", false);
    this.words = Analysis.parseCommonWords(env, settings, null, ignoreCase);

    if (this.words == null) {
        throw new IllegalArgumentException(
                "missing or empty [common_words] or [common_words_path] configuration for common_grams token filter");
    }
}
 
Example 14
Source Project: crate   Source File: MappingCharFilterFactory.java    License: Apache License 2.0
MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name);

    List<String> rules = Analysis.getWordList(env, settings, "mappings");
    if (rules == null) {
        throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
    }

    NormalizeCharMap.Builder normMapBuilder = new NormalizeCharMap.Builder();
    parseRules(rules, normMapBuilder);
    normMap = normMapBuilder.build();
}
 
Example 15
Source Project: crate   Source File: PatternAnalyzerProvider.java    License: Apache License 2.0
PatternAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    final CharArraySet defaultStopwords = CharArraySet.EMPTY_SET;
    boolean lowercase = settings.getAsBoolean("lowercase", true);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

    String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (sPattern == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }
    Pattern pattern = Regex.compile(sPattern, settings.get("flags"));

    analyzer = new PatternAnalyzer(pattern, lowercase, stopWords);
}
 
Example 16
Source Project: crate   Source File: WordDelimiterTokenFilterFactory.java    License: Apache License 2.0
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
        String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, this is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}
 
Example 17
Source Project: crate   Source File: PortugueseAnalyzerProvider.java    License: Apache License 2.0
PortugueseAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new PortugueseAnalyzer(
        Analysis.parseStopWords(env, settings, PortugueseAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 18
Source Project: crate   Source File: DanishAnalyzerProvider.java    License: Apache License 2.0
DanishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new DanishAnalyzer(
        Analysis.parseStopWords(env, settings, DanishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 19
Source Project: crate   Source File: ArmenianAnalyzerProvider.java    License: Apache License 2.0
ArmenianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ArmenianAnalyzer(
        Analysis.parseStopWords(env, settings, ArmenianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 20
Source Project: crate   Source File: CjkAnalyzerProvider.java    License: Apache License 2.0
CjkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    CharArraySet stopWords = Analysis.parseStopWords(
        env, settings, CJKAnalyzer.getDefaultStopSet());

    analyzer = new CJKAnalyzer(stopWords);
    analyzer.setVersion(version);
}
 
Example 21
Source Project: crate   Source File: GalicianAnalyzerProvider.java    License: Apache License 2.0
GalicianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new GalicianAnalyzer(
        Analysis.parseStopWords(env, settings, GalicianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 22
Source Project: crate   Source File: GermanAnalyzerProvider.java    License: Apache License 2.0
GermanAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new GermanAnalyzer(
        Analysis.parseStopWords(env, settings, GermanAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 23
Source Project: crate   Source File: EnglishAnalyzerProvider.java    License: Apache License 2.0
EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new EnglishAnalyzer(
        Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 24
Source Project: crate   Source File: BrazilianAnalyzerProvider.java    License: Apache License 2.0
BrazilianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BrazilianAnalyzer(
        Analysis.parseStopWords(env, settings, BrazilianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 25
protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    minWordSize = settings.getAsInt("min_word_size", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
    minSubwordSize = settings.getAsInt("min_subword_size", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
    maxSubwordSize = settings.getAsInt("max_subword_size", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
    onlyLongestMatch = settings.getAsBoolean("only_longest_match", false);
    wordList = Analysis.getWordSet(env, settings, "word_list");
    if (wordList == null) {
        throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
    }
}
 
Example 26
Source Project: crate   Source File: ItalianAnalyzerProvider.java    License: Apache License 2.0
ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ItalianAnalyzer(
        Analysis.parseStopWords(env, settings, ItalianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 27
Source Project: crate   Source File: BengaliAnalyzerProvider.java    License: Apache License 2.0
BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BengaliAnalyzer(
        Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 28
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, this is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
    this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
}
 
Example 29
Source Project: crate   Source File: FrenchAnalyzerProvider.java    License: Apache License 2.0
FrenchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new FrenchAnalyzer(
        Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}
 
Example 30
Source Project: crate   Source File: HungarianAnalyzerProvider.java    License: Apache License 2.0
HungarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new HungarianAnalyzer(
        Analysis.parseStopWords(env, settings, HungarianAnalyzer.getDefaultStopSet()),
        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
    );
    analyzer.setVersion(version);
}