org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator Java Exaples

Source File: WordDelimiterTokenFilterFactory.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * parses a list of MappingCharFilter style rules into a custom byte[] type table
 */
private byte[] parseTypes(Collection<String> rules) {
    SortedMap<Character, Byte> typeMap = new TreeMap<>();
    for (String rule : rules) {
        Matcher m = typePattern.matcher(rule);
        if (!m.find())
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
        String lhs = parseString(m.group(1).trim());
        Byte rhs = parseType(m.group(2).trim());
        if (lhs.length() != 1)
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
        if (rhs == null)
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        typeMap.put(lhs.charAt(0), rhs);
    }

    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
    byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
    for (int i = 0; i < types.length; i++)
        types[i] = WordDelimiterIterator.getType(i);
    for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
        types[mapping.getKey()] = mapping.getValue();
    return types;
}

Source File: WordDelimiterTokenFilterFactory.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public WordDelimiterTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}

Source File: WordDelimiterFilter2Factory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

private byte[] parseTypes(List<String> rules) {
    SortedMap<Character, Byte> typeMap = new TreeMap<>();
    for (String rule : rules) {
        Matcher m = typePattern.matcher(rule);
        if (!m.find()) {
            throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]");
        }
        String lhs = parseString(m.group(1).trim());
        Byte rhs = parseType(m.group(2).trim());
        if (lhs.length() != 1) {
            throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
        }
        if (rhs == null) {
            throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        }
        typeMap.put(lhs.charAt(0), rhs);
    }

    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
    byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
    for (int i = 0; i < types.length; i++) {
        types[i] = WordDelimiterIterator.getType(i);
    }
    for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) {
        types[mapping.getKey()] = mapping.getValue();
    }
    return types;
}

Source File: WordDelimiterTokenFilterFactory.java From crate with Apache License 2.0

5 votes

public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
        String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}

Source File: WordDelimiterTokenFilterFactory.java From crate with Apache License 2.0

5 votes

/**
 * parses a list of MappingCharFilter style rules into a custom byte[] type table
 */
static byte[] parseTypes(Collection<String> rules) {
    SortedMap<Character, Byte> typeMap = new TreeMap<>();
    for (String rule : rules) {
        Matcher m = typePattern.matcher(rule);
        if (!m.find())
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
        String lhs = parseString(m.group(1).trim());
        Byte rhs = parseType(m.group(2).trim());
        if (lhs.length() != 1)
            throw new RuntimeException("Invalid Mapping Rule : ["
                    + rule + "]. Only a single character is allowed.");
        if (rhs == null)
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        typeMap.put(lhs.charAt(0), rhs);
    }

    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
    byte[] types = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
    for (int i = 0; i < types.length; i++) {
        types[i] = WordDelimiterIterator.getType(i);
    }
    for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) {
        types[mapping.getKey()] = mapping.getValue();
    }
    return types;
}

Source File: WordDelimiterGraphTokenFilterFactory.java From crate with Apache License 2.0

5 votes

public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
    this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
}

Source File: WordDelimiterFilter2Factory.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

4 votes

public WordDelimiterFilter2Factory(IndexSettings indexSettings, Environment environment, String name,
                                   Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(environment, settings, "type_table");
    if (charTypeTableValues == null) {
        this.typeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.typeTable = parseTypes(charTypeTableValues);
    }

    // If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If 1, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If 1, causes generated subwords to stick at the same position, they otherwise take a new position
    flags |= getFlag(ALL_PARTS_AT_SAME_POSITION, settings, "all_parts_at_same_position", false);
    // If not null is the set of tokens to protect from being delimited
    List<String> protoWords = Analysis.getWordList(environment, settings, "protected_words");
    protectedWords = protoWords == null ? null : new HashSet<>(protoWords);
}

org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator Java Examples