org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator Java Examples

The following examples show how to use org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator. They are extracted from open source projects; the source file and license are noted above each example.
Example #1
Source File: WordDelimiterTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
/**
 * parses a list of MappingCharFilter style rules into a custom byte[] type table
 */
private byte[] parseTypes(Collection<String> rules) {
    SortedMap<Character, Byte> typeMap = new TreeMap<>();
    for (String rule : rules) {
        Matcher m = typePattern.matcher(rule);
        if (!m.find())
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
        String lhs = parseString(m.group(1).trim());
        Byte rhs = parseType(m.group(2).trim());
        if (lhs.length() != 1)
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
        if (rhs == null)
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        typeMap.put(lhs.charAt(0), rhs);
    }

    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
    byte[] types = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
    for (int i = 0; i < types.length; i++)
        types[i] = WordDelimiterIterator.getType(i);
    for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
        types[mapping.getKey()] = mapping.getValue();
    return types;
}
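
The method above relies on a typePattern field and on parseString/parseType helpers that are not part of the snippet. Below is a minimal sketch of what the pattern and the type lookup might look like, assuming the "character => TYPE" rule format and the character-type constants exposed by Lucene's WordDelimiterFilter (LOWER, UPPER, ALPHA, DIGIT, ALPHANUM, SUBWORD_DELIM); the member names are taken from the call sites above, everything else is an assumption rather than the verbatim Elasticsearch source.

import java.util.regex.Pattern;

import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;

// Hypothetical reconstruction of the members used by parseTypes above;
// the exact regex and mapping in the real source may differ.
private static final Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

// Maps the right-hand side of a rule (e.g. "DIGIT") to the matching
// WordDelimiterFilter character-type constant, or null if the name is unknown.
private static Byte parseType(String s) {
    switch (s) {
        case "LOWER":         return (byte) WordDelimiterFilter.LOWER;
        case "UPPER":         return (byte) WordDelimiterFilter.UPPER;
        case "ALPHA":         return (byte) WordDelimiterFilter.ALPHA;
        case "DIGIT":         return (byte) WordDelimiterFilter.DIGIT;
        case "ALPHANUM":      return (byte) WordDelimiterFilter.ALPHANUM;
        case "SUBWORD_DELIM": return (byte) WordDelimiterFilter.SUBWORD_DELIM;
        default:              return null;
    }
}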
 
Example #2
Source File: WordDelimiterTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
@Inject
public WordDelimiterTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}
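
The constructor only stores charTypeTable, flags and protoWords; the filter itself is built later. A plausible sketch of the factory's create method, assuming the (deprecated) WordDelimiterFilter constructor that takes (TokenStream, byte[], int, CharArraySet):

// Hypothetical create() implementation; the WordDelimiterFilter constructor
// signature used here is an assumption about how the stored fields are consumed.
@Override
public TokenStream create(TokenStream tokenStream) {
    return new WordDelimiterFilter(tokenStream, charTypeTable, flags, protoWords);
}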
 
Example #3
Source File: WordDelimiterFilter2Factory.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
private byte[] parseTypes(List<String> rules) {
    SortedMap<Character, Byte> typeMap = new TreeMap<>();
    for (String rule : rules) {
        Matcher m = typePattern.matcher(rule);
        if (!m.find()) {
            throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]");
        }
        String lhs = parseString(m.group(1).trim());
        Byte rhs = parseType(m.group(2).trim());
        if (lhs.length() != 1) {
            throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
        }
        if (rhs == null) {
            throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        }
        typeMap.put(lhs.charAt(0), rhs);
    }

    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
    byte[] types = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
    for (int i = 0; i < types.length; i++) {
        types[i] = WordDelimiterIterator.getType(i);
    }
    for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) {
        types[mapping.getKey()] = mapping.getValue();
    }
    return types;
}
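
For illustration, the rules passed to this method come from the type_table setting and use the "character => TYPE" format shown in the factory constructors; a hypothetical input could look like this:

import java.util.Arrays;
import java.util.List;

// Hypothetical type_table rules; escaped forms such as \u002C are decoded by the
// parseString helper (not shown) before the single-character check.
List<String> rules = Arrays.asList(
        "$ => DIGIT",
        "% => DIGIT",
        "\\u002C => DIGIT",    // treat the comma as a digit character
        "\\u200D => ALPHANUM"  // treat the zero-width joiner as alphanumeric
);
byte[] table = parseTypes(rules);
// Characters without an explicit rule keep the default returned by WordDelimiterIterator.getType(...)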
 
Example #4
Source File: WordDelimiterTokenFilterFactory.java    From crate with Apache License 2.0
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
        String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}
 
Example #5
Source File: WordDelimiterTokenFilterFactory.java    From crate with Apache License 2.0
/**
 * parses a list of MappingCharFilter style rules into a custom byte[] type table
 */
static byte[] parseTypes(Collection<String> rules) {
    SortedMap<Character, Byte> typeMap = new TreeMap<>();
    for (String rule : rules) {
        Matcher m = typePattern.matcher(rule);
        if (!m.find())
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
        String lhs = parseString(m.group(1).trim());
        Byte rhs = parseType(m.group(2).trim());
        if (lhs.length() != 1)
            throw new RuntimeException("Invalid Mapping Rule : ["
                    + rule + "]. Only a single character is allowed.");
        if (rhs == null)
            throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
        typeMap.put(lhs.charAt(0), rhs);
    }

    // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
    byte[] types = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
    for (int i = 0; i < types.length; i++) {
        types[i] = WordDelimiterIterator.getType(i);
    }
    for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) {
        types[mapping.getKey()] = mapping.getValue();
    }
    return types;
}
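
The parseString helper referenced by these parseTypes variants decodes escape sequences on the left-hand side of a rule so that characters such as U+002C (comma) or U+200D (zero-width joiner) can be named in a settings file. Here is a minimal sketch that handles only backslash-u escapes and single-character backslash escapes; the real helper in the Elasticsearch and crate sources likely supports more:

// Hypothetical reconstruction of parseString; only a subset of escapes is handled.
private static String parseString(String s) {
    StringBuilder out = new StringBuilder();
    for (int i = 0; i < s.length(); i++) {
        char c = s.charAt(i);
        if (c == '\\' && i + 1 < s.length()) {
            char next = s.charAt(++i);
            if (next == 'u' && i + 4 < s.length()) {
                // decode a backslash-u escape into the corresponding character
                out.append((char) Integer.parseInt(s.substring(i + 1, i + 5), 16));
                i += 4;
            } else {
                out.append(next); // e.g. an escaped backslash
            }
        } else {
            out.append(c);
        }
    }
    return out.toString();
}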
 
Example #6
Source File: WordDelimiterGraphTokenFilterFactory.java    From crate with Apache License 2.0
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null, the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
    this.adjustOffsets = settings.getAsBoolean("adjust_offsets", true);
}
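
Compared with Example #4, this factory additionally reads adjust_offsets and stores it in adjustOffsets. A plausible sketch of its create method, assuming a WordDelimiterGraphFilter constructor that takes (TokenStream, boolean adjustInternalOffsets, byte[], int, CharArraySet):

// Hypothetical create() implementation; the five-argument WordDelimiterGraphFilter
// constructor used here is an assumption.
@Override
public TokenStream create(TokenStream tokenStream) {
    return new WordDelimiterGraphFilter(tokenStream, adjustOffsets, charTypeTable, flags, protoWords);
}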
 
Example #7
Source File: WordDelimiterFilter2Factory.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public WordDelimiterFilter2Factory(IndexSettings indexSettings, Environment environment, String name,
                                   Settings settings) {
    super(indexSettings, name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(environment, settings, "type_table");
    if (charTypeTableValues == null) {
        this.typeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.typeTable = parseTypes(charTypeTableValues);
    }

    // If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If 1, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If 1, causes generated subwords to be placed at the same position; otherwise each takes a new position
    flags |= getFlag(ALL_PARTS_AT_SAME_POSITION, settings, "all_parts_at_same_position", false);
    // If not null, the set of tokens to protect from being delimited
    List<String> protoWords = Analysis.getWordList(environment, settings, "protected_words");
    protectedWords = protoWords == null ? null : new HashSet<>(protoWords);
}
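
To observe the effect of these flags outside of any Elasticsearch factory, here is a self-contained sketch that runs a whitespace-tokenized string through Lucene's WordDelimiterGraphFilter with its default type table; the three-argument constructor and the flag constants are assumptions based on recent Lucene releases. With these flags, "PowerShot" should split on the case change and "500-42" on the hyphen, as the comments in the factories above indicate.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WordDelimiterDemo {
    public static void main(String[] args) throws Exception {
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot 500-42"));
        // The (TokenStream, int, CharArraySet) constructor is assumed; it should use
        // WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE internally.
        TokenStream stream = new WordDelimiterGraphFilter(tokenizer, flags, null);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // e.g. Power, Shot, 500, 42
        }
        stream.end();
        stream.close();
    }
}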