org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter, drawn from several open-source projects. The project, source file, and license are noted above each example.
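Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name, input string, and printed output are illustrative) showing the filter folding accented terms to their ASCII equivalents. Class and package names follow recent Lucene releases:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FoldingSketch {
    public static void main(String[] args) throws Exception {
        // whitespace-tokenize the input, then fold each term to ASCII
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("crème brûlée"));
        try (TokenStream stream = new ASCIIFoldingFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // prints "creme", then "brulee"
            }
            stream.end();
        }
    }
}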
Example #1
Source File: NGramTokenFilterTest.java    From lucene-solr with Apache License 2.0
public void testInvalidOffsets() throws Exception {
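  // ASCIIFoldingFilter folds 'æ' to "ae", lengthening the token, so the
  // bigram offsets cannot be mapped back into the original input; the test
  // verifies that every bigram keeps the whole token's offsets (0, 11)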
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new NGramTokenFilter(filters, 2, 2, false);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    {    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
      new int[]    {   11,   11,   11,   11,   11,   11,   11,   11,   11,   11,   11 },
      new int[]    {    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 });
  analyzer.close();
}
 
Example #2
Source File: LumongoSegment.java    From lumongo with Apache License 2.0
private static String getFoldedString(String text) {
	char[] textChar = text.toCharArray();
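	// per the foldToASCII contract, the output buffer must be able to hold
	// up to four chars per input char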
	char[] output = new char[textChar.length * 4];
	int outputPos = ASCIIFoldingFilter.foldToASCII(textChar, 0, output, 0, textChar.length);
	text = new String(output, 0, outputPos);
	return text;
}
 
Example #3
Source File: ASCIIFoldingTokenFilterFactory.java    From crate with Apache License 2.0
@Override
public Object getMultiTermComponent() {
    if (preserveOriginal == false) {
        return this;
    } else {
        // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
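        // (multi-term queries such as prefix and wildcard expect a single
        // token and cannot handle the stacked tokens that preserve_original
        // emits, so folding without the original is forced here)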
        return new TokenFilterFactory() {

            @Override
            public String name() {
                return ASCIIFoldingTokenFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
}
 
Example #4
Source File: MtasBasicParser.java    From mtas with Apache License 2.0
/**
 * Compute filtered prefixed value.
 *
 * @param type the type
 * @param value the value
 * @param filter the filter
 * @param prefix the prefix
 * @return the string
 * @throws MtasConfigException the mtas config exception
 */
private String computeFilteredPrefixedValue(String type, String value,
    String filter, String prefix) throws MtasConfigException {
  String localValue = value;
  // apply each comma-separated filter to the value in turn
  if (filter != null) {
    String[] filters = filter.split(",");
    for (String item : filters) {
      if (item.trim().equals(MAPPING_FILTER_UPPERCASE)) {
        localValue = localValue == null ? null : localValue.toUpperCase();
      } else if (item.trim().equals(MAPPING_FILTER_LOWERCASE)) {
        localValue = localValue == null ? null : localValue.toLowerCase();
      } else if (item.trim().equals(MAPPING_FILTER_ASCII)) {
        if (localValue != null) {
          char[] old = localValue.toCharArray();
          char[] ascii = new char[4 * old.length];
          // use the length returned by foldToASCII; keeping the whole
          // oversized buffer would leave trailing NUL chars in the string
          int length = ASCIIFoldingFilter.foldToASCII(old, 0, ascii, 0,
              localValue.length());
          localValue = new String(ascii, 0, length);
        }
      } else if (item.trim()
          .matches(Pattern.quote(MAPPING_FILTER_SPLIT) + "\\([0-9\\-]+\\)")) {
        if (!type.equals(MtasParserMapping.PARSER_TYPE_TEXT_SPLIT)) {
          throw new MtasConfigException(
              "split filter not allowed for " + type);
        }
      } else {
        throw new MtasConfigException(
            "unknown filter " + item + " for value " + localValue);
      }
    }
  }
  if (localValue != null && prefix != null) {
    localValue = prefix + localValue;
  }
  return localValue;
}
 
Example #5
Source File: DefaultLuceneQueryBuilder.java    From javaee-lab with Apache License 2.0
/**
 * Apply the same folding as the "custom" analyzer. Lowercasing is done by QueryParser for fuzzy search.
 *
 * @param word the word to fold
 * @return the word with non-ASCII characters folded to their ASCII equivalents
 */
private String escapeForFuzzy(String word) {
    int length = word.length();
    char[] tmp = new char[length * 4];
    length = ASCIIFoldingFilter.foldToASCII(word.toCharArray(), 0, tmp, 0, length);
    return new String(tmp, 0, length);
}
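As a usage note: escapeForFuzzy("éléphant") would return "elephant" here, so the fuzzy term is compared against the ASCII-folded terms that the "custom" analyzer put into the index.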
 
Example #6
Source File: CommonAnalysisPlugin.java    From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
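    // the boolean flag passed to singleton(...) indicates whether the filter
    // should also be applied when analyzing multi-term queries (prefix,
    // wildcard, etc.)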
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton(
        "common_grams",
        false,
        input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton(
        "edge_ngram",
        false,
        input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    filters.add(PreConfiguredTokenFilter.singleton(
        "elision",
        true,
        input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))
    );
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
        new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton(
        "limit",
        false,
        input -> new LimitTokenCountFilter(
            input,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)
        )
    );
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /**
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
            new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
            new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
 
Example #7
Source File: ASCIIFoldingTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    return new ASCIIFoldingFilter(tokenStream, preserveOriginal);
}
 
Example #8
Source File: LiteralAnalyzer.java    From AGDISTIS with GNU Affero General Public License v3.0
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
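	// the (String fieldName, Reader reader) signature and the matchVersion
	// argument target the pre-5.0 Lucene API used by this project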
	final Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
	return new TokenStreamComponents(source, new ASCIIFoldingFilter(source));
}
 
Example #9
Source File: ASCIIFoldingTokenFilterFactory.java    From crate with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    return new ASCIIFoldingFilter(tokenStream, preserveOriginal);
}