Java Code Examples for org.apache.lucene.analysis.LowerCaseFilter

The following examples show how to use org.apache.lucene.analysis.LowerCaseFilter. They are extracted from open source projects; each one is labeled with its source project, source file, and license.
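Before the project examples, here is a minimal self-contained sketch of the basic usage pattern: wrap a Tokenizer in a LowerCaseFilter and consume the resulting TokenStream. This sketch assumes a recent Lucene version, where LowerCaseFilter lives in lucene-core and its constructor takes only a TokenStream; several of the older projects below use earlier APIs that also required a Version argument.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LowerCaseFilterDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("The QUICK Brown FOX"));
        try (TokenStream stream = new LowerCaseFilter(tokenizer)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                  // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt); // prints: the, quick, brown, fox (one per line)
            }
            stream.end();                    // required after the last token
        }
    }
}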
Example 1
Source Project: lucene-solr   Source File: TestTeeSinkTokenFilter.java    License: Apache License 2.0
public void testMultipleSources() throws Exception {
  final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
  final TokenStream source1 = new CachingTokenFilter(tee1);

  tee1.addAttribute(CheckClearAttributesAttribute.class);

  MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(buffer2.toString()));
  final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
  final TokenStream source2 = tee2;

  assertTokenStreamContents(source1, tokens1);
  assertTokenStreamContents(source2, tokens2);

  TokenStream lowerCasing = new LowerCaseFilter(source1);
  String[] lowerCaseTokens = new String[tokens1.length];
  for (int i = 0; i < tokens1.length; i++)
    lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
  assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
 
Example 2
Source Project: lucene-solr   Source File: TestCharTokenizers.java    License: Apache License 2.0
public void testReadSupplementaryChars() throws IOException {
  StringBuilder builder = new StringBuilder();
  // create random input
  int num = 1024 + random().nextInt(1024);
  num *= RANDOM_MULTIPLIER;
  for (int i = 1; i < num; i++) {
    builder.append("\ud801\udc1cabc");
    if((i % 10) == 0)
      builder.append(" ");
  }
  // internal buffer size is 1024; make sure we have a surrogate pair right at the border
  builder.insert(1023, "\ud801\udc1c");
  Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(new LowerCaseFilter(tokenizer), builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
 
Example 3
Source Project: lucene-solr   Source File: TestAnalyzers.java    License: Apache License 2.0
/**
 * Test that LowerCaseFilter handles the lowercasing correctly if the term
 * buffer has a trailing surrogate character leftover and the current term in
 * the buffer ends with a corresponding leading surrogate.
 */
public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
  // test if the limit of the termbuffer is correctly used with supplementary
  // chars
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("BogustermBogusterm\udc16"));
  LowerCaseFilter filter = new LowerCaseFilter(tokenizer);
  assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
  filter.reset();
  String highSurEndingUpper = "BogustermBoguster\ud801";
  String highSurEndingLower = "bogustermboguster\ud801";
  tokenizer.setReader(new StringReader(highSurEndingUpper));
  assertTokenStreamContents(filter, new String[] {highSurEndingLower});
  assertTrue(filter.hasAttribute(CharTermAttribute.class));
  char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
  int length = highSurEndingLower.length();
  assertEquals('\ud801', termBuffer[length - 1]);
}
 
Example 4
Source Project: subsonic   Source File: SearchService.java    License: GNU General Public License v3.0
// Legacy Lucene 3.x reuse API: the analysis chain is cached per thread via
// get/setPreviousTokenStream, and the tokenizer is reset for each new Reader.
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
 
Example 5
Source Project: pyramid   Source File: PhraseCountQueryBuilder.java    License: Apache License 2.0
protected Query doToQuery(QueryShardContext context) throws IOException {
    // Analyzer analyzer = context.getMapperService().searchAnalyzer();
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream source = analyzer.tokenStream(fieldName, value.toString())) {
        CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source));
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        if (termAtt == null) {
            return null;
        }
        List<CustomSpanTermQuery> clauses = new ArrayList<>();
        stream.reset();
        while (stream.incrementToken()) {
            Term term = new Term(fieldName, termAtt.getBytesRef());
            clauses.add(new CustomSpanTermQuery(term));
        }
        return new PhraseCountQuery(clauses.toArray(new CustomSpanTermQuery[clauses.size()]), slop, inOrder, weightedCount);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
 
Example 6
Source Project: crate   Source File: AnalysisModule.java    License: Apache License 2.0
static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
    NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");

    // Add filters available in lucene-core
    preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
    preConfiguredTokenFilters.register(
        "standard",
        PreConfiguredTokenFilter.singletonWithVersion("standard", false, (reader, version) -> {
            DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_deprecation",
                "The [standard] token filter is deprecated and will be removed in a future version.");
            return reader;
        }));
    /* Note that "stop" is available in lucene-core but it's pre-built
     * version uses a set of English stop words that are in
     * lucene-analyzers-common so "stop" is defined in the analysis-common
     * module. */

    for (AnalysisPlugin plugin: plugins) {
        for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
            preConfiguredTokenFilters.register(filter.getName(), filter);
        }
    }
    return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
}
 
Example 7
Source Project: uyuni   Source File: NGramAnalyzer.java    License: GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))), min_ngram, max_ngram);
}
 
Example 8
Source Project: lucene-solr   Source File: TestGermanStemFilter.java    License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(t,
          new GermanStemFilter(new LowerCaseFilter(t)));
    }
  };
}
 
Example 9
Source Project: lucene-solr   Source File: TestGermanAnalyzer.java    License: Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet( 1, true);
  set.add("fischen");
  final Tokenizer in = new LetterTokenizer();
  in.setReader(new StringReader("Fischen Trinken"));
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
 
Example 10
Source Project: lucene-solr   Source File: TestCharTokenizers.java    License: Apache License 2.0
public void testExtendCharBuffer() throws IOException {
  for (int i = 0; i < 40; i++) {
    StringBuilder builder = new StringBuilder();
    for (int j = 0; j < 1+i; j++) {
      builder.append("a");
    }
    builder.append("\ud801\udc1cabc");
    Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT)});
  }
}
 
Example 11
Source Project: lucene-solr   Source File: TestCharTokenizers.java    License: Apache License 2.0
public void testMaxWordLength() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 255; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
 
Example 12
Source Project: lucene-solr   Source File: TestCharTokenizers.java    License: Apache License 2.0
public void testMaxWordLengthWithSupplementary() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 254; i++) {
    builder.append("A");
  }
  builder.append("\ud801\udc1c");
  Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
 
Example 13
Source Project: yes-cart   Source File: AsIsAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    if (toLowerCase) {
        final UnicodeWhitespaceTokenizer tokenizer = new UnicodeWhitespaceTokenizer();
        return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
    }
    return new TokenStreamComponents(new UnicodeWhitespaceTokenizer());
}
 
Example 14
Source Project: spacewalk   Source File: NGramAnalyzer.java    License: GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))), min_ngram, max_ngram);
}
 
Example 15
Source Project: crate   Source File: LowerCaseTokenFilterFactory.java    License: Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    if (lang == null) {
        return new LowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("greek")) {
        return new GreekLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("irish")) {
        return new IrishLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("turkish")) {
        return new TurkishLowerCaseFilter(tokenStream);
    } else {
        throw new IllegalArgumentException("language [" + lang + "] not supported for lower case");
    }
}
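The language dispatch above exists because plain LowerCaseFilter applies Unicode default case mapping, which is incorrect for some languages: Turkish distinguishes dotted and dotless i, and Greek needs special handling such as final sigma. A minimal sketch of the Turkish case, assuming TurkishLowerCaseFilter from lucene-analyzers-common and an illustrative input:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;

public class TurkishLowerCaseDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("ISTANBUL"));
        // In Turkish, uppercase I lowercases to dotless "ı", so this prints
        // "ıstanbul" rather than the "istanbul" a plain LowerCaseFilter produces.
        try (TokenStream stream = new TurkishLowerCaseFilter(tokenizer)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(termAtt);
            }
            stream.end();
        }
    }
}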
 
Example 16
Source Project: crate   Source File: CommonAnalysisPlugin.java    License: Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return tokenizers;
}
 
Example 17
Source Project: tephra   Source File: CharAnalyzer.java    License: MIT License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    Tokenizer tokenizer = new CharTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
 
Example 18
Source Project: tephra   Source File: CharAnalyzer.java    License: MIT License
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
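This normalize(String, TokenStream) override, repeated in the examples that follow, is the hook behind Analyzer#normalize: Lucene runs it on individual query terms that bypass full tokenization (wildcard, prefix, fuzzy, and regexp queries) so they are lowercased consistently with indexed tokens. A minimal sketch of the caller side, using the GermanAnalyzer from Example 22 below:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.util.BytesRef;

public class NormalizeDemo {
    public static void main(String[] args) {
        // Runs the reduced chain from GermanAnalyzer.normalize()
        // (LowerCaseFilter then GermanNormalizationFilter) on a single term.
        Analyzer analyzer = new GermanAnalyzer();
        BytesRef term = analyzer.normalize("content", "HÄUSER");
        System.out.println(term.utf8ToString()); // "hauser"
    }
}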
 
Example 19
Source Project: lucene-solr   Source File: SmartChineseAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 20
Source Project: lucene-solr   Source File: PolishAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 21
Source Project: lucene-solr   Source File: KoreanAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 22
Source Project: lucene-solr   Source File: GermanAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  TokenStream result = new LowerCaseFilter(in);
  result = new GermanNormalizationFilter(result);
  return result;
}
 
Example 23
Source Project: lucene-solr   Source File: NorwegianAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 24
Source Project: lucene-solr   Source File: FinnishAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 25
Source Project: lucene-solr   Source File: ArmenianAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 26
Source Project: lucene-solr   Source File: LatvianAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 27
Source Project: lucene-solr   Source File: DutchAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 28
Source Project: lucene-solr   Source File: RussianAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 29
Source Project: lucene-solr   Source File: SwedishAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example 30
Source Project: lucene-solr   Source File: HungarianAnalyzer.java    License: Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}