org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory Java Examples

The following examples show how to use org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code CustomAnalyzer} from factory classes (whitespace tokenizer, ASCII folding
 * with {@code preserveOriginal=true}, then lower-casing) and verifies both the configuration
 * the analyzer reports and the tokens it emits.
 *
 * @throws Exception if building the analyzer or analyzing the sample text fails
 */
public void testWhitespaceFactoryWithFolding() throws Exception {
  // try-with-resources: the analyzer is closed even when one of the assertions below fails.
  try (CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .build()) {

    // Reported configuration: tokenizer, no char filters, the two token filters in order.
    assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
    assertEquals(Collections.emptyList(), a.getCharFilterFactories());
    List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
    assertEquals(2, tokenFilters.size());
    assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
    assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());

    // No gaps or version were configured on the builder, so these are the values it reports.
    assertEquals(0, a.getPositionIncrementGap("dummy"));
    assertEquals(1, a.getOffsetGap("dummy"));
    assertSame(Version.LATEST, a.getVersion());

    // Plain ASCII input: one lower-cased token per word.
    assertAnalyzesTo(a, "foo bar FOO BAR",
        new String[] { "foo", "bar", "foo", "bar" },
        new int[]    { 1,     1,     1,     1});
    // Accented input: folded token followed by the preserved original.
    // NOTE(review): the int array is presumably the expected position increments
    // (0 = same position as the previous token) - confirm against assertAnalyzesTo.
    assertAnalyzesTo(a, "föó bär FÖÖ BAR",
        new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
        new int[]    { 1,     0,     1,     0,     1,     0,     1});
  }
}
 
Example #2
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code CustomAnalyzer} from component names ("whitespace", "asciifolding" with
 * {@code preserveOriginal=true}, "lowercase") and verifies that the names resolve to the
 * expected factory classes and that the analyzer emits the expected tokens.
 *
 * @throws Exception if building the analyzer or analyzing the sample text fails
 */
public void testWhitespaceWithFolding() throws Exception {
  // try-with-resources: the analyzer is closed even when one of the assertions below fails.
  try (CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .build()) {

    // The string names must have been resolved to these concrete factory classes.
    assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
    assertEquals(Collections.emptyList(), a.getCharFilterFactories());
    List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
    assertEquals(2, tokenFilters.size());
    assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
    assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());

    // No gaps or version were configured on the builder, so these are the values it reports.
    assertEquals(0, a.getPositionIncrementGap("dummy"));
    assertEquals(1, a.getOffsetGap("dummy"));
    assertSame(Version.LATEST, a.getVersion());

    // Plain ASCII input: one lower-cased token per word.
    assertAnalyzesTo(a, "foo bar FOO BAR",
        new String[] { "foo", "bar", "foo", "bar" },
        new int[]    { 1,     1,     1,     1});
    // Accented input: folded token followed by the preserved original.
    // NOTE(review): the int array is presumably the expected position increments
    // (0 = same position as the previous token) - confirm against assertAnalyzesTo.
    assertAnalyzesTo(a, "föó bär FÖÖ BAR",
        new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
        new int[]    { 1,     0,     1,     0,     1,     0,     1});
  }
}
 
Example #3
Source File: AnalyzerFactory.java    From airsonic-advanced with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Assembles the builder for the default analyzer.
 * <p>
 * Pipeline, in order: standard tokenizer, CJK width filter, ASCII folding
 * ({@code preserveOriginal=false}), lower-casing, stop filter using {@code STOP_WORDS},
 * English possessive filter. The result is then handed to
 * {@code addTokenFilterForUnderscoreRemovalAroundToken} before being returned.
 *
 * @return the configured (not yet built) analyzer builder
 * @throws IOException if one of the builder steps fails
 */
private Builder createDefaultAnalyzerBuilder() throws IOException {
    // Stage 1: tokenizer.
    Builder tokenized = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class);

    // Stage 2: character-level normalisation of each token.
    Builder normalized = tokenized
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class);

    // Stage 3: word-level filtering.
    Builder configured = normalized
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);

    addTokenFilterForUnderscoreRemovalAroundToken(configured);
    return configured;
}
 
Example #4
Source File: AnalyzerFactory.java    From airsonic-advanced with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Assembles the builder for the artist analyzer.
 * <p>
 * Pipeline, in order: standard tokenizer, CJK width filter, ASCII folding
 * ({@code preserveOriginal=false}), lower-casing, stop filter using
 * {@code STOP_WORDS_ARTIST}, English possessive filter. The result is then handed to
 * {@code addTokenFilterForUnderscoreRemovalAroundToken} before being returned.
 *
 * @return the configured (not yet built) analyzer builder
 * @throws IOException if one of the builder steps fails
 */
private Builder createArtistAnalyzerBuilder() throws IOException {
    // Stage 1: tokenizer.
    Builder tokenized = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class);

    // Stage 2: character-level normalisation of each token.
    Builder normalized = tokenized
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class);

    // Stage 3: word-level filtering with the artist-specific stop word list.
    Builder configured = normalized
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);

    addTokenFilterForUnderscoreRemovalAroundToken(configured);
    return configured;
}
 
Example #5
Source File: BibleSearchIndex.java    From Quelea with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Create a new empty search index.
 * <p>
 * Initialises an empty chapter map, an analyzer (standard tokenizer, lower-casing, ASCII
 * folding) and a memory-mapped directory located in a freshly created temporary directory
 * whose name starts with {@code quelea-mmap-bible}.
 *
 * @throws RuntimeException wrapping the {@link IOException} if the analyzer or the index
 *         directory cannot be created
 */
public BibleSearchIndex() {
    chapters = new HashMap<>();
    try {
        analyzer = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-bible").toAbsolutePath());
    } catch (IOException ex) {
        // This is the bible index: the messages previously said "song" (copy-paste slip),
        // and the log entry carried no exception, so the cause was missing from the log.
        LOGGER.log(Level.SEVERE, "Couldn't create bible search index", ex);
        throw new RuntimeException("Couldn't create bible search index", ex);
    }
}
 
Example #6
Source File: SongSearchIndex.java    From Quelea with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Create a new empty search index.
 * <p>
 * Initialises an empty song map, an analyzer (standard tokenizer, lower-casing, ASCII
 * folding) and a memory-mapped directory located in a freshly created temporary directory
 * whose name starts with {@code quelea-mmap-song}.
 *
 * @throws RuntimeException wrapping the {@link IOException} if the analyzer or the index
 *         directory cannot be created
 */
public SongSearchIndex() {
    songs = new HashMap<>();
    try {
        analyzer = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-song").toAbsolutePath());
    }
    catch(IOException ex) {
        // Pass the exception to the logger so the log record carries the cause and stack trace.
        LOGGER.log(Level.SEVERE, "Couldn't create song search index", ex);
        throw new RuntimeException("Couldn't create song search index", ex);
    }
}
 
Example #7
Source File: AnalyzerFactory.java    From airsonic with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Prepares the analyzer builder used by default.
 * <p>
 * Filters are registered in this order after the standard tokenizer: CJK width,
 * ASCII folding with {@code preserveOriginal=false}, lower-case, stop words taken from
 * {@code STOP_WORDS}, English possessive. Finally the builder is passed through
 * {@code addTokenFilterForUnderscoreRemovalAroundToken}.
 *
 * @return the configured builder; the caller is responsible for calling {@code build()}
 * @throws IOException if a builder step fails
 */
private Builder createDefaultAnalyzerBuilder() throws IOException {
    Builder withTokenizer = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class);

    Builder withFolding = withTokenizer
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class);

    Builder result = withFolding
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);

    addTokenFilterForUnderscoreRemovalAroundToken(result);
    return result;
}
 
Example #8
Source File: AnalyzerFactory.java    From airsonic with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Prepares the analyzer builder used for artists.
 * <p>
 * Filters are registered in this order after the standard tokenizer: CJK width,
 * ASCII folding with {@code preserveOriginal=false}, lower-case, stop words taken from
 * {@code STOP_WORDS_ARTIST}, English possessive. Finally the builder is passed through
 * {@code addTokenFilterForUnderscoreRemovalAroundToken}.
 *
 * @return the configured builder; the caller is responsible for calling {@code build()}
 * @throws IOException if a builder step fails
 */
private Builder createArtistAnalyzerBuilder() throws IOException {
    Builder withTokenizer = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class);

    Builder withFolding = withTokenizer
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class);

    Builder result = withFolding
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);

    addTokenFilterForUnderscoreRemovalAroundToken(result);
    return result;
}
 
Example #9
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code CustomAnalyzer} from factory classes (HTML-strip char filter, classic
 * tokenizer, ASCII folding with {@code preserveOriginal=true}, lower-casing) with an explicit
 * match version, position increment gap and offset gap, then verifies the reported
 * configuration and the tokens produced from HTML input.
 *
 * @throws Exception if building the analyzer or analyzing the sample text fails
 */
public void testFactoryHtmlStripClassicFolding() throws Exception {
  // try-with-resources: the analyzer is closed even when one of the assertions below fails.
  try (CustomAnalyzer a = CustomAnalyzer.builder()
      .withDefaultMatchVersion(LUCENE_8_0_0)
      .addCharFilter(HTMLStripCharFilterFactory.class)
      .withTokenizer(ClassicTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .withPositionIncrementGap(100)
      .withOffsetGap(1000)
      .build()) {

    // Reported components must match what was registered, in the same order.
    assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
    List<CharFilterFactory> charFilters = a.getCharFilterFactories();
    assertEquals(1, charFilters.size());
    assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
    List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
    assertEquals(2, tokenFilters.size());
    assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
    assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());

    // Gaps and version are the explicit values passed to the builder above.
    assertEquals(100, a.getPositionIncrementGap("dummy"));
    assertEquals(1000, a.getOffsetGap("dummy"));
    assertSame(LUCENE_8_0_0, a.getVersion());

    // Markup must not appear in the output; words come out lower-cased.
    assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
        new String[] { "foo", "bar", "foo", "bar" },
        new int[]    { 1,     1,     1,     1});
    // Accented input: folded token followed by the preserved original.
    // NOTE(review): the int array is presumably the expected position increments
    // (0 = same position as the previous token) - confirm against assertAnalyzesTo.
    assertAnalyzesTo(a, "<p><b>föó</b> bär     FÖÖ BAR</p>",
        new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
        new int[]    { 1,     0,     1,     0,     1,     0,     1});
  }
}
 
Example #10
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code CustomAnalyzer} from component names ("htmlstrip", "classic",
 * "asciifolding" with {@code preserveOriginal=true}, "lowercase") with an explicit match
 * version, position increment gap and offset gap, then verifies that the names resolve to the
 * expected factory classes and that HTML input yields the expected tokens.
 *
 * @throws Exception if building the analyzer or analyzing the sample text fails
 */
public void testHtmlStripClassicFolding() throws Exception {
  // try-with-resources: the analyzer is closed even when one of the assertions below fails.
  try (CustomAnalyzer a = CustomAnalyzer.builder()
      .withDefaultMatchVersion(LUCENE_8_0_0)
      .addCharFilter("htmlstrip")
      .withTokenizer("classic")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .withPositionIncrementGap(100)
      .withOffsetGap(1000)
      .build()) {

    // The string names must have been resolved to these concrete factory classes.
    assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
    List<CharFilterFactory> charFilters = a.getCharFilterFactories();
    assertEquals(1, charFilters.size());
    assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
    List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
    assertEquals(2, tokenFilters.size());
    assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
    assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());

    // Gaps and version are the explicit values passed to the builder above.
    assertEquals(100, a.getPositionIncrementGap("dummy"));
    assertEquals(1000, a.getOffsetGap("dummy"));
    assertSame(LUCENE_8_0_0, a.getVersion());

    // Markup must not appear in the output; words come out lower-cased.
    assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
        new String[] { "foo", "bar", "foo", "bar" },
        new int[]    { 1,     1,     1,     1});
    // Accented input: folded token followed by the preserved original.
    // NOTE(review): the int array is presumably the expected position increments
    // (0 = same position as the previous token) - confirm against assertAnalyzesTo.
    assertAnalyzesTo(a, "<p><b>föó</b> bär     FÖÖ BAR</p>",
        new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
        new int[]    { 1,     0,     1,     0,     1,     0,     1});
  }
}
 
Example #11
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the result of {@code normalize} on an analyzer configured with a whitespace
 * tokenizer followed by the lower-case and ASCII-folding token filters.
 *
 * @throws IOException if building the analyzer fails
 */
public void testNormalizationWithMultipleTokenFilters() throws IOException {
  // The analyzer used to be left open; try-with-resources closes it, also on failure.
  try (CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap())
      .addTokenFilter(LowerCaseFilterFactory.class, Collections.emptyMap())
      .addTokenFilter(ASCIIFoldingFilterFactory.class, Collections.emptyMap())
      .build()) {
    // The expected value keeps the spaces of the input (so the text was not split into
    // tokens) but is lower-cased and accent-free (so both token filters were applied).
    assertEquals(new BytesRef("a b e"), analyzer.normalize("dummy", "À B é"));
  }
}
 
Example #12
Source File: TokenizerChainTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that {@code TokenizerChain.normalize} applies the configured lower-case and
 * ASCII-folding filters: {@code "FOOB\u00c4"} must normalize to {@code "fooba"}.
 *
 * @throws Exception if normalization fails
 */
@Test
public void testNormalization() throws Exception {
  String fieldName = "f";
  // Collections.emptyMap() is the typed form of the raw EMPTY_MAP constant, so no
  // unchecked conversion (and no warning suppression) is needed.
  TokenFilterFactory[] tff = new TokenFilterFactory[2];
  tff[0] = new LowerCaseFilterFactory(Collections.emptyMap());
  tff[1] = new ASCIIFoldingFilterFactory(Collections.emptyMap());
  // try-with-resources: the chain is closed even when the assertion fails.
  try (TokenizerChain tokenizerChain = new TokenizerChain(
      new MockTokenizerFactory(Collections.emptyMap()),
      tff)) {
    assertEquals(new BytesRef("fooba"),
        tokenizerChain.normalize(fieldName, "FOOB\u00c4"));
  }
}
 
Example #13
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Delegates normalization of {@code input} to a freshly created
 * {@code ASCIIFoldingFilterFactory} configured with an empty argument map.
 *
 * @param input the token stream to normalize
 * @return whatever the ASCII-folding factory's {@code normalize} returns for {@code input}
 */
@Override
public TokenStream normalize(TokenStream input) {
  ASCIIFoldingFilterFactory foldingFactory =
      new ASCIIFoldingFilterFactory(Collections.emptyMap());
  return foldingFactory.normalize(input);
}