Java Code Examples for org.apache.lucene.analysis.core.WhitespaceTokenizerFactory

The following examples show how to use org.apache.lucene.analysis.core.WhitespaceTokenizerFactory. These examples are extracted from open-source projects. You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may check out the related API usage in the sidebar.
Example 1
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
public void testWhitespaceFactoryWithFolding() throws Exception {
  // Build the analyzer from factory classes (rather than SPI names).
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .build();

  // The builder must wire up exactly the components requested above, in order.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(2, filters.size());
  assertSame(ASCIIFoldingFilterFactory.class, filters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, filters.get(1).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // Plain ASCII input: lowercased only, one position per token.
  assertAnalyzesTo(analyzer, "foo bar FOO BAR", 
      new String[] { "foo", "bar", "foo", "bar" },
      new int[]    { 1,     1,     1,     1});
  // Accented input: preserveOriginal=true stacks the folded form at the
  // same position (increment 0) as the original token.
  assertAnalyzesTo(analyzer, "föó bär FÖÖ BAR", 
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[]    { 1,     0,     1,     0,     1,     0,     1});
  analyzer.close();
}
 
Example 2
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
public void testWhitespaceWithFolding() throws Exception {
  // Same pipeline as the factory-class variant, but configured via SPI names.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .build();

  // SPI names must resolve to the expected factory classes.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filterFactories = analyzer.getTokenFilterFactories();
  assertEquals(2, filterFactories.size());
  assertSame(ASCIIFoldingFilterFactory.class, filterFactories.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, filterFactories.get(1).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // ASCII-only input is just lowercased.
  assertAnalyzesTo(analyzer, "foo bar FOO BAR", 
      new String[] { "foo", "bar", "foo", "bar" },
      new int[]    { 1,     1,     1,     1});
  // preserveOriginal keeps the accented token stacked (increment 0) behind
  // its folded counterpart.
  assertAnalyzesTo(analyzer, "föó bär FÖÖ BAR", 
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[]    { 1,     0,     1,     0,     1,     0,     1});
  analyzer.close();
}
 
Example 3
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 6 votes vote down vote up
public void testStopWordsFromClasspath() throws Exception {
  // The stop filter should load its word list as a classpath resource.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter("stop",
          "ignoreCase", "true",
          "words", "org/apache/lucene/analysis/custom/teststop.txt",
          "format", "wordset")
      .build();

  // Verify the assembled component chain.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(1, filters.size());
  assertSame(StopFilterFactory.class, filters.get(0).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // Every input token is a stop word (ignoreCase), so nothing survives.
  assertAnalyzesTo(analyzer, "foo Foo Bar", new String[0]);
  analyzer.close();
}
 
Example 4
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
public void testStopWordsFromClasspathWithMap() throws Exception {
  // Shared stop-filter configuration; the builder consumes (empties) the
  // map it is given, so each use below gets its own copy.
  Map<String,String> params = new HashMap<>();
  params.put("ignoreCase", "true");
  params.put("words", "org/apache/lucene/analysis/custom/teststop.txt");
  params.put("format", "wordset");

  Map<String,String> paramsCopy = new HashMap<>(params);
  Map<String,String> paramsImmutable = Collections.unmodifiableMap(new HashMap<>(params));

  // SPI-name variant: the args map is drained on build.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("stop", params)
      .build();
  assertTrue(params.isEmpty());
  assertAnalyzesTo(analyzer, "foo Foo Bar", new String[0]);

  // Factory-class variant behaves the same way.
  analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(StopFilterFactory.class, paramsCopy)
      .build();
  assertTrue(paramsCopy.isEmpty());
  assertAnalyzesTo(analyzer, "foo Foo Bar", new String[0]);

  // An unmodifiable map cannot be drained, so building must fail.
  expectThrows(UnsupportedOperationException.class, () -> {
    CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("stop", paramsImmutable)
        .build();
  });
  analyzer.close();
}
 
Example 5
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
public void testNormalizationWithMultipleTokenFilters() throws IOException {
  // Both token filters are applied by normalize(): the expected output
  // "a b e" shows "À B é" was lowercased and ASCII-folded (the whitespace
  // tokenizer itself plays no role in normalization).
  // try-with-resources closes the analyzer, matching the other examples'
  // explicit close() calls and avoiding a resource leak on assertion failure.
  try (CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap())
      .addTokenFilter(LowerCaseFilterFactory.class, Collections.emptyMap())
      .addTokenFilter(ASCIIFoldingFilterFactory.class, Collections.emptyMap())
      .build()) {
    assertEquals(new BytesRef("a b e"), analyzer.normalize("dummy", "À B é"));
  }
}
 
Example 6
Source Project: lucene-solr   Source File: TestCustomAnalyzer.java    License: Apache License 2.0 5 votes vote down vote up
public void testNormalizationWithMultiplCharFilters() throws IOException {
  // Both mapping char filters are applied by normalize(): the expected
  // output "e f c" shows "a b c" was rewritten by mapping1.txt and then
  // mapping2.txt before tokenization.
  // try-with-resources closes the analyzer, matching the other examples'
  // explicit close() calls and avoiding a resource leak on assertion failure.
  try (CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap())
      .addCharFilter(MappingCharFilterFactory.class, new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping1.txt")))
      .addCharFilter(MappingCharFilterFactory.class, new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping2.txt")))
      .build()) {
    assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
  }
}
 
Example 7
Source Project: lucene-solr   Source File: TestAbstractAnalysisFactory.java    License: Apache License 2.0 4 votes vote down vote up
public void testLookupTokenizerSPIName() throws NoSuchFieldException, IllegalAccessException {
  // Both lookup paths must map the factory class to its SPI name.
  Class<WhitespaceTokenizerFactory> factoryClass = WhitespaceTokenizerFactory.class;
  assertEquals("whitespace", AnalysisSPILoader.lookupSPIName(factoryClass));
  assertEquals("whitespace", TokenizerFactory.findSPIName(factoryClass));
}
 
Example 8
Source Project: lucene-solr   Source File: TestAnalysisSPILoader.java    License: Apache License 2.0 4 votes vote down vote up
public void testLookupTokenizer() {
  // SPI lookup by name is case-insensitive.
  for (String spiName : new String[] { "Whitespace", "WHITESPACE", "whitespace" }) {
    assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.forName(spiName, versionArgOnly()).getClass());
  }
}
 
Example 9
Source Project: lucene-solr   Source File: TestAnalysisSPILoader.java    License: Apache License 2.0 4 votes vote down vote up
public void testLookupTokenizerClass() {
  // Class lookup by SPI name is case-insensitive.
  for (String spiName : new String[] { "Whitespace", "WHITESPACE", "whitespace" }) {
    assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass(spiName));
  }
}
 
Example 10
Source Project: solr-researcher   Source File: ReSearcherUtils.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Splits a query string into whitespace-separated tokens, appending them to
 * {@code tokens}. Each double quote is emitted as its own {@code "\""} token
 * (split off from the front and/or back of the word it is attached to), which
 * makes the query easier to examine afterwards. Whitespace-only tokens are
 * dropped.
 *
 * @param queryString the raw query to tokenize; must not be null
 * @param tokens output list that receives the tokens in order
 * @return the number of double quotes found in the query
 */
public static int tokenizeQueryString(String queryString, List<String> tokens) {
  int countOfQuotes = 0;

  // Tokenize on whitespace using Lucene's factory-configured tokenizer.
  Map<String, String> args = new HashMap<>();
  args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString());
  WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(args);

  // try-with-resources guarantees the tokenizer is closed even if an
  // IOException is thrown mid-stream (the original leaked it in that case).
  try (WhitespaceTokenizer tokenizer =
      (WhitespaceTokenizer) factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY)) {
    tokenizer.setReader(new StringReader(queryString));
    // Register the term attribute before reset(), then consume with the
    // standard incrementToken() loop (the TokenStream contract workflow).
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();

    while (tokenizer.incrementToken()) {
      String tokenText = termAtt.toString();

      if (tokenText.equals("\"")) {
        // A lone quote.
        tokens.add("\"");
        countOfQuotes++;
      } else if (tokenText.startsWith("\"")) {
        // Leading quote: split it off, then check for a trailing quote too.
        tokens.add("\"");
        countOfQuotes++;

        if (tokenText.endsWith("\"")) {
          tokens.add(tokenText.substring(1, tokenText.length() - 1));
          tokens.add("\"");
          countOfQuotes++;
        } else {
          tokens.add(tokenText.substring(1));
        }
      } else if (tokenText.endsWith("\"")) {
        // Trailing quote only.
        tokens.add(tokenText.substring(0, tokenText.length() - 1));
        tokens.add("\"");
        countOfQuotes++;
      } else if (!tokenText.trim().equals("")) {
        // Keep only non-blank tokens (whitespace tokenizer should never
        // emit blanks, but the guard is kept for safety).
        tokens.add(tokenText);
      }
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException("Failed to tokenize query string", e);
  }
  return countOfQuotes;
}