org.apache.lucene.analysis.core.WhitespaceTokenizerFactory Java Examples

The following examples show how to use org.apache.lucene.analysis.core.WhitespaceTokenizerFactory. You can vote up the examples you find useful or vote down the ones you don't, and follow the links above each example to go to the original project or source file. You can also check out the related API usage in the sidebar.
Example #1
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testWhitespaceFactoryWithFolding() throws Exception {
  // Assemble the analyzer from factory classes: whitespace tokenizer,
  // ASCII folding (keeping the originals), then lower-casing.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .build();

  // The builder must have wired exactly the components we asked for.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(2, filters.size());
  assertSame(ASCIIFoldingFilterFactory.class, filters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, filters.get(1).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // Plain ASCII input: only lower-casing applies, one token per position.
  assertAnalyzesTo(analyzer, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[]    { 1,     1,     1,     1});
  // Accented input: the folded token is emitted, with the preserved
  // original stacked at the same position (increment 0).
  assertAnalyzesTo(analyzer, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[]    { 1,     0,     1,     0,     1,     0,     1});
  analyzer.close();
}
 
Example #2
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testWhitespaceWithFolding() throws Exception {
  // Same chain as the factory-class variant, but built from SPI names.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .build();

  // Name-based lookup must resolve to the same factory classes.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(2, filters.size());
  assertSame(ASCIIFoldingFilterFactory.class, filters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, filters.get(1).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // Plain ASCII: lower-cased only.
  assertAnalyzesTo(analyzer, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[]    { 1,     1,     1,     1});
  // Accented input: folded token plus preserved original at the same position.
  assertAnalyzesTo(analyzer, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[]    { 1,     0,     1,     0,     1,     0,     1});
  analyzer.close();
}
 
Example #3
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testStopWordsFromClasspath() throws Exception {
  // Stop filter configured via varargs key/value pairs; the word list is
  // loaded from a classpath resource.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter("stop",
          "ignoreCase", "true",
          "words", "org/apache/lucene/analysis/custom/teststop.txt",
          "format", "wordset")
      .build();

  // Exactly one filter (stop) behind the whitespace tokenizer, no char filters.
  assertSame(WhitespaceTokenizerFactory.class, analyzer.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), analyzer.getCharFilterFactories());
  List<TokenFilterFactory> filters = analyzer.getTokenFilterFactories();
  assertEquals(1, filters.size());
  assertSame(StopFilterFactory.class, filters.get(0).getClass());
  assertEquals(0, analyzer.getPositionIncrementGap("dummy"));
  assertEquals(1, analyzer.getOffsetGap("dummy"));
  assertSame(Version.LATEST, analyzer.getVersion());

  // Every input word is a stop word, so analysis yields no tokens at all.
  assertAnalyzesTo(analyzer, "foo Foo Bar", new String[0]);
  analyzer.close();
}
 
Example #4
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testStopWordsFromClasspathWithMap() throws Exception {
  // Shared stop-filter configuration, passed as a Map instead of varargs.
  Map<String,String> stopConfig1 = new HashMap<>();
  stopConfig1.put("ignoreCase", "true");
  stopConfig1.put("words", "org/apache/lucene/analysis/custom/teststop.txt");
  stopConfig1.put("format", "wordset");

  Map<String,String> stopConfig2 = new HashMap<>(stopConfig1);
  Map<String,String> stopConfigImmutable = Collections.unmodifiableMap(new HashMap<>(stopConfig1));

  // Name-based lookup: the builder consumes (empties) the supplied map.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("stop", stopConfig1)
      .build();
  assertTrue(stopConfig1.isEmpty());
  assertAnalyzesTo(analyzer, "foo Foo Bar", new String[0]);

  // Class-based lookup behaves identically.
  analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(StopFilterFactory.class, stopConfig2)
      .build();
  assertTrue(stopConfig2.isEmpty());
  assertAnalyzesTo(analyzer, "foo Foo Bar", new String[0]);

  // An unmodifiable map cannot be consumed, so building must fail.
  expectThrows(UnsupportedOperationException.class, () -> {
    CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("stop", stopConfigImmutable)
        .build();
  });
  analyzer.close();
}
 
Example #5
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testNormalizationWithMultipleTokenFilters() throws IOException {
  // Although neither filter is multi-term aware, normalize() still applies
  // the normalization-capable parts of the chain to the raw text.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap())
      .addTokenFilter(LowerCaseFilterFactory.class, Collections.emptyMap())
      .addTokenFilter(ASCIIFoldingFilterFactory.class, Collections.emptyMap())
      .build();
  // "À B é" is lower-cased and ASCII-folded to "a b e".
  assertEquals(new BytesRef("a b e"), analyzer.normalize("dummy", "À B é"));
}
 
Example #6
Source File: TestCustomAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testNormalizationWithMultiplCharFilters() throws IOException {
  // Two stacked mapping char filters; normalize() runs both over the input
  // even though no component here is multi-term aware.
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap())
      .addCharFilter(MappingCharFilterFactory.class, new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping1.txt")))
      .addCharFilter(MappingCharFilterFactory.class, new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping2.txt")))
      .build();
  // The two mapping files together rewrite "a b c" into "e f c".
  assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
 
Example #7
Source File: TestAbstractAnalysisFactory.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testLookupTokenizerSPIName() throws NoSuchFieldException, IllegalAccessException {
  // Both lookup paths must map the factory class back to its SPI name.
  assertEquals("whitespace", AnalysisSPILoader.lookupSPIName(WhitespaceTokenizerFactory.class));
  assertEquals("whitespace", TokenizerFactory.findSPIName(WhitespaceTokenizerFactory.class));
}
 
Example #8
Source File: TestAnalysisSPILoader.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testLookupTokenizer() {
  // SPI lookup is case-insensitive: every spelling resolves to the same factory.
  for (String name : new String[] { "Whitespace", "WHITESPACE", "whitespace" }) {
    assertSame(WhitespaceTokenizerFactory.class,
        TokenizerFactory.forName(name, versionArgOnly()).getClass());
  }
}
 
Example #9
Source File: TestAnalysisSPILoader.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testLookupTokenizerClass() {
  // Class lookup is case-insensitive as well.
  for (String name : new String[] { "Whitespace", "WHITESPACE", "whitespace" }) {
    assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass(name));
  }
}
 
Example #10
Source File: ReSearcherUtils.java    From solr-researcher with Apache License 2.0 4 votes vote down vote up
/**
 * Separates tokens from a query string. Each double quote is emitted as its own
 * {@code "} entry in {@code tokens}, since that makes the query easier to examine.
 *
 * @param queryString the raw query string to tokenize (split on whitespace)
 * @param tokens output list that receives the extracted tokens; quotes attached to a
 *               word are split off into stand-alone {@code "} entries
 * @return the number of quotes found in the query
 * @throws RuntimeException wrapping any {@link IOException} from the tokenizer
 */
public static int tokenizeQueryString(String queryString, List<String> tokens) {
  int countOfQuotes = 0;

  Map<String, String> args = new HashMap<>();
  // NOTE(review): match version is pinned to 6.3.0 — confirm this is intentional.
  args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString());
  WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(args);

  // try-with-resources: the original leaked the tokenizer when an exception was
  // thrown before close(); Tokenizer is Closeable, so let the JVM handle it.
  try (WhitespaceTokenizer tokenizer =
      (WhitespaceTokenizer) factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY)) {
    tokenizer.setReader(new StringReader(queryString));
    // addAttribute (not getAttribute) is the canonical accessor: it never returns
    // null, and the standard consumption loop reads it after each incrementToken().
    // The original read the attribute before the first incrementToken(), relying on
    // its empty-string filter to discard the bogus first value.
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();

    while (tokenizer.incrementToken()) {
      String tokenText = termAtt.toString();

      if (tokenText.equals("\"")) {
        // A lone quote token.
        tokens.add("\"");
        countOfQuotes++;
      } else if (tokenText.startsWith("\"")) {
        // Leading quote: emit it separately, then the word, then a trailing
        // quote if the same token also ends with one.
        tokens.add("\"");
        countOfQuotes++;

        if (tokenText.endsWith("\"")) {
          tokens.add(tokenText.substring(1, tokenText.length() - 1));
          tokens.add("\"");
          countOfQuotes++;
        } else {
          tokens.add(tokenText.substring(1));
        }
      } else if (tokenText.endsWith("\"")) {
        // Trailing quote only.
        tokens.add(tokenText.substring(0, tokenText.length() - 1));
        tokens.add("\"");
        countOfQuotes++;
      } else if (!tokenText.trim().equals("")) {
        // Ordinary token; ignore whitespace-only text.
        tokens.add(tokenText);
      }
    }
    tokenizer.end();
  } catch (IOException e) {
    // Preserve the cause and add context instead of rethrowing bare.
    throw new RuntimeException("Failed to tokenize query: " + queryString, e);
  }
  return countOfQuotes;
}