Java Code Examples for org.apache.lucene.analysis.BaseTokenStreamTestCase#assertTokenStreamContents()

The following examples show how to use org.apache.lucene.analysis.BaseTokenStreamTestCase#assertTokenStreamContents(). Each snippet is taken from a real project; the originating project, source file, and license are noted above each example.
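As a quick orientation before the project examples, here is a minimal, self-contained sketch of how the helper is typically called. The test class name and input text are invented for illustration; the two overloads shown (terms only, and terms plus start/end offsets) are part of Lucene's test framework, and WhitespaceTokenizer comes from Lucene's common analyzers.

import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

public class AssertTokenStreamContentsSketch extends BaseTokenStreamTestCase {

  public void testTermsOnly() throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("hello world"));
    // Two-argument overload: checks only the produced terms, in order.
    assertTokenStreamContents(tokenizer, new String[] { "hello", "world" });
  }

  public void testTermsAndOffsets() throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("hello world"));
    // Four-argument overload: additionally checks each token's start and end offsets.
    assertTokenStreamContents(tokenizer,
        new String[] { "hello", "world" },
        new int[] { 0, 6 },    // start offsets
        new int[] { 5, 11 });  // end offsets
  }
}

In either form the helper drives the stream through its full reset()/incrementToken()/end()/close() lifecycle and fails on the first mismatch, which is why the examples below can hand it a freshly created tokenizer or filter chain.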
Example 1
Source File: TestUAX29URLEmailTokenizer.java    From lucene-solr with Apache License 2.0
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
 
Example 2
Source File: TestStandardAnalyzer.java    From lucene-solr with Apache License 2.0
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  StandardTokenizer tokenizer = new StandardTokenizer();
  tokenizer.setReader(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
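In both huge-document tests above, the 4094 leading spaces place the first real token right at a 4096-character boundary, presumably to exercise the tokenizer's internal buffer refilling; only the two non-whitespace tokens are expected.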
 
Example 3
Source File: ProtectedTermFilterFactoryTest.java    From lucene-solr with Apache License 2.0
public void testBasic() throws Exception {
  String text = "Wuthering FooBar distant goldeN ABC compote";
  Map<String,String> args = new HashMap<>();
  args.put("ignoreCase", "true");
  args.put("protected", "protected-1.txt,protected-2.txt");  // Protected: foobar, jaxfopbuz, golden, compote
  args.put("wrappedFilters", "lowercase");

  ResourceLoader loader = new SolrResourceLoader(TEST_PATH().resolve("collection1"));
  ProtectedTermFilterFactory factory = new ProtectedTermFilterFactory(args);
  factory.inform(loader);

  TokenStream ts = factory.create(whitespaceMockTokenizer(text));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts,
      new String[] { "wuthering", "FooBar", "distant", "goldeN", "abc", "compote" });
}
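Note the effect of the protected terms: FooBar and goldeN match entries in the protected-term files (case-insensitively, since ignoreCase is true), so they bypass the wrapped lowercase filter, while the unprotected tokens are lowercased.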
 
Example 4
Source File: SimplePhoneticAnalysisTests.java    From crate with Apache License 2.0
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("ABADIAS"));
    String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia",
            "abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS",
            "obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
 
Example 5
Source File: SimplePhoneticAnalysisTests.java    From crate with Apache License 2.0
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Rimbault"));
    String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt",
            "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
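Compared with Example 4, configuring an explicit language set (here French) narrows the Beider-Morse rules that apply, so the filter emits noticeably fewer phonetic variants than the language-guessing default.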
 
Example 6
Source File: SimplePhoneticAnalysisTests.java    From crate with Apache License 2.0
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("chauptman"));
    String[] expected = new String[] { "473660", "573660" };
    assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
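The two expected codes differ only in the first digit because Daitch-Mokotoff soundex encodes the initial "ch" ambiguously (it can sound like "tch" or like "kh"), and the filter emits one token per branch.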
 
Example 7
Source File: TestEmptyTokenStream.java    From lucene-solr with Apache License 2.0
public void testConsume2() throws IOException {
  BaseTokenStreamTestCase.assertTokenStreamContents(new EmptyTokenStream(), new String[0]);
}
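Passing an empty expected array asserts that the stream produces no tokens at all; the helper still runs the stream through its complete reset/incrementToken/end/close lifecycle.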
 
Example 8
Source File: TestWordDelimiterFilterFactory.java    From lucene-solr with Apache License 2.0
@Test
public void testCustomTypes() throws Exception {
  String testText = "I borrowed $5,400.00 at 25% interest-rate";
  ResourceLoader loader = new SolrResourceLoader(TEST_PATH().resolve("collection1"));
  Map<String,String> args = new HashMap<>();
  args.put("luceneMatchVersion", Version.LATEST.toString());
  args.put("generateWordParts", "1");
  args.put("generateNumberParts", "1");
  args.put("catenateWords", "1");
  args.put("catenateNumbers", "1");
  args.put("catenateAll", "0");
  args.put("splitOnCaseChange", "1");
  
  /* default behavior */
  WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(args);
  factoryDefault.inform(loader);
  
  TokenStream ts = factoryDefault.create(whitespaceMockTokenizer(testText));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts, 
      new String[] { "I", "borrowed", "5", "540000", "400", "00", "at", "25", "interest", "interestrate", "rate" });

  ts = factoryDefault.create(whitespaceMockTokenizer("foo\u200Dbar"));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts, 
      new String[] { "foo", "foobar", "bar" });

  
  /* custom behavior */
  args = new HashMap<>();
  // use a custom type mapping
  args.put("luceneMatchVersion", Version.LATEST.toString());
  args.put("generateWordParts", "1");
  args.put("generateNumberParts", "1");
  args.put("catenateWords", "1");
  args.put("catenateNumbers", "1");
  args.put("catenateAll", "0");
  args.put("splitOnCaseChange", "1");
  args.put("types", "wdftypes.txt");
  WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(args);
  factoryCustom.inform(loader);
  
  ts = factoryCustom.create(whitespaceMockTokenizer(testText));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts, 
      new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "interestrate", "rate" });
  
  /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
  ts = factoryCustom.create(whitespaceMockTokenizer("foo\u200Dbar"));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts, 
      new String[] { "foo\u200Dbar" });
}
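The custom type mapping in wdftypes.txt evidently reclassifies characters such as '$', ',', '.', '%', and U+200D so that the filter no longer treats them as delimiters: the currency and percentage tokens survive intact, and "foo\u200Dbar" remains a single token.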