Java Code Examples for org.apache.lucene.analysis.MockTokenizer#WHITESPACE

The following examples show how to use org.apache.lucene.analysis.MockTokenizer#WHITESPACE. Each snippet is taken from the lucene-solr project; the source file and license are noted above each example.
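All of the snippets below follow the same basic pattern: construct a MockTokenizer with the WHITESPACE pattern, feed it a Reader, and check the emitted tokens. As a minimal, self-contained sketch of that pattern (the class name MockWhitespaceTokenizerSketch and the sample input are illustrative and not from lucene-solr; the real tests below extend BaseTokenStreamTestCase, which supplies assertTokenStreamContents):

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;

public class MockWhitespaceTokenizerSketch extends BaseTokenStreamTestCase {
  public void testWhitespaceTokenization() throws Exception {
    // MockTokenizer.WHITESPACE splits on whitespace only; 'false' keeps the original case
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader("Hello Lucene World"));
    // assertTokenStreamContents is inherited from BaseTokenStreamTestCase
    assertTokenStreamContents(tokenizer, new String[] { "Hello", "Lucene", "World" });
  }
}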
Example 1
Source File: TestWordDelimiterFilter.java    From lucene-solr with Apache License 2.0
public void testOnlyNumbers() throws Exception {
  int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
  Analyzer a = new Analyzer() {
      
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };

  assertAnalyzesTo(a, "7-586", 
                   new String[] {},
                   new int[] {},
                   new int[] {},
                   null,
                   new int[] {},
                   null,
                   false);
}
 
Example 2
Source File: TestCompoundWordTokenFilter.java    From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  
  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
 
Example 3
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0
public void testTokenStream() throws IOException {
  // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
  String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
    new int[] {0, 2, 4, 6, 8, 11, 14},
    new int[] {1, 3, 5, 7, 10, 13, 16},
    input.length());
}
 
Example 4
Source File: TestConcatenateGraphFilterFactory.java    From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    final String input = "A1 B2 A1 D4 C3";
    Reader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("ConcatenateGraph",
        "tokenSeparator", "\u001F"
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
  }
}
 
Example 5
Source File: TestConcatenatingTokenStream.java    From lucene-solr with Apache License 2.0
public void testBasic() throws IOException {
  AttributeFactory factory = newAttributeFactory();

  final MockTokenizer first = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  first.setReader(new StringReader("first words "));
  final MockTokenizer second = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  second.setReader(new StringReader("second words"));
  final MockTokenizer third = new MockTokenizer(factory, MockTokenizer.WHITESPACE, false);
  third.setReader(new StringReader(" third words"));

  TokenStream ts = new ConcatenatingTokenStream(first, second, new EmptyTokenStream(), third);
  assertTokenStreamContents(ts,
      new String[] { "first", "words", "second", "words", "third", "words" },
      new int[]{ 0, 6, 12, 19, 25, 31 },
      new int[]{ 5, 11, 18, 24, 30, 36 });

  // test re-use
  first.setReader(new StringReader("first words "));
  second.setReader(new StringReader("second words"));
  third.setReader(new StringReader(" third words"));
  assertTokenStreamContents(ts,
      new String[] { "first", "words", "second", "words", "third", "words" },
      new int[]{ 0, 6, 12, 19, 25, 31 },
      new int[]{ 5, 11, 18, 24, 30, 36 },
      new int[]{ 1, 1, 1, 1, 1, 1 });
}
 
Example 6
Source File: TestApostropheFilterFactory.java    From lucene-solr with Apache License 2.0
/**
 * Ensure the filter actually removes characters after an apostrophe.
 */
public void testApostrophes() throws Exception {
  Reader reader = new StringReader("Türkiye'de 2003'te Van Gölü'nü gördüm");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer) stream).setReader(reader);
  stream = tokenFilterFactory("Apostrophe").create(stream);
  assertTokenStreamContents(stream, new String[]{"Türkiye", "2003", "Van", "Gölü", "gördüm"});
}
 
Example 7
Source File: TestLatvianStemFilterFactory.java    From lucene-solr with Apache License 2.0
public void testStemming() throws Exception {
  Reader reader = new StringReader("tirgiem tirgus");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("LatvianStem").create(stream);
  assertTokenStreamContents(stream, new String[] { "tirg", "tirg" });
}
 
Example 8
Source File: TestICUNormalizer2Filter.java    From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
    }
  };
}
 
Example 9
Source File: QueryParserTestBase.java    From lucene-solr with Apache License 2.0
public void testQueryStringEscaping() throws Exception {
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);

  assertEscapedQueryEquals("a-b:c", a, "a\\-b\\:c");
  assertEscapedQueryEquals("a+b:c", a, "a\\+b\\:c");
  assertEscapedQueryEquals("a:b:c", a, "a\\:b\\:c");
  assertEscapedQueryEquals("a\\b:c", a, "a\\\\b\\:c");

  assertEscapedQueryEquals("a:b-c", a, "a\\:b\\-c");
  assertEscapedQueryEquals("a:b+c", a, "a\\:b\\+c");
  assertEscapedQueryEquals("a:b:c", a, "a\\:b\\:c");
  assertEscapedQueryEquals("a:b\\c", a, "a\\:b\\\\c");

  assertEscapedQueryEquals("a:b-c*", a, "a\\:b\\-c\\*");
  assertEscapedQueryEquals("a:b+c*", a, "a\\:b\\+c\\*");
  assertEscapedQueryEquals("a:b:c*", a, "a\\:b\\:c\\*");

  assertEscapedQueryEquals("a:b\\\\c*", a, "a\\:b\\\\\\\\c\\*");

  assertEscapedQueryEquals("a:b-?c", a, "a\\:b\\-\\?c");
  assertEscapedQueryEquals("a:b+?c", a, "a\\:b\\+\\?c");
  assertEscapedQueryEquals("a:b:?c", a, "a\\:b\\:\\?c");

  assertEscapedQueryEquals("a:b?c", a, "a\\:b\\?c");

  assertEscapedQueryEquals("a:b-c~", a, "a\\:b\\-c\\~");
  assertEscapedQueryEquals("a:b+c~", a, "a\\:b\\+c\\~");
  assertEscapedQueryEquals("a:b:c~", a, "a\\:b\\:c\\~");
  assertEscapedQueryEquals("a:b\\c~", a, "a\\:b\\\\c\\~");

  assertEscapedQueryEquals("[ a - TO a+ ]", null, "\\[ a \\- TO a\\+ \\]");
  assertEscapedQueryEquals("[ a : TO a~ ]", null, "\\[ a \\: TO a\\~ \\]");
  assertEscapedQueryEquals("[ a\\ TO a* ]", null, "\\[ a\\\\ TO a\\* \\]");
  
  // LUCENE-881
  assertEscapedQueryEquals("|| abc ||", a, "\\|\\| abc \\|\\|");
  assertEscapedQueryEquals("&& abc &&", a, "\\&\\& abc \\&\\&");
}
 
Example 10
Source File: ShingleAnalyzerWrapperTest.java    From lucene-solr with Apache License 2.0
public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
  ShingleAnalyzerWrapper analyzer
    = new ShingleAnalyzerWrapper(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3);
  assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                        new String[] { "please",   "please divide this", 
                                       "divide",   "divide this sentence", 
                                       "this",     "this sentence into",
                                       "sentence", "sentence into shingles",
                                       "into",
                                       "shingles" },
                        new int[] { 0,  0,  7,  7, 14, 14, 19, 19, 28, 33 },
                        new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
                        new int[] { 1,  0,  1,  0,  1,  0,  1,  0,  1,  1 });
  analyzer.close();

  analyzer = new ShingleAnalyzerWrapper(
      new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false), 3, 3,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, ShingleFilter.DEFAULT_FILLER_TOKEN);
  assertAnalyzesTo(analyzer, "please divide this sentence into shingles",
                        new String[] { "please divide this", 
                                       "divide this sentence", 
                                       "this sentence into",
                                       "sentence into shingles" },
                        new int[] {  0,  7, 14, 19 },
                        new int[] { 18, 27, 32, 41 },
                        new int[] {  1,  1,  1,  1 });
  analyzer.close();
}
 
Example 11
Source File: TestLatvianStemmer.java    From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new LatvianStemFilter(tokenizer));
    }
  };
}
 
Example 12
Source File: TestGalicianMinimalStemFilter.java    From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new GalicianMinimalStemFilter(tokenizer));
    }
  };
}
 
Example 13
Source File: TestSerbianNormalizationFilterFactory.java    From lucene-solr with Apache License 2.0
public void testRegularStemming() throws Exception {
  Reader reader = new StringReader("ђура");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("SerbianNormalization", "haircut", "regular").create(stream);
  assertTokenStreamContents(stream, new String[] { "đura" });
}
 
Example 14
Source File: CommonGramsFilterTest.java    From lucene-solr with Apache License 2.0
/**
 * Test CommonGramsQueryFilter in the case of a single word query
 */
public void testOneWordQuery() throws Exception {
  final String input = "monster";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "monster" });
}
 
Example 15
Source File: FuzzySuggesterTest.java    From lucene-solr with Apache License 2.0
/**
 * basic "standardanalyzer" test with stopword removal
 */
public void testStandard() throws Exception {
  Input keys[] = new Input[] {
      new Input("the ghost of christmas past", 50),
  };
  
  Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
  Directory tempDir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", standard, standard, AnalyzingSuggester.EXACT_FIRST | AnalyzingSuggester.PRESERVE_SEP, 256, -1, false, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
      FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
  suggester.build(new InputArrayIterator(keys));
  
  List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit 'the' since it's a stopword; the full phrase is still suggested
  results = suggester.lookup(TestUtil.stringToCharSequence("ghost of chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);

  // omit 'the' and 'of' since they are stopwords; the full phrase is still suggested
  results = suggester.lookup(TestUtil.stringToCharSequence("ghost chris", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("the ghost of christmas past", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
  
  IOUtils.close(standard, tempDir);
}
 
Example 16
Source File: HighlighterTest.java    From lucene-solr with Apache License 2.0
private void makeIndex() throws IOException {
  IndexWriter writer = new IndexWriter(dir1, new IndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
  writer.addDocument( doc( "t_text1", "more random words for second field del" ) );
  writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
  writer.addDocument( doc( "t_text1", "more random words for second field" ) );
  writer.forceMerge(1);
  writer.close();
}
 
Example 17
Source File: AnalyzingInfixSuggesterTest.java    From lucene-solr with Apache License 2.0
public void testBinaryContext() throws Exception {
  byte[] context1 = new byte[4];
  byte[] context2 = new byte[5];
  byte[] context3 = new byte[1];
  context3[0] = (byte) 0xff;

  Input keys[] = new Input[] {
    new Input("lend me your ear", 8, new BytesRef("foobar"), asSet(context1, context2)),
    new Input("a penny saved is a penny earned", 10, new BytesRef("foobaz"), asSet(context1, context3))
  };

  Path tempDir = createTempDir("analyzingInfixContext");

  for(int iter=0;iter<2;iter++) {
    AnalyzingInfixSuggester suggester;
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    if (iter == 0) {
      suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, 3, false);
      suggester.build(new InputArrayIterator(keys));
    } else {
      // Test again, after close/reopen:
      suggester = new AnalyzingInfixSuggester(newFSDirectory(tempDir), a, a, 3, false);
    }

    // Both have context1:
    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), asSet(context1), 10, true, true);
    assertEquals(2, results.size());

    LookupResult result = results.get(0);
    assertEquals("a penny saved is a penny earned", result.key);
    assertEquals("a penny saved is a penny <b>ear</b>ned", result.highlightKey);
    assertEquals(10, result.value);
    assertEquals(new BytesRef("foobaz"), result.payload);
    assertNotNull(result.contexts);
    assertEquals(2, result.contexts.size());
    assertTrue(result.contexts.contains(new BytesRef(context1)));
    assertTrue(result.contexts.contains(new BytesRef(context3)));

    result = results.get(1);
    assertEquals("lend me your ear", result.key);
    assertEquals("lend me your <b>ear</b>", result.highlightKey);
    assertEquals(8, result.value);
    assertEquals(new BytesRef("foobar"), result.payload);
    assertNotNull(result.contexts);
    assertEquals(2, result.contexts.size());
    assertTrue(result.contexts.contains(new BytesRef(context1)));
    assertTrue(result.contexts.contains(new BytesRef(context2)));

    suggester.close();
    a.close();
  }
}
 
Example 18
Source File: TestMultiAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  Tokenizer result = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  return new TokenStreamComponents(result, new TestPosIncrementFilter(result));
}
 
Example 19
Source File: TestSynonymMapFilter.java    From lucene-solr with Apache License 2.0
public void testMatching() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = false;
  add("a b", "ab", keepOrig);
  add("a c", "ac", keepOrig);
  add("a", "aa", keepOrig);
  add("b", "bb", keepOrig);
  add("z x c v", "zxcv", keepOrig);
  add("x c", "xc", keepOrig);
  final SynonymMap map = b.build();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
    }
  };

  checkOneTerm(a, "$", "$");
  checkOneTerm(a, "a", "aa");
  checkOneTerm(a, "b", "bb");
  
  assertAnalyzesTo(a, "a $",
     new String[] { "aa", "$" },
     new int[] { 1, 1 });
  
  assertAnalyzesTo(a, "$ a",
      new String[] { "$", "aa" },
      new int[] { 1, 1 });
  
  assertAnalyzesTo(a, "a a",
      new String[] { "aa", "aa" },
      new int[] { 1, 1 });
  
  assertAnalyzesTo(a, "z x c v",
      new String[] { "zxcv" },
      new int[] { 1 });
  
  assertAnalyzesTo(a, "z x c $",
      new String[] { "z", "xc", "$" },
      new int[] { 1, 1, 1 });
  a.close();
}
 
Example 20
Source File: TestCoreParser.java    From lucene-solr with Apache License 2.0
protected Analyzer newAnalyzer() {
  // TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too, for work with CURRENT):
  return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
}