Java Code Examples for org.apache.lucene.analysis.MockTokenizer#KEYWORD

The following examples show how to use org.apache.lucene.analysis.MockTokenizer#KEYWORD. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: FuzzySuggesterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a fuzzy suggester over random "boo*" keys plus one known key, then verifies that
 * random single edits of the known key's prefix still find it as the sole suggestion.
 */
public void testRandomEdits() throws IOException {
  List<Input> inputs = new ArrayList<>();
  int termCount = atLeast(100);
  for (int term = 0; term < termCount; term++) {
    inputs.add(new Input("boo" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
  }
  inputs.add(new Input("foo bar boo far", 12));
  MockAnalyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory dir = getDirectory();
  // 256 = max surface forms per analyzed form, -1 = unlimited graph expansions,
  // 0 = non-fuzzy prefix so every position may be edited.
  FuzzySuggester suggester =
      new FuzzySuggester(dir, "fuzzy", keywordAnalyzer, keywordAnalyzer,
          FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true,
          FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
          0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
  suggester.build(new InputArrayIterator(inputs));
  int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    String edited = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);
    List<LookupResult> results =
        suggester.lookup(TestUtil.stringToCharSequence(edited, random()), false, 2);
    assertEquals(edited, 1, results.size());
    assertEquals("foo bar boo far", results.get(0).key.toString());
    assertEquals(12, results.get(0).value, 0.01F);
  }
  IOUtils.close(keywordAnalyzer, dir);
}
 
Example 2
Source File: FuzzySuggesterTest.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Same scenario as testRandomEdits but with Cyrillic keys; unicodeAware is enabled so the
 * automaton edits operate on code points rather than UTF-8 bytes.
 */
public void testNonLatinRandomEdits() throws IOException {
  List<Input> inputs = new ArrayList<>();
  int termCount = atLeast(100);
  for (int term = 0; term < termCount; term++) {
    inputs.add(new Input("буу" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
  }
  inputs.add(new Input("фуу бар буу фар", 12));
  MockAnalyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory dir = getDirectory();
  FuzzySuggester suggester =
      new FuzzySuggester(dir, "fuzzy", keywordAnalyzer, keywordAnalyzer,
          FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true,
          FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
          0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, true);
  suggester.build(new InputArrayIterator(inputs));
  int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    // Non-fuzzy prefix of 0: every code point of the query may be edited.
    String edited = addRandomEdit("фуу бар буу", 0);
    List<LookupResult> results =
        suggester.lookup(TestUtil.stringToCharSequence(edited, random()), false, 2);
    assertEquals(edited, 1, results.size());
    assertEquals("фуу бар буу фар", results.get(0).key.toString());
    assertEquals(12, results.get(0).value, 0.01F);
  }
  IOUtils.close(keywordAnalyzer, dir);
}
 
Example 3
Source File: MockTokenizerFactory.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new MockTokenizerFactory.
 * Recognized args: "pattern" (one of keyword/simple/whitespace, default whitespace)
 * and "enableChecks" (boolean, default true). Any leftover arg is rejected.
 */
public MockTokenizerFactory(Map<String,String> args) {
  super(args);
  String patternArg = get(args, "pattern", Arrays.asList("keyword", "simple", "whitespace"));
  // The three values are mutually exclusive, so branch order does not matter;
  // anything that is not "simple" or "keyword" falls back to whitespace.
  if ("simple".equalsIgnoreCase(patternArg)) {
    pattern = MockTokenizer.SIMPLE;
  } else if ("keyword".equalsIgnoreCase(patternArg)) {
    pattern = MockTokenizer.KEYWORD;
  } else {
    pattern = MockTokenizer.WHITESPACE;
  }

  enableChecks = getBoolean(args, "enableChecks", true);
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
 
Example 4
Source File: TestBeiderMorseFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that tokens marked as keywords by PatternKeywordMarkerFilter keep their
 * KeywordAttribute through BeiderMorseFilter, and that "D'Angelo" expands to 12 phonetic forms.
 */
public void testCustomAttribute() throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
  tokenizer.setReader(new StringReader("D'Angelo"));
  // ".*" marks every token as a keyword before the phonetic expansion.
  TokenStream stream = new PatternKeywordMarkerFilter(tokenizer, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  int tokenCount = 0;
  while (stream.incrementToken()) {
    assertTrue(keyAtt.isKeyword());
    tokenCount++;
  }
  assertEquals(12, tokenCount);
  stream.end();
  stream.close();
}
 
Example 5
Source File: WordDelimiterFilter2Tests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Runs the input through WordDelimiterFilter2 with case/numeric splitting enabled,
 * optionally stripping English possessives, and checks the resulting tokens.
 */
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
    if (stemPossessive == 1) {
        // Also strip trailing 's when requested.
        flags |= STEM_ENGLISH_POSSESSIVE;
    }
    Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
    source.setReader(new StringReader(input));
    assertTokenStreamContents(new WordDelimiterFilter2(source, flags, null), output);
}
 
Example 6
Source File: TestMoreLikeThis.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that MoreLikeThis over multiple field values produces exactly one clause per
 * distinct interesting term ("lucene" and "apache"), not one per value occurrence.
 */
public void testMultiValues() throws Exception {
  // Keyword analyzer: each reader's whole content becomes a single term.
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  try {
    mlt.setAnalyzer(analyzer);
    mlt.setFieldNames(new String[] {"text"});

    BooleanQuery query = (BooleanQuery) mlt.like("text",
        new StringReader("lucene"), new StringReader("lucene release"),
        new StringReader("apache"), new StringReader("apache lucene"));
    Collection<BooleanClause> clauses = query.clauses();
    assertEquals("Expected 2 clauses only!", 2, clauses.size());
    for (BooleanClause clause : clauses) {
      Term term = ((TermQuery) clause.getQuery()).getTerm();
      assertTrue(Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
    }
  } finally {
    // BUGFIX: the analyzer was previously leaked; every sibling test closes its analyzer.
    analyzer.close();
  }
}
 
Example 7
Source File: WordDelimiterFilter2Tests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Runs the input through WordDelimiterFilter2 with the default delimiter table and all
 * standard split flags (including possessive stemming), then checks the resulting tokens.
 */
public void doSplit(final String input, String... output) throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE
            | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
    source.setReader(new StringReader(input));
    assertTokenStreamContents(
            new WordDelimiterFilter2(source, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null),
            output);
}
 
Example 8
Source File: TestFuzzyQuery.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes a fixed set of surname-like terms with a keyword analyzer and checks that a
 * fuzzy search for "WEBER" (max edits 2, prefix length 1) matches exactly 8 documents.
 */
public void test2() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
  for (String term : new String[] {
      "LANGE", "LUETH", "PIRSING", "RIEGEL", "TRZECZIAK", "WALKER", "WBR", "WE", "WEB",
      "WEBE", "WEBER", "WEBERE", "WEBREE", "WEBEREI", "WBRE", "WITTKOPF", "WOJNAROWSKI",
      "WRICKE"}) {
    addDoc(term, writer);
  }

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);
  writer.close();

  // maxEdits=2, prefixLength=1: the first letter must match exactly.
  FuzzyQuery query = new FuzzyQuery(new Term("field", "WEBER"), 2, 1);
  ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
  assertEquals(8, hits.length);

  reader.close();
  directory.close();
}
 
Example 9
Source File: TestSimpleQueryParser.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Helper: parses {@code text} against the single field "field" using a keyword analyzer. */
private Query parseKeyword(String text, int flags) {
  Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  SimpleQueryParser parser =
      new SimpleQueryParser(keywordAnalyzer, Collections.singletonMap("field", 1f), flags);
  return parser.parse(text);
}
 
Example 10
Source File: TestTrimFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Blasts random strings through a keyword-tokenizer + TrimFilter analyzer as a smoke test. */
public void testRandomStrings() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source, new TrimFilter(source));
    }
  };
  checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
  analyzer.close();
}
 
Example 11
Source File: TestCapitalizationFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the whole input, tokenized as a single keyword token, capitalizes to
 * exactly {@code expected} under the given capitalization settings.
 */
static void assertCapitalizesToKeyword(String input, String expected,
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  // Keyword mode: the entire input string becomes one token.
  final MockTokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
  source.setReader(new StringReader(input));
  assertCapitalizesTo(source, new String[] { expected },
      onlyFirstWord, keep, forceFirstLetter, okPrefix,
      minWordLength, maxWordCount, maxTokenLength);
}
 
Example 12
Source File: TestPorterStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public void setUp() throws Exception {
  super.setUp();
  // Keyword tokenizer feeds each whole input string into the Porter stemmer.
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source, new PorterStemFilter(source));
    }
  };
}
 
Example 13
Source File: TestSoraniNormalizationFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public void setUp() throws Exception {
  super.setUp();
  // Keyword tokenizer feeds each whole input string into the Sorani normalizer.
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source, new SoraniNormalizationFilter(source));
    }
  };
}
 
Example 14
Source File: TestGermanStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public void setUp() throws Exception {
  super.setUp();
  // Keyword tokenizer -> lowercase -> German stemmer, one token per input string.
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source,
          new GermanStemFilter(new LowerCaseFilter(source)));
    }
  };
}
 
Example 15
Source File: TestJapaneseIterationMarkCharFilterFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that the iteration-mark char filter expands kanji/kana repetition marks
 * before tokenization; the keyword tokenizer keeps the result as one token.
 */
public void testIterationMarksWithKeywordTokenizer() throws IOException {
  final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
  JapaneseIterationMarkCharFilterFactory filterFactory =
      new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
  Reader filtered = filterFactory.create(new StringReader(text));
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
  tokenizer.setReader(filtered);
  assertTokenStreamContents(tokenizer, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
}
 
Example 16
Source File: FuzzySuggesterTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** A suggester built from zero inputs must return no results for any lookup. */
public void testEmpty() throws Exception {
  Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory dir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(dir, "fuzzy", keywordAnalyzer);
  suggester.build(new InputArrayIterator(new Input[0]));

  List<LookupResult> result = suggester.lookup("a", false, 20);
  assertTrue(result.isEmpty());
  IOUtils.close(keywordAnalyzer, dir);
}
 
Example 17
Source File: AnalyzingSuggesterTest.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a suggester over keys with payloads (including duplicate surface forms) and checks
 * dedup, exact-first ordering, weight ordering, and payload round-tripping. The lookups are
 * repeated to make sure results are stable.
 */
public void testKeywordWithPayloads() throws Exception {
  Iterable<Input> keys = shuffle(
    new Input("foo", 50, new BytesRef("hello")),
    new Input("bar", 10, new BytesRef("goodbye")),
    new Input("barbar", 12, new BytesRef("thank you")),
    new Input("bar", 9, new BytesRef("should be deduplicated")),
    new Input("bar", 8, new BytesRef("should also be deduplicated")),
    new Input("barbara", 6, new BytesRef("for all the fish")));

  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);
  suggester.build(new InputArrayIterator(keys));

  for (int iter = 0; iter < 2; iter++) {
    // Prefix "f": only foo matches, even with top N of 2.
    List<LookupResult> results =
        suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
    assertEquals(1, results.size());
    LookupResult hit = results.get(0);
    assertEquals("foo", hit.key.toString());
    assertEquals(50, hit.value, 0.01F);
    assertEquals(new BytesRef("hello"), hit.payload);

    // Exact match "bar" wins top N of 1 over the higher-weighted barbar (exactFirst).
    results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
    assertEquals(1, results.size());
    hit = results.get(0);
    assertEquals("bar", hit.key.toString());
    assertEquals(10, hit.value, 0.01F);
    assertEquals(new BytesRef("goodbye"), hit.payload);

    // Prefix "b", top N of 2: ordered by weight.
    results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
    assertEquals(2, results.size());
    hit = results.get(0);
    assertEquals("barbar", hit.key.toString());
    assertEquals(12, hit.value, 0.01F);
    assertEquals(new BytesRef("thank you"), hit.payload);
    hit = results.get(1);
    assertEquals("bar", hit.key.toString());
    assertEquals(10, hit.value, 0.01F);
    assertEquals(new BytesRef("goodbye"), hit.payload);

    // Prefix "ba", top N of 3: all three distinct keys, by weight.
    results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
    assertEquals(3, results.size());
    hit = results.get(0);
    assertEquals("barbar", hit.key.toString());
    assertEquals(12, hit.value, 0.01F);
    assertEquals(new BytesRef("thank you"), hit.payload);
    hit = results.get(1);
    assertEquals("bar", hit.key.toString());
    assertEquals(10, hit.value, 0.01F);
    assertEquals(new BytesRef("goodbye"), hit.payload);
    hit = results.get(2);
    assertEquals("barbara", hit.key.toString());
    assertEquals(6, hit.value, 0.01F);
    assertEquals(new BytesRef("for all the fish"), hit.payload);
  }
  IOUtils.close(analyzer, tempDir);
}
 
Example 18
Source File: AnalyzingSuggesterTest.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * The WFST suggester test ported to a keyword analyzer: duplicate surface forms keep only
 * their highest weight, exactFirst promotes exact matches, and results order by weight.
 */
public void testKeyword() throws Exception {
  Iterable<Input> keys = shuffle(
      new Input("foo", 50),
      new Input("bar", 10),
      new Input("barbar", 10),
      new Input("barbar", 12),
      new Input("barbara", 6),
      new Input("bar", 5),
      new Input("barbara", 1)
  );

  Directory tempDir = getDirectory();

  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);
  suggester.build(new InputArrayIterator(keys));

  // Prefix "f": only foo matches, even with top N of 2.
  List<LookupResult> results =
      suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
  assertEquals(1, results.size());
  LookupResult hit = results.get(0);
  assertEquals("foo", hit.key.toString());
  assertEquals(50, hit.value, 0.01F);

  // Exact match "bar" wins top N of 1 over the higher-weighted barbar (exactFirst).
  results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
  assertEquals(1, results.size());
  hit = results.get(0);
  assertEquals("bar", hit.key.toString());
  assertEquals(10, hit.value, 0.01F);

  // Prefix "b", top N of 2: ordered by (deduplicated) weight.
  results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
  assertEquals(2, results.size());
  hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);
  hit = results.get(1);
  assertEquals("bar", hit.key.toString());
  assertEquals(10, hit.value, 0.01F);

  // Prefix "ba", top N of 3: all three distinct keys, by weight.
  results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
  assertEquals(3, results.size());
  hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);
  hit = results.get(1);
  assertEquals("bar", hit.key.toString());
  assertEquals(10, hit.value, 0.01F);
  hit = results.get(2);
  assertEquals("barbara", hit.key.toString());
  assertEquals(6, hit.value, 0.01F);

  IOUtils.close(analyzer, tempDir);
}
 
Example 19
Source File: FuzzySuggesterTest.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * The WFST suggester test ported to a keyword analyzer, exercised through the fuzzy
 * suggester: misspelled queries within the default edit distance still find the keys,
 * and exact-prefix behavior (exactFirst, weight ordering) matches the non-fuzzy test.
 */
public void testKeyword() throws Exception {
  Input keys[] = new Input[] {
      new Input("foo", 50),
      new Input("bar", 10),
      new Input("barbar", 12),
      new Input("barbara", 6)
  };

  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory tempDir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer);
  suggester.build(new InputArrayIterator(keys));

  // Misspellings within the default edit distance still rank barbar first.
  List<LookupResult> results =
      suggester.lookup(TestUtil.stringToCharSequence("bariar", random()), false, 2);
  assertEquals(2, results.size());
  LookupResult hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);

  results = suggester.lookup(TestUtil.stringToCharSequence("barbr", random()), false, 2);
  assertEquals(2, results.size());
  hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);

  results = suggester.lookup(TestUtil.stringToCharSequence("barbara", random()), false, 2);
  assertEquals(2, results.size());
  hit = results.get(0);
  assertEquals("barbara", hit.key.toString());
  assertEquals(6, hit.value, 0.01F);

  results = suggester.lookup(TestUtil.stringToCharSequence("barbar", random()), false, 2);
  assertEquals(2, results.size());
  hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);
  hit = results.get(1);
  assertEquals("barbara", hit.key.toString());
  assertEquals(6, hit.value, 0.01F);

  results = suggester.lookup(TestUtil.stringToCharSequence("barbaa", random()), false, 2);
  assertEquals(2, results.size());
  hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);
  hit = results.get(1);
  assertEquals("barbara", hit.key.toString());
  assertEquals(6, hit.value, 0.01F);

  // Prefix "f": only foo matches, even with top N of 2.
  results = suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
  assertEquals(1, results.size());
  hit = results.get(0);
  assertEquals("foo", hit.key.toString());
  assertEquals(50, hit.value, 0.01F);

  // Exact match "bar" wins top N of 1 over the higher-weighted barbar (exactFirst).
  results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
  assertEquals(1, results.size());
  hit = results.get(0);
  assertEquals("bar", hit.key.toString());
  assertEquals(10, hit.value, 0.01F);

  // Prefix "b", top N of 2: ordered by weight.
  results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
  assertEquals(2, results.size());
  hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);
  hit = results.get(1);
  assertEquals("bar", hit.key.toString());
  assertEquals(10, hit.value, 0.01F);

  // Prefix "ba", top N of 3: all three keys, by weight.
  results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
  assertEquals(3, results.size());
  hit = results.get(0);
  assertEquals("barbar", hit.key.toString());
  assertEquals(12, hit.value, 0.01F);
  hit = results.get(1);
  assertEquals("bar", hit.key.toString());
  assertEquals(10, hit.value, 0.01F);
  hit = results.get(2);
  assertEquals("barbara", hit.key.toString());
  assertEquals(6, hit.value, 0.01F);

  IOUtils.close(analyzer, tempDir);
}