Java Code Examples for org.apache.lucene.analysis.MockTokenizer

The following examples show how to use org.apache.lucene.analysis.MockTokenizer. These examples are extracted from open source projects.
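MockTokenizer is part of Lucene's test framework (the lucene-test-framework artifact): it tokenizes with one of three simple patterns (MockTokenizer.WHITESPACE, MockTokenizer.SIMPLE, or MockTokenizer.KEYWORD), optionally lowercases (the second constructor argument), and verifies that callers follow the TokenStream contract (reset, incrementToken, end, close). Most examples below repeat the same basic pattern; here is a minimal, self-contained sketch of it, assuming lucene-test-framework is on the classpath (the class name and sample text are illustrative, not taken from any example):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// hypothetical class name, for illustration only
public class MockTokenizerSketch {
  public static void main(String[] args) throws IOException {
    // WHITESPACE pattern, no lowercasing
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader("Hello MockTokenizer"));

    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();                        // required before consuming
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.toString()); // "Hello", then "MockTokenizer"
    }
    tokenizer.end();                          // required by the TokenStream contract
    tokenizer.close();
  }
}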
Example 1
Source Project: lucene-solr   Source File: TestCompoundWordTokenFilter.java   License: Apache License 2.0
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("ab", "cd", "ef");

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdef"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    tokenizer,
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  assertTokenStreamContents(tf,
    new String[] { "abcdef", "ab", "cd", "ef" },
    new int[] { 0, 0, 0, 0},
    new int[] { 6, 6, 6, 6},
    new int[] { 1, 0, 0, 0}
    );
}
 
Example 2
Source Project: lucene-solr   Source File: HighlighterTest.java   License: Apache License 2.0
public void testMaxSizeEndHighlight() throws Exception {
  TestHighlightRunner helper = new TestHighlightRunner() {
    @Override
    public void run() throws Exception {
      CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());
      TermQuery query = new TermQuery(new Term("text", "searchterm"));

      String text = "this is a text with searchterm in it";
      SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
      Highlighter hg = getHighlighter(query, "text", fm);
      hg.setTextFragmenter(new NullFragmenter());
      hg.setMaxDocCharsToAnalyze(36);
      String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
      assertTrue(
          "Matched text should contain remainder of text after highlighted query ",
          match.endsWith("in it"));
    }
  };
  helper.start();
}
 
Example 3
Source Project: lucene-solr   Source File: MockTokenizerFactory.java   License: Apache License 2.0
/** Creates a new MockTokenizerFactory */
public MockTokenizerFactory(Map<String,String> args) {
  super(args);
  String patternArg = get(args, "pattern", Arrays.asList("keyword", "simple", "whitespace"));
  if ("keyword".equalsIgnoreCase(patternArg)) {
    pattern = MockTokenizer.KEYWORD;
  } else if ("simple".equalsIgnoreCase(patternArg)) {
    pattern = MockTokenizer.SIMPLE;
  } else {
    pattern = MockTokenizer.WHITESPACE;
  }
  
  enableChecks = getBoolean(args, "enableChecks", true);
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
 
Example 4
Source Project: lucene-solr   Source File: QueryParserTestBase.java   License: Apache License 2.0
public void testStopwords() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
  CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
  Query result = getQuery("field:the OR field:foo",qp);
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery || result instanceof MatchNoDocsQuery);
  if (result instanceof BooleanQuery) {
    assertEquals(0, ((BooleanQuery) result).clauses().size());
  }
  result = getQuery("field:woo OR field:the",qp);
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a TermQuery", result instanceof TermQuery);
  result = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)",qp);
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a BoostQuery", result instanceof BoostQuery);
  result = ((BoostQuery) result).getQuery();
  assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
  if (VERBOSE) System.out.println("Result: " + result);
  assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 2, ((BooleanQuery) result).clauses().size() == 2);
}
 
Example 5
Source Project: lucene-solr   Source File: TestMaxTermFrequency.java   License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true))
                               .setMergePolicy(newLogMergePolicy());
  config.setSimilarity(new TestSimilarity());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
  Document doc = new Document();
  Field foo = newTextField("foo", "", Field.Store.NO);
  doc.add(foo);
  for (int i = 0; i < 100; i++) {
    foo.setStringValue(addValue());
    writer.addDocument(doc);
  }
  reader = writer.getReader();
  writer.close();
}
 
Example 6
Source Project: lucene-solr   Source File: TestQueryParser.java   License: Apache License 2.0
public void testFuzzySlopeExtendability() throws ParseException {
  QueryParser qp = new QueryParser("a",  new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)) {

    @Override
    Query handleBareFuzzy(String qfield, Token fuzzySlop, String termImage)
        throws ParseException {
      
      if(fuzzySlop.image.endsWith("€")) {
        float fms = fuzzyMinSim;
        try {
          fms = Float.parseFloat(fuzzySlop.image.substring(1, fuzzySlop.image.length()-1));
        } catch (Exception ignored) { }
        float value = Float.parseFloat(termImage);
        return getRangeQuery(qfield, Float.toString(value-fms/2.f), Float.toString(value+fms/2.f), true, true);
      }
      return super.handleBareFuzzy(qfield, fuzzySlop, termImage);
    }
    
  };
  assertEquals(qp.parse("a:[11.95 TO 12.95]"), qp.parse("12.45~1€"));
}
 
Example 7
Source Project: lucene-solr   Source File: TestUniqueTermCount.java   License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  IndexWriterConfig config = newIndexWriterConfig(analyzer);
  config.setMergePolicy(newLogMergePolicy());
  config.setSimilarity(new TestSimilarity());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
  Document doc = new Document();
  Field foo = newTextField("foo", "", Field.Store.NO);
  doc.add(foo);
  for (int i = 0; i < 100; i++) {
    foo.setStringValue(addValue());
    writer.addDocument(doc);
  }
  reader = writer.getReader();
  writer.close();
}
 
Example 8
Source Project: lucene-solr   Source File: TestPayloadCheckQuery.java   License: Apache License 2.0
@BeforeClass
public static void beforeClass() throws Exception {
  Analyzer simplePayloadAnalyzer = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        return new TokenStreamComponents(tokenizer, new SimplePayloadFilter(tokenizer));
      }
  };

  directory = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
      newIndexWriterConfig(simplePayloadAnalyzer)
          .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
  //writer.infoStream = System.out;
  for (int i = 0; i < 2000; i++) {
    Document doc = new Document();
    doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES));
    writer.addDocument(doc);
  }
  reader = writer.getReader();
  searcher = newSearcher(reader);
  writer.close();
}
 
Example 9
Source Project: lucene-solr   Source File: TestCompoundWordTokenFilter.java   License: Apache License 2.0
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  
  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
 
Example 10
Source Project: lucene-solr   Source File: TestPhraseQuery.java   License: Apache License 2.0
public void testPhraseQueryWithStopAnalyzer() throws Exception {
  Directory directory = newDirectory();
  Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  RandomIndexWriter writer = new RandomIndexWriter(random(), directory, 
      newIndexWriterConfig(stopAnalyzer));
  Document doc = new Document();
  doc.add(newTextField("field", "the stop words are here", Field.Store.YES));
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);

  // valid exact phrase query
  PhraseQuery query = new PhraseQuery("field", "stop", "words");
  ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
  assertEquals(1, hits.length);
  QueryUtils.check(random(), query,searcher);

  reader.close();
  directory.close();
}
 
Example 11
Source Project: lucene-solr   Source File: TestBeiderMorseFilter.java   License: Apache License 2.0
public void testCustomAttribute() throws IOException {
  TokenStream stream = new MockTokenizer(MockTokenizer.KEYWORD, false);
  ((Tokenizer)stream).setReader(new StringReader("D'Angelo"));
  stream = new PatternKeywordMarkerFilter(stream, Pattern.compile(".*"));
  stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
  stream.reset();
  int i = 0;
  while(stream.incrementToken()) {
    assertTrue(keyAtt.isKeyword());
    i++;
  }
  assertEquals(12, i);
  stream.end();
  stream.close();
}
 
Example 12
Source Project: lucene-solr   Source File: TestQPHelper.java   License: Apache License 2.0
public void testPositionIncrement() throws Exception {
  StandardQueryParser qp = new StandardQueryParser();
  qp.setAnalyzer(
      new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));

  qp.setEnablePositionIncrements(true);

  String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
  // stopwords occupy token positions 0, 2, 5, 7, 8 (encoded in "pos02578")
  int expectedPositions[] = { 1, 3, 4, 6, 9 };
  PhraseQuery pq = (PhraseQuery) qp.parse(qtxt, "a");
  // System.out.println("Query text: "+qtxt);
  // System.out.println("Result: "+pq);
  Term t[] = pq.getTerms();
  int pos[] = pq.getPositions();
  for (int i = 0; i < t.length; i++) {
    // System.out.println(i+". "+t[i]+"  pos: "+pos[i]);
    assertEquals("term " + i + " = " + t[i] + " has wrong term-position!",
        expectedPositions[i], pos[i]);
  }
}
 
Example 13
Source Project: lucene-solr   Source File: TestICUNormalizer2Filter.java   License: Apache License 2.0
public void testAlternate() throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(
          tokenizer,
          /* specify nfc with decompose to get nfd */
          Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE)));
    }
  };
  
  // decompose EAcute into E + combining Acute
  assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
  a.close();
}
 
Example 14
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final int numIters = atLeast(3);
  for (int i = 0; i < numIters; i++) {
    SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
    final int numEntries = atLeast(10);
    for (int j = 0; j < numEntries; j++) {
      add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
    }
    final SynonymMap map = b.build();
    final boolean ignoreCase = random().nextBoolean();
    
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        TokenStream stream = new SynonymGraphFilter(tokenizer, map, ignoreCase);
        return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
      }
    };

    checkRandomData(random(), analyzer, 200);
    analyzer.close();
  }
}
 
Example 15
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java   License: Apache License 2.0
public void testMassiveLigature() throws IOException {
  String input = "\uFDFA";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"صلى", "الله", "عليه", "وسلم"},
    new int[]{0, 0, 0, 0},
    new int[]{0, 0, 0, 1},
    input.length()
  );
}
 
Example 16
Source Project: lucene-solr   Source File: TestSynonymMapFilter.java   License: Apache License 2.0
public void testDontKeepOrig() throws Exception {
  b = new SynonymMap.Builder(true);
  add("a b", "foo", false);

  final SynonymMap map = b.build();

  final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, false));
    }
  };

  assertAnalyzesTo(analyzer, "a b c",
                   new String[] {"foo", "c"},
                   new int[] {0, 4},
                   new int[] {3, 5},
                   null,
                   new int[] {1, 1},
                   new int[] {1, 1},
                   true);
  checkAnalysisConsistency(random(), analyzer, false, "a b c");
  analyzer.close();
}
 
Example 17
Source Project: lucene-solr   Source File: TestDirectSpellChecker.java   License: Apache License 2.0
public void testBogusField() throws Exception {
  DirectSpellChecker spellChecker = new DirectSpellChecker();
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);

  for (int i = 0; i < 20; i++) {
    Document doc = new Document();
    doc.add(newTextField("numbers", English.intToEnglish(i), Field.Store.NO));
    writer.addDocument(doc);
  }

  IndexReader ir = writer.getReader();

  SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
      "bogusFieldBogusField", "fvie"), 2, ir,
      SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
  assertEquals(0, similar.length);
  
  IOUtils.close(ir, writer, dir, analyzer);
}
 
Example 18
Source Project: lucene-solr   Source File: TestFrenchLightStemFilter.java   License: Apache License 2.0
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet( asSet("chevaux"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new FrenchLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "chevaux", "chevaux");
  a.close();
}
 
Example 19
/** Test that invalid arguments result in exception */
public void testInvalidArguments() throws Exception {
  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    Reader reader = new StringReader("foo foobar super-duper-trooper");
    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    ((Tokenizer)stream).setReader(reader);
    tokenFilterFactory("CodepointCount",
        CodepointCountFilterFactory.MIN_KEY, "5",
        CodepointCountFilterFactory.MAX_KEY, "4").create(stream);
  });
  assertTrue(expected.getMessage().contains("maximum length must not be greater than minimum length"));
}
 
Example 20
Source Project: lucene-solr   Source File: TestSynonymGraphFilter.java   License: Apache License 2.0
public void testZeroSyns() throws Exception {
  Tokenizer tokenizer = new MockTokenizer();
  tokenizer.setReader(new StringReader("aa bb"));

  IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () ->
      new SynonymGraphFilter(tokenizer, new SynonymMap.Builder(true).build(), true));
  assertEquals("fst must be non-null", ex.getMessage());
}
 
Example 21
Source Project: lucene-solr   Source File: TestHunspellStemFilter.java   License: Apache License 2.0
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary));
    }  
  };
  checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
  analyzer.close();
}
 
Example 22
Source Project: lucene-solr   Source File: TestHungarianLightStemFilter.java   License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new HungarianLightStemFilter(source));
    }
  };
}
 
Example 23
Source Project: lucene-solr   Source File: TestKStemmer.java   License: Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
      return new TokenStreamComponents(tokenizer, new KStemFilter(tokenizer));
    }
  };
}
 
Example 24
Source Project: lucene-solr   Source File: TestCapitalizationFilter.java   License: Apache License 2.0
static void assertCapitalizesTo(String input, String expected[],
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  final MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  assertCapitalizesTo(tokenizer,
      expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength,
      maxWordCount, maxTokenLength);
}
 
Example 25
Source Project: lucene-solr   Source File: HighlighterPhraseTest.java   License: Apache License 2.0
public void testSparsePhraseWithNoPositions() throws IOException, InvalidTokenOffsetsException {
  final String TEXT = "the fox did not jump";
  final Directory directory = newDirectory();
  final IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  try {
    final Document document = new Document();

    FieldType customType = new FieldType(TextField.TYPE_STORED);
    customType.setStoreTermVectorOffsets(true);
    customType.setStoreTermVectors(true);
    document.add(new Field(FIELD, TEXT, customType));
    indexWriter.addDocument(document);
  } finally {
    indexWriter.close();
  }
  final IndexReader indexReader = DirectoryReader.open(directory);
  try {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    final PhraseQuery phraseQuery = new PhraseQuery(1, FIELD, "did", "jump");
    TopDocs hits = indexSearcher.search(phraseQuery, 1);
    assertEquals(1, hits.totalHits.value);
    final Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
        new QueryScorer(phraseQuery));
    final TokenStream tokenStream =
        TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
    assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
        .getBestFragment(tokenStream, TEXT));
  } finally {
    indexReader.close();
    directory.close();
  }
}
 
Example 26
/** Test stemming with variant set explicitly to Bokmål */
public void testBokmaalStemming() throws Exception {
  Reader reader = new StringReader("eple eplet epler eplene eplets eplenes");
  TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer)stream).setReader(reader);
  stream = tokenFilterFactory("NorwegianMinimalStem", "variant", "nb").create(stream);
  assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" });
}
 
Example 27
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      final TokenStream stream = new SerbianNormalizationRegularFilter(tokenizer);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
}
 
Example 28
Source Project: lucene-solr   Source File: HighlighterPhraseTest.java   License: Apache License 2.0
public void testInOrderWithStopWords() throws IOException, InvalidTokenOffsetsException {
  MockAnalyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true,
      MockTokenFilter.ENGLISH_STOPSET);        
  final String TEXT = "the cd the ab the the the the the the the ab the cd the";
  final Directory directory = newDirectory();
  try (IndexWriter indexWriter = new IndexWriter(directory,
      newIndexWriterConfig(stopAnalyzer))) {
    final Document document = new Document();
    document.add(newTextField(FIELD, TEXT, Store.YES));
    indexWriter.addDocument(document);
  }
  try (IndexReader indexReader = DirectoryReader.open(directory)) {
    assertEquals(1, indexReader.numDocs());
    final IndexSearcher indexSearcher = newSearcher(indexReader);
    //equivalent of "ab the cd"
    final PhraseQuery phraseQuery = new PhraseQuery.Builder()
        .add(new Term(FIELD, "ab"), 0)
        .add(new Term(FIELD, "cd"), 2).build();

    TopDocs hits = indexSearcher.search(phraseQuery, 100);
    assertEquals(1, hits.totalHits.value);

    final Highlighter highlighter = new Highlighter(
        new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
        new QueryScorer(phraseQuery));
    String[] frags = highlighter.getBestFragments(stopAnalyzer, FIELD, TEXT, 10);
    assertEquals(1, frags.length);
    assertTrue("contains <B>ab</B> the <B>cd</B>",
        (frags[0].contains("<B>ab</B> the <B>cd</B>")));
    assertTrue("does not contain <B>cd</B> the <B>ab</B>",
        (!frags[0].contains("<B>cd</B> the <B>ab</B>")));
  } finally {
    directory.close();
  }
}
 
Example 29
Source Project: lucene-solr   Source File: TestArabicStemFilter.java   License: Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("ساهدهات");
  MockTokenizer tokenStream  = whitespaceMockTokenizer("ساهدهات");

  ArabicStemFilter filter = new ArabicStemFilter(new SetKeywordMarkerFilter(tokenStream, set));
  assertTokenStreamContents(filter, new String[]{"ساهدهات"});
}
 
Example 30
Source Project: lucene-solr   Source File: TestMultipleIndexFields.java   License: Apache License 2.0
@Test
public void testDefault() throws Exception {
  Directory indexDir = newDirectory();
  Directory taxoDir = newDirectory();
  
  // create and open an index writer
  RandomIndexWriter iw = new RandomIndexWriter(random(), indexDir, newIndexWriterConfig(
      new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
  // create and open a taxonomy writer
  TaxonomyWriter tw = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
  FacetsConfig config = getConfig();

  seedIndex(tw, iw, config);

  IndexReader ir = iw.getReader();
  tw.commit();

  // prepare index reader and taxonomy.
  TaxonomyReader tr = new DirectoryTaxonomyReader(taxoDir);

  // prepare searcher to search against
  IndexSearcher searcher = newSearcher(ir);

  FacetsCollector sfc = performSearch(tr, ir, searcher);

  // Obtain facets results and hand-test them
  assertCorrectResults(getTaxonomyFacetCounts(tr, config, sfc));

  assertOrdinalsExist("$facets", ir);

  iw.close();
  IOUtils.close(tr, ir, tw, indexDir, taxoDir);
}