Java Code Examples for org.apache.lucene.analysis.en.EnglishAnalyzer#ENGLISH_STOP_WORDS_SET

The following examples show how to use org.apache.lucene.analysis.en.EnglishAnalyzer#ENGLISH_STOP_WORDS_SET. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestThaiAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Analyzes Thai text containing the English word "the" with a {@link ThaiAnalyzer}
 * configured with {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}, and checks the
 * expected tokens and the three expected integer arrays for each input.
 * <p>
 * In both inputs "the" is absent from the expected tokens, and the token that follows
 * it ("แสดง") has the value 2 in the last array, while every other token has 1.
 */
public void testPositionIncrements() throws Exception {
  // try-with-resources guarantees the analyzer is closed even when an assertion fails
  try (ThaiAnalyzer analyzer = new ThaiAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)) {
    // case that the stopword is separated from the thai text by whitespace
    assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี",
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
        new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
        new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
        new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });

    // case that a stopword is adjacent to thai text, with no whitespace
    assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี",
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
        new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
        new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
        new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
  }
}
 
Example 2
Source File: CommonGramsFilterFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Populates {@code commonWords}.
 * <p>
 * When no common-word files are configured, the set falls back to
 * {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}. Otherwise the files are read through
 * {@code getSnowballWordSet} if {@code format} equals "snowball" (ignoring case), and
 * through {@code getWordSet} for any other format.
 *
 * @param loader resource loader handed to the word-set readers
 * @throws IOException declared by this method; presumably raised when a word file cannot be read
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
  // No files configured: use the built-in English stop words.
  if (commonWordFiles == null) {
    commonWords = EnglishAnalyzer.ENGLISH_STOP_WORDS_SET;
    return;
  }

  final boolean snowballFormat = "snowball".equalsIgnoreCase(format);
  if (snowballFormat) {
    commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
    return;
  }

  commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
}
 
Example 3
Source File: TestAnalyzers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Exercises a {@link StopAnalyzer} built from {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}.
 * <p>
 * The assertions expect analysis to produce lowercased tokens and to omit
 * "a", "such" and "THESE", and expect {@code normalize} to lowercase its input
 * while still returning "the" unchanged.
 */
public void testStop() throws Exception {
  // try-with-resources guarantees the analyzer is closed even when an assertion fails
  try (Analyzer a = new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)) {
    assertAnalyzesTo(a, "foo bar FOO BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertAnalyzesTo(a, "foo a bar such FOO THESE BAR",
                     new String[] { "foo", "bar", "foo", "bar" });
    assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
    assertEquals(new BytesRef("the"), a.normalize("dummy", "the"));
  }
}
 
Example 4
Source File: TestAnalyzers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Blasts some random strings through each of the analyzers under test.
 * <p>
 * Runs {@code checkRandomData} with {@code 100*RANDOM_MULTIPLIER} as its third argument
 * against a whitespace, simple, stop and unicode-whitespace analyzer.
 */
public void testRandomStrings() throws Exception {
  Analyzer[] analyzers = new Analyzer[] { new WhitespaceAnalyzer(), new SimpleAnalyzer(),
      new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET), new UnicodeWhitespaceAnalyzer() };
  try {
    for (Analyzer analyzer : analyzers) {
      checkRandomData(random(), analyzer, 100*RANDOM_MULTIPLIER);
    }
  } finally {
    // release every analyzer even if one of the random-data checks fails
    IOUtils.close(analyzers);
  }
}
 
Example 5
Source File: TestAnalyzers.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Blasts some random large strings through each of the analyzers under test.
 * <p>
 * Runs {@code checkRandomData} with {@code 10*RANDOM_MULTIPLIER} and {@code 8192} as its
 * trailing arguments against a whitespace, simple, stop and unicode-whitespace analyzer.
 */
public void testRandomHugeStrings() throws Exception {
  Analyzer[] analyzers = new Analyzer[] { new WhitespaceAnalyzer(), new SimpleAnalyzer(),
      new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET), new UnicodeWhitespaceAnalyzer() };
  try {
    for (Analyzer analyzer : analyzers) {
      checkRandomData(random(), analyzer, 10*RANDOM_MULTIPLIER, 8192);
    }
  } finally {
    // release every analyzer even if one of the random-data checks fails
    IOUtils.close(analyzers);
  }
}
 
Example 6
Source File: TestStopAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the fixture: after the superclass set-up, copies every entry of
 * {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET} into {@code inValidTokens} and
 * assigns {@code stop} a new {@link StopAnalyzer} built from that same set.
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // Record each stop word in inValidTokens.
  for (Object stopWord : EnglishAnalyzer.ENGLISH_STOP_WORDS_SET) {
    inValidTokens.add(stopWord);
  }

  stop = new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example 7
Source File: StandardHtmlStripAnalyzer.java    From crate with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an analyzer by passing {@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET}
 * to the superclass constructor.
 *
 * @deprecated use {@link StandardHtmlStripAnalyzer#StandardHtmlStripAnalyzer(CharArraySet)} instead
 */
@Deprecated
public StandardHtmlStripAnalyzer() {
    super(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example 8
Source File: StandardAnalyzer.java    From datawave with Apache License 2.0 2 votes vote down vote up
/**
 * Builds an analyzer with the default stop words ({@link EnglishAnalyzer#ENGLISH_STOP_WORDS_SET})
 * by delegating to another constructor of this class.
 * <p>
 * This constructor hides the matchVersion parameter, which consumers should not always have to be
 * concerned with. Generally, matchVersion will be set to the current Lucene version.
 */
public StandardAnalyzer() {
    this(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
}