org.apache.lucene.analysis.core.StopAnalyzer Java Examples

The following examples show how to use org.apache.lucene.analysis.core.StopAnalyzer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: StandardAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the provider and configures its {@link StandardAnalyzer}.
 *
 * <p>The default stop-word set depends on the version the index was created with:
 * indices on or after {@code Version.V_1_0_0_Beta1} default to an empty set, older
 * indices default to {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}. The default is
 * handed to {@code Analysis.parseStopWords} together with the analyzer settings.
 *
 * @param index         the index this analyzer belongs to
 * @param indexSettings index-level settings, used to read the index creation version
 * @param env           environment passed to the stop-word parser
 * @param name          analyzer name
 * @param settings      analyzer settings; {@code max_token_length} is read from here
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    // Pick the fallback stop words according to the index creation version.
    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet effectiveStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);
    final int tokenLengthLimit = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(effectiveStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}
 
Example #2
Source File: PatternAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the provider and builds its {@code PatternAnalyzer}.
 *
 * <p>Reads {@code lowercase} (default {@code true}), the stop words, {@code pattern}
 * (default {@code \W+}) and {@code flags} from the analyzer settings. The default
 * stop-word set is empty for indices created on or after {@code Version.V_1_0_0_RC1}
 * and {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET} for older ones.
 *
 * @throws IllegalArgumentException if the resolved pattern string is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet fallbackStopWords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final boolean toLowerCase = settings.getAsBoolean("lowercase", true);
    final CharArraySet effectiveStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    // Default mirrors PatternAnalyzer.NON_WORD_PATTERN.
    final String patternSource = settings.get("pattern", "\\W+");
    // NOTE(review): with a non-null default this guard looks unreachable - confirm Settings.get semantics.
    if (patternSource == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }

    analyzer = new PatternAnalyzer(Regex.compile(patternSource, settings.get("flags")), toLowerCase, effectiveStopWords);
}
 
Example #3
Source File: TestTextField.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Exercises {@code TextField.analyzeMultiTerm} with zero, one and two resulting terms.
 */
@Test
public void testAnalyzeMultiTerm() {
  // Zero terms: the only input word is a stop word, so nothing survives analysis.
  // This is supported; the call must return null rather than throw.
  final BytesRef stopWordResult =
      TextField.analyzeMultiTerm("field", "the", new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
  assertNull(stopWordResult);

  // One term: the regular case; the single whitespace-delimited term is returned as-is.
  final BytesRef singleTermResult = TextField.analyzeMultiTerm("field", "Sol", new WhitespaceAnalyzer());
  assertEquals("Sol", singleTermResult.utf8ToString());

  // Two terms: not allowed for the multi-term part; a BAD_REQUEST SolrException is expected.
  final SolrException thrown = expectThrows(
      SolrException.class,
      () -> TextField.analyzeMultiTerm("field", "term1 term2", new WhitespaceAnalyzer()));
  assertEquals("Unexpected error code", SolrException.ErrorCode.BAD_REQUEST.code, thrown.code());
}
 
Example #4
Source File: StandardHtmlStripAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the provider and builds its {@code StandardHtmlStripAnalyzer}.
 *
 * <p>The default stop-word set is empty for indices created on or after
 * {@code Version.V_1_0_0_RC1} and {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET} for
 * older ones; the default is handed to {@code Analysis.parseStopWords} together with
 * the analyzer settings.
 */
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,  @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());

    // Pick the fallback stop words according to the index creation version.
    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet effectiveStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);
    analyzer = new StandardHtmlStripAnalyzer(effectiveStopWords);
    analyzer.setVersion(version);
}
 
Example #5
Source File: StopAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the provider and builds its {@link StopAnalyzer}, using
 * {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET} as the default passed to
 * {@code Analysis.parseStopWords}.
 */
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    // Resolve the stop words and build the analyzer in one step.
    stopAnalyzer = new StopAnalyzer(Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
    stopAnalyzer.setVersion(version);
}
 
Example #6
Source File: StopTokenFilterFactory.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the factory from the filter settings.
 *
 * <p>Reads {@code ignore_case} (default {@code false}), {@code remove_trailing}
 * (default {@code true}), the stop words (default
 * {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}) and {@code enable_position_increments}
 * (default {@code true}).
 *
 * @throws IllegalArgumentException if {@code enable_position_increments} is set while
 *         the version is on or after {@code Version.LUCENE_4_4}
 */
@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);

    // The setting is rejected outright from Lucene 4.4 on; it is only consulted for older versions.
    if (version.onOrAfter(Version.LUCENE_4_4)) {
        if (settings.get("enable_position_increments") != null) {
            throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                    + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
        }
    }
    this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}
 
Example #7
Source File: NeedsConfiguringAnalyzerFactory.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Returns the default stop-word set of the named analyzer class.
 *
 * <p>The set is obtained by reflectively calling a no-argument
 * {@code getDefaultStopSet} method with a {@code null} receiver. If that fails for
 * any reason, {@link StandardAnalyzer} and {@link StopAnalyzer} fall back to their
 * public stop-word constants.
 *
 * @param clazzName name of the analyzer class, resolved through {@code getAnalyzerClass}
 * @return the stop-word set for the analyzer class
 * @throws RuntimeException if no stop-word set can be determined; the reflective
 *         failure is attached as the cause
 */
protected Set<?> getStopWordsForClass(String clazzName) {
	Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
	try {
		return (Set<?>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
	} catch (Exception e) {
		// Deliberately broad: a missing method, an inaccessible method, a non-static method
		// (null receiver) or a bad cast all lead to the same fallbacks below.
		if (StandardAnalyzer.class.equals(analyzerClass)) {
			return StandardAnalyzer.STOP_WORDS_SET;
		}
		if (StopAnalyzer.class.equals(analyzerClass)) {
			return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
		}
		// Keep the original failure as the cause so the root problem stays diagnosable.
		throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange, e);
	}
}
 
Example #8
Source File: GraphUtil.java    From SciGraph with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether a property value should be ignored.
 *
 * <p>TODO: This and every spot that uses it is a bit of a hack.
 * This should ideally be handled by the index.
 *
 * @param value the property value to inspect; anything that is not a {@link String}
 *              is never ignored
 * @return {@code true} if {@code value} is a string that consists only of whitespace
 *         or whose lower-cased form is in {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET};
 *         {@code false} otherwise
 */
public static boolean ignoreProperty(Object value) {
  if (!(value instanceof String)) {
    return false;
  }
  String text = (String) value;
  // Lower-case with Locale.ROOT: the default-locale variant breaks under locales such as
  // Turkish, where "I" maps to a dotless i and words like "IT" would miss the stop set.
  return CharMatcher.WHITESPACE.matchesAllOf(text)
      || StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(text.toLowerCase(java.util.Locale.ROOT));
}
 
Example #9
Source File: LuceneUtils.java    From SciGraph with Apache License 2.0 5 votes vote down vote up
/**
 * Checks whether a word is an English stop word, ignoring case.
 *
 * @param word the word to test
 * @return {@code true} if some entry of {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}
 *         equals {@code word} ignoring case, {@code false} otherwise
 */
public static boolean isStopword(String word) {
  // Entries are cast to char[] exactly as before; each one is turned into a String for comparison.
  for (Object entry : StopAnalyzer.ENGLISH_STOP_WORDS_SET) {
    String candidate = String.valueOf((char[]) entry);
    if (candidate.equalsIgnoreCase(word)) {
      return true;
    }
  }
  return false;
}
 
Example #10
Source File: StopAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the {@link StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}
 
Example #11
Source File: StandardHtmlStripAnalyzer.java    From Elasticsearch with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the analyzer by passing {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET} to the
 * superclass constructor.
 *
 * @deprecated use {@link StandardHtmlStripAnalyzer#StandardHtmlStripAnalyzer(CharArraySet)} instead
 */
@Deprecated
public StandardHtmlStripAnalyzer() {
    super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example #12
Source File: DefaultAnalyzer.java    From modernmt with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the analyzer by passing the given configuration and
 * {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET} to the superclass constructor.
 *
 * @param config analyzer configuration, forwarded unchanged to the superclass
 */
protected DefaultAnalyzer(AnalyzerConfig config) {
    super(config, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example #13
Source File: PreBuiltAnalyzersTest.java    From stratio-cassandra with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that {@code PreBuiltAnalyzers.STOP} yields an instance whose runtime class
 * is exactly {@link StopAnalyzer}.
 */
@Test
public void testGetStop() {
    Assert.assertEquals(StopAnalyzer.class, PreBuiltAnalyzers.STOP.get().getClass());
}
 
Example #14
Source File: LuceneAnalyzerIntegrationTest.java    From tutorials with MIT License 4 votes vote down vote up
/**
 * Analyzes {@code SAMPLE_TEXT} with a default-constructed {@link StopAnalyzer} and
 * checks the resulting tokens, in order.
 */
@Test
public void whenUseStopAnalyzer_thenAnalyzed() throws IOException {
    StopAnalyzer stopAnalyzer = new StopAnalyzer();

    List<String> tokens = analyze(SAMPLE_TEXT, stopAnalyzer);

    assertThat(tokens, contains("baeldung", "com", "lucene", "analyzers", "test"));
}
 
Example #15
Source File: StopAnalyzerProvider.java    From crate with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the provider and builds its {@link StopAnalyzer}, using
 * {@code EnglishAnalyzer.ENGLISH_STOP_WORDS_SET} as the default passed to
 * {@code Analysis.parseStopWords}.
 *
 * @param indexSettings index-level settings, forwarded to the superclass
 * @param env           environment passed to the stop-word parser
 * @param name          analyzer name
 * @param settings      analyzer settings from which the stop words are parsed
 */
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final CharArraySet effectiveStopWords =
        Analysis.parseStopWords(env, settings, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
    // Finish configuring the analyzer before publishing it to the field.
    final StopAnalyzer configured = new StopAnalyzer(effectiveStopWords);
    configured.setVersion(version);
    this.stopAnalyzer = configured;
}
 
Example #16
Source File: StopAnalyzerProvider.java    From crate with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the {@link StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}