org.apache.lucene.analysis.core.StopAnalyzer Java Examples

The following examples show how to use org.apache.lucene.analysis.core.StopAnalyzer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source Project: Elasticsearch   Author: baidu   File: StandardAnalyzerProvider.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link StandardAnalyzer} exposed by this provider.
 *
 * <p>The default stop word set depends on the version the index was created with:
 * indices created on or after {@code V_1_0_0_Beta1} default to an empty set, older
 * ones to {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}. The {@code max_token_length}
 * setting is read with {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH} as default.
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    // Pick the fallback stop words according to the index creation version.
    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet effectiveStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);
    final int tokenLengthLimit = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(effectiveStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}
 
Example #2
Source Project: Elasticsearch   Author: baidu   File: PatternAnalyzerProvider.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link PatternAnalyzer} exposed by this provider from the analyzer settings.
 *
 * <p>Reads the {@code lowercase} (default {@code true}), {@code pattern} (default
 * {@code \W+}) and {@code flags} settings. The default stop word set is empty for
 * indices created on or after {@code V_1_0_0_RC1} and
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} otherwise.
 *
 * @throws IllegalArgumentException if the resolved pattern string is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    // Pick the fallback stop words according to the index creation version.
    final CharArraySet fallbackStopWords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final boolean toLowercase = settings.getAsBoolean("lowercase", true);
    final CharArraySet effectiveStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    final String patternText = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (patternText == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }
    final Pattern compiled = Regex.compile(patternText, settings.get("flags"));

    analyzer = new PatternAnalyzer(compiled, toLowercase, effectiveStopWords);
}
 
Example #3
Source Project: lucene-solr   Author: apache   File: TestTextField.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Exercises {@code TextField.analyzeMultiTerm} with analyzers that produce zero, one
 * and two terms for the multi-term part.
 */
@Test
public void testAnalyzeMultiTerm() {
  // Zero terms: "the" is analyzed with a StopAnalyzer built on the English stop word set.
  // This is supported; the call must return null instead of throwing.
  BytesRef noTerm = TextField.analyzeMultiTerm("field", "the", new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
  assertNull(noTerm);

  // One term: the regular case, using a WhitespaceAnalyzer. The term is returned as-is.
  BytesRef singleTerm = TextField.analyzeMultiTerm("field", "Sol", new WhitespaceAnalyzer());
  assertEquals("Sol", singleTerm.utf8ToString());

  // Two terms: not allowed, so a SolrException with the BAD_REQUEST code is expected.
  SolrException thrown = expectThrows(
      SolrException.class,
      () -> TextField.analyzeMultiTerm("field", "term1 term2", new WhitespaceAnalyzer()));
  assertEquals("Unexpected error code", SolrException.ErrorCode.BAD_REQUEST.code, thrown.code());
}
 
Example #4
Source Project: Elasticsearch   Author: baidu   File: StandardHtmlStripAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the {@link StandardHtmlStripAnalyzer} exposed by this provider.
 *
 * <p>The default stop word set is empty for indices created on or after
 * {@code V_1_0_0_RC1} and {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} otherwise.
 */
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,  @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());

    // Pick the fallback stop words according to the index creation version.
    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet effectiveStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);
    analyzer = new StandardHtmlStripAnalyzer(effectiveStopWords);
    analyzer.setVersion(version);
}
 
Example #5
Source Project: Elasticsearch   Author: baidu   File: StopAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the {@link StopAnalyzer} exposed by this provider.
 *
 * <p>Stop words are parsed from the analyzer settings, with
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} supplied as the default set.
 */
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final CharArraySet configuredStopWords =
            Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    stopAnalyzer = new StopAnalyzer(configuredStopWords);
    stopAnalyzer.setVersion(version);
}
 
Example #6
Source Project: Elasticsearch   Author: baidu   File: StopTokenFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Configures the stop token filter factory from the filter settings.
 *
 * <p>Reads {@code ignore_case} (default {@code false}), {@code remove_trailing}
 * (default {@code true}) and the stop words, with
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} supplied as the default set.
 *
 * @throws IllegalArgumentException if {@code enable_position_increments} is set while
 *         the Lucene version is 4.4 or later
 */
@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    ignoreCase = settings.getAsBoolean("ignore_case", false);
    removeTrailing = settings.getAsBoolean("remove_trailing", true);
    stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);

    // The setting is only rejected from Lucene 4.4 onwards; older versions may still set it.
    if (version.onOrAfter(Version.LUCENE_4_4)) {
        if (settings.get("enable_position_increments") != null) {
            throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                    + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
        }
    }

    enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}
 
Example #7
Source Project: database   Author: blazegraph   File: NeedsConfiguringAnalyzerFactory.java    License: GNU General Public License v2.0 5 votes vote down vote up
/**
 * Looks up the default stop word set of the analyzer class with the given name.
 *
 * <p>The class is resolved through {@code getAnalyzerClass} and its static
 * {@code getDefaultStopSet()} method is invoked reflectively. If that fails for any
 * reason, {@link StandardAnalyzer} and {@link StopAnalyzer} fall back to their
 * public stop word constants.
 *
 * @param clazzName name of the analyzer class, as accepted by {@code getAnalyzerClass}
 * @return the stop word set found for the analyzer class
 * @throws RuntimeException if the reflective lookup fails and the class has no known
 *         fallback; the original failure is attached as the cause
 */
protected Set<?> getStopWordsForClass(String clazzName) {
	Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
	try {
		return (Set<?>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
	} catch (Exception e) {
		// These two analyzers are handled via their public constants when the reflective call fails.
		if (StandardAnalyzer.class.equals(analyzerClass)) {
			return StandardAnalyzer.STOP_WORDS_SET;
		}
		if (StopAnalyzer.class.equals(analyzerClass)) {
			return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
		}
		// Keep the underlying failure as the cause so the reason for the lookup failure is not lost.
		throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange, e);
	}
}
 
Example #8
Source Project: SciGraph   Author: SciGraph   File: GraphUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether a property value should be ignored.
 *
 * <p>TODO: This and every spot that uses it is a bit of a hack.
 * This should ideally be handled by the index.
 *
 * <p>A value is ignored when it is a {@link String} that either consists only of
 * whitespace (as matched by {@code CharMatcher.WHITESPACE}) or whose lower-cased form
 * is contained in {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
 *
 * @param value the property value to check; non-string values are never ignored
 * @return {@code true} if the value should be ignored, {@code false} otherwise
 */
public static boolean ignoreProperty(Object value) {
  if (!(value instanceof String)) {
    return false;
  }
  String text = (String) value;
  // Lower-case with Locale.ROOT so the result does not depend on the JVM default locale
  // (e.g. under a Turkish locale "IT" would lower-case to a dotless-i form and miss "it").
  return CharMatcher.WHITESPACE.matchesAllOf(text)
      || StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(text.toLowerCase(java.util.Locale.ROOT));
}
 
Example #9
Source Project: SciGraph   Author: SciGraph   File: LuceneUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks whether the given word matches, ignoring case, any entry of
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
 *
 * @param word the word to look up
 * @return {@code true} if some stop word equals {@code word} ignoring case
 */
public static boolean isStopword(String word) {
  for (Object entry : StopAnalyzer.ENGLISH_STOP_WORDS_SET) {
    // Each entry is cast to char[] and converted to a String for the comparison.
    String candidate = String.valueOf((char[]) entry);
    if (candidate.equalsIgnoreCase(word)) {
      return true;
    }
  }
  return false;
}
 
Example #10
Source Project: Elasticsearch   Author: baidu   File: StopAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the {@link StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}
 
Example #11
Source Project: Elasticsearch   Author: baidu   File: StandardHtmlStripAnalyzer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * No-argument constructor that passes {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
 * to the superclass constructor.
 *
 * @deprecated use {@link StandardHtmlStripAnalyzer#StandardHtmlStripAnalyzer(CharArraySet)} instead
 */
@Deprecated
public StandardHtmlStripAnalyzer() {
    super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example #12
Source Project: modernmt   Author: modernmt   File: DefaultAnalyzer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates the analyzer by forwarding the given configuration together with
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} to the superclass constructor.
 *
 * @param config the analyzer configuration passed through to the superclass
 */
protected DefaultAnalyzer(AnalyzerConfig config) {
    super(config, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example #13
Source Project: stratio-cassandra   Author: Stratio   File: PreBuiltAnalyzersTest.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that {@code PreBuiltAnalyzers.STOP} yields an instance whose class is
 * exactly {@link StopAnalyzer}.
 */
@Test
public void testGetStop() {
    Assert.assertEquals(StopAnalyzer.class, PreBuiltAnalyzers.STOP.get().getClass());
}
 
Example #14
Source Project: tutorials   Author: eugenp   File: LuceneAnalyzerIntegrationTest.java    License: MIT License 4 votes vote down vote up
/**
 * Analyzes {@code SAMPLE_TEXT} with a default-constructed {@link StopAnalyzer} and
 * checks the resulting tokens, in order.
 */
@Test
public void whenUseStopAnalyzer_thenAnalyzed() throws IOException {
    // Run the sample text through the analyzer under test.
    final List<String> tokens = analyze(SAMPLE_TEXT, new StopAnalyzer());

    // The produced tokens must be exactly these, in this order.
    assertThat(tokens, contains("baeldung", "com", "lucene", "analyzers", "test"));
}
 
Example #15
Source Project: crate   Author: crate   File: StopAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds the {@link StopAnalyzer} exposed by this provider.
 *
 * <p>Stop words are parsed from the analyzer settings, with
 * {@code EnglishAnalyzer.ENGLISH_STOP_WORDS_SET} supplied as the default set.
 */
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    final CharArraySet configuredStopWords =
            Analysis.parseStopWords(env, settings, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
    stopAnalyzer = new StopAnalyzer(configuredStopWords);
    stopAnalyzer.setVersion(version);
}
 
Example #16
Source Project: crate   Author: crate   File: StopAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the {@link StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}