Java Code Examples for org.apache.lucene.analysis.core.StopAnalyzer

The following examples show how to use org.apache.lucene.analysis.core.StopAnalyzer. They are extracted from open source projects. Vote up the examples you find useful and vote down those you don't; the links above each example lead to the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source Project: Elasticsearch   Source File: StandardAnalyzerProvider.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link StandardAnalyzer} exposed by this provider.
 *
 * <p>The default stop word set depends on the version the index was created with:
 * indices created on or after {@code 1.0.0.Beta1} default to an empty set, older ones
 * default to {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}. The default is handed to
 * {@code Analysis.parseStopWords} together with the analyzer settings.
 *
 * @param index         the index this analyzer belongs to
 * @param indexSettings index-level settings; used to determine the creation version
 * @param env           environment passed through to stop word parsing
 * @param name          analyzer name
 * @param settings      analyzer settings; {@code max_token_length} is read from here
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    // Pre-1.0.0.Beta1 indices keep the English defaults; newer ones start with none.
    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);
    final int tokenLengthLimit = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(configuredStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}
 
Example 2
Source Project: Elasticsearch   Source File: PatternAnalyzerProvider.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the {@link PatternAnalyzer} exposed by this provider.
 *
 * <p>Reads {@code lowercase} (default {@code true}), {@code pattern} (default {@code \W+})
 * and {@code flags} from the analyzer settings. The default stop word set is empty for
 * indices created on or after {@code 1.0.0.RC1} and
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} for older ones.
 *
 * @throws IllegalArgumentException if the resolved {@code pattern} setting is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet fallbackStopWords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    final boolean lowercase = settings.getAsBoolean("lowercase", true);
    final CharArraySet stopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    final String patternSource = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (patternSource == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }

    analyzer = new PatternAnalyzer(Regex.compile(patternSource, settings.get("flags")), lowercase, stopWords);
}
 
Example 3
Source Project: lucene-solr   Source File: TestTextField.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Exercises {@code TextField.analyzeMultiTerm} with analyzers that yield zero, one and
 * two terms for the multi-term part.
 */
@Test
public void testAnalyzeMultiTerm() {
  // Zero terms: the StopFilter removes the only word ("the").
  // This is supported, so the result must be null rather than an exception.
  BytesRef noTermResult = TextField.analyzeMultiTerm("field", "the", new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
  assertNull(noTermResult);

  // One term: the regular case, the WhitespaceTokenizer emits a single token
  // which must be returned unchanged.
  BytesRef singleTermResult = TextField.analyzeMultiTerm("field", "Sol", new WhitespaceAnalyzer());
  assertEquals("Sol", singleTermResult.utf8ToString());

  // Two terms: not allowed for a multi-term part, so a BAD_REQUEST SolrException is expected.
  SolrException thrown = expectThrows(
      SolrException.class,
      () -> TextField.analyzeMultiTerm("field", "term1 term2", new WhitespaceAnalyzer()));
  assertEquals("Unexpected error code", SolrException.ErrorCode.BAD_REQUEST.code, thrown.code());
}
 
Example 4
/**
 * Builds the {@link StandardHtmlStripAnalyzer} exposed by this provider.
 *
 * <p>The default stop word set is empty for indices created on or after
 * {@code 1.0.0.RC1} and {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} for older ones;
 * it is passed to {@code Analysis.parseStopWords} together with the analyzer settings.
 */
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,  @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());

    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    analyzer = new StandardHtmlStripAnalyzer(Analysis.parseStopWords(env, settings, fallbackStopWords));
    analyzer.setVersion(version);
}
 
Example 5
Source Project: Elasticsearch   Source File: StopAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the {@link StopAnalyzer} exposed by this provider, using the stop words
 * parsed from the analyzer settings with {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}
 * as the default.
 */
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.stopAnalyzer = new StopAnalyzer(
            Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
    this.stopAnalyzer.setVersion(version);
}
 
Example 6
Source Project: Elasticsearch   Source File: StopTokenFilterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the stop filter configuration from the filter settings.
 *
 * <p>Settings read: {@code ignore_case} (default {@code false}), {@code remove_trailing}
 * (default {@code true}), the stop words (default
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}) and {@code enable_position_increments}
 * (default {@code true}).
 *
 * @throws IllegalArgumentException if {@code enable_position_increments} is set while the
 *         version is Lucene 4.4 or later
 */
@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
    // ignoreCase must already be assigned here: it controls how the stop words are parsed.
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);

    // From Lucene 4.4 on the option is rejected outright when present.
    if (version.onOrAfter(Version.LUCENE_4_4)) {
        if (settings.get("enable_position_increments") != null) {
            throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                    + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
        }
    }

    this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}
 
Example 7
/**
 * Looks up the default stop word set of the analyzer class with the given name.
 *
 * <p>The class is resolved via {@code getAnalyzerClass} and its static, no-argument
 * {@code getDefaultStopSet} method is invoked reflectively. If that fails for any reason,
 * the known constants of {@link StandardAnalyzer} and {@link StopAnalyzer} are used as a
 * fallback.
 *
 * @param clazzName name of the analyzer class to inspect
 * @return the stop word set for that analyzer class
 * @throws RuntimeException if no stop word set could be determined; the reflective failure
 *         is attached as the cause
 */
protected Set<?> getStopWordsForClass(String clazzName) {
	Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
	try {
		// invoke(null): the method is expected to be static.
		return (Set<?>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
	} catch (Exception e) {
		// These analyzers expose their stop words as constants instead of getDefaultStopSet().
		if (StandardAnalyzer.class.equals(analyzerClass)) {
			return StandardAnalyzer.STOP_WORDS_SET;
		}
		if (StopAnalyzer.class.equals(analyzerClass)) {
			return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
		}
		// Keep the original failure as the cause so the reason for the lookup failure is not lost.
		throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange, e);
	}
}
 
Example 8
Source Project: SciGraph   Source File: GraphUtil.java    License: Apache License 2.0 5 votes vote down vote up
/***
 * Decides whether a property value should be ignored.
 *
 * <p>A value is ignored when it is a {@link String} that either matches
 * {@code CharMatcher.WHITESPACE} for all of its characters or, once lower-cased, is
 * contained in {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}. Lower-casing uses
 * {@code Locale.ROOT} so the result does not depend on the JVM default locale
 * (e.g. "IT" would not lower-case to "it" under a Turkish locale).
 *
 * <p>TODO: This and every spot that uses it is a bit of a hack.
 * This should ideally be handled by the index.
 *
 * @param value the property value to inspect; non-String values are never ignored
 * @return {@code true} if the value should be ignored, {@code false} otherwise
 */
public static boolean ignoreProperty(Object value) {
  if (!(value instanceof String)) {
    return false;
  }
  String text = (String) value;
  return CharMatcher.WHITESPACE.matchesAllOf(text)
      || StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(text.toLowerCase(java.util.Locale.ROOT));
}
 
Example 9
Source Project: SciGraph   Source File: LuceneUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks whether a word equals, ignoring case, any entry of
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
 *
 * <p>Each entry returned by the set's iterator is cast to {@code char[]} before comparison.
 *
 * @param word the word to test
 * @return {@code true} if an entry matches the word ignoring case, {@code false} otherwise
 */
public static boolean isStopword(String word) {
  Iterator<?> entries = StopAnalyzer.ENGLISH_STOP_WORDS_SET.iterator();
  while (entries.hasNext()) {
    char[] entryChars = (char[]) entries.next();
    String candidate = new String(entryChars);
    if (candidate.equalsIgnoreCase(word)) {
      return true;
    }
  }
  return false;
}
 
Example 10
Source Project: Elasticsearch   Source File: StopAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the analyzer held in this provider's {@code stopAnalyzer} field.
 *
 * @return the {@link StopAnalyzer} instance
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}
 
Example 11
Source Project: Elasticsearch   Source File: StandardHtmlStripAnalyzer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates the analyzer by passing {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} to the
 * superclass constructor.
 *
 * @deprecated use {@link StandardHtmlStripAnalyzer#StandardHtmlStripAnalyzer(CharArraySet)} instead
 */
@Deprecated
public StandardHtmlStripAnalyzer() {
    super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example 12
Source Project: modernmt   Source File: DefaultAnalyzer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates the analyzer by passing the given configuration and
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} to the superclass constructor.
 *
 * @param config analyzer configuration forwarded unchanged to the superclass
 */
protected DefaultAnalyzer(AnalyzerConfig config) {
    super(config, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
 
Example 13
Source Project: stratio-cassandra   Source File: PreBuiltAnalyzersTest.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that {@code PreBuiltAnalyzers.STOP} produces an analyzer whose runtime class
 * is exactly {@link StopAnalyzer}.
 */
@Test
public void testGetStop() {
    Class<?> actualType = PreBuiltAnalyzers.STOP.get().getClass();
    Assert.assertEquals(StopAnalyzer.class, actualType);
}
 
Example 14
Source Project: tutorials   Source File: LuceneAnalyzerIntegrationTest.java    License: MIT License 4 votes vote down vote up
/**
 * Analyzes {@code SAMPLE_TEXT} with a default-constructed {@link StopAnalyzer} and checks
 * the exact sequence of tokens produced.
 */
@Test
public void whenUseStopAnalyzer_thenAnalyzed() throws IOException {
    StopAnalyzer stopAnalyzer = new StopAnalyzer();
    List<String> tokens = analyze(SAMPLE_TEXT, stopAnalyzer);
    assertThat(tokens, contains("baeldung", "com", "lucene", "analyzers", "test"));
}
 
Example 15
Source Project: crate   Source File: StopAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds the {@link StopAnalyzer} exposed by this provider, using the stop words
 * parsed from the analyzer settings with {@code EnglishAnalyzer.ENGLISH_STOP_WORDS_SET}
 * as the default.
 */
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.stopAnalyzer = new StopAnalyzer(
            Analysis.parseStopWords(env, settings, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
    this.stopAnalyzer.setVersion(version);
}
 
Example 16
Source Project: crate   Source File: StopAnalyzerProvider.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the analyzer held in this provider's {@code stopAnalyzer} field.
 *
 * @return the {@link StopAnalyzer} instance
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}