org.apache.lucene.analysis.core.StopAnalyzer Java Examples

The following examples show how to use org.apache.lucene.analysis.core.StopAnalyzer. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: StandardAnalyzerProvider.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Builds the {@code StandardAnalyzer} served by this provider.
 *
 * <p>The default stop word set depends on the version obtained from
 * {@code Version.indexCreated(indexSettings)}: from {@code V_1_0_0_Beta1} onwards the default is
 * empty, earlier versions default to {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}. The
 * {@code max_token_length} setting falls back to {@code StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH}.
 *
 * @param index         forwarded to the superclass constructor
 * @param indexSettings index-level settings; used to determine the creation version
 * @param env           environment handed to {@code Analysis.parseStopWords}
 * @param name          forwarded to the superclass constructor
 * @param settings      analyzer settings (stop words, {@code max_token_length})
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    // Newer indices start without stop words; older ones keep the English defaults.
    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);
    final int tokenLengthLimit = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(configuredStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}

Example #2

Source File: PatternAnalyzerProvider.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Builds the {@code PatternAnalyzer} served by this provider.
 *
 * <p>Reads {@code lowercase} (default {@code true}), the stop words, {@code pattern}
 * (default {@code \W+}) and {@code flags} from {@code settings}. The default stop word set is
 * empty when the index creation version is on or after {@code V_1_0_0_RC1}, otherwise
 * {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}.
 *
 * @throws IllegalArgumentException if the resolved pattern string is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet fallbackStopWords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    final boolean lowercase = settings.getAsBoolean("lowercase", true);
    final CharArraySet stopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    final String patternSource = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (patternSource == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }

    analyzer = new PatternAnalyzer(Regex.compile(patternSource, settings.get("flags")), lowercase, stopWords);
}

Example #3

Source File: TestTextField.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Exercises {@code TextField.analyzeMultiTerm} with zero, one and two produced terms.
 */
@Test
public void testAnalyzeMultiTerm() {
  // Zero terms: the only token is a stop word, so the StopFilter removes it.
  // This is supported; the result must be null rather than an exception.
  final BytesRef stopWordOnly =
      TextField.analyzeMultiTerm("field", "the", new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
  assertNull(stopWordOnly);

  // One term: the regular case, the WhitespaceTokenizer yields exactly one token which is returned.
  final BytesRef singleTerm = TextField.analyzeMultiTerm("field", "Sol", new WhitespaceAnalyzer());
  assertEquals("Sol", singleTerm.utf8ToString());

  // Two terms: not allowed for the multi-term part, so a BAD_REQUEST SolrException is expected.
  final SolrException rejected = expectThrows(
      SolrException.class,
      () -> TextField.analyzeMultiTerm("field", "term1 term2", new WhitespaceAnalyzer()));
  assertEquals("Unexpected error code", SolrException.ErrorCode.BAD_REQUEST.code, rejected.code());
}

Example #4

Source File: StandardHtmlStripAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Builds the {@code StandardHtmlStripAnalyzer} served by this provider.
 *
 * <p>The default stop word set is empty when the index creation version is on or after
 * {@code V_1_0_0_RC1}; otherwise it is {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}. Stop words
 * configured in {@code settings} are resolved through {@code Analysis.parseStopWords}.
 */
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());

    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    analyzer = new StandardHtmlStripAnalyzer(configuredStopWords);
    analyzer.setVersion(version);
}

Example #5

Source File: StopAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Builds the {@code StopAnalyzer} served by this provider, using the stop words resolved from
 * {@code settings} and defaulting to {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}.
 */
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.stopAnalyzer = new StopAnalyzer(
            Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET));
    this.stopAnalyzer.setVersion(version);
}

Example #6

Source File: StopTokenFilterFactory.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Configures the stop token filter factory from {@code settings}.
 *
 * <p>Reads {@code ignore_case} (default {@code false}), {@code remove_trailing} (default
 * {@code true}) and the stop words (default {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}).
 *
 * @throws IllegalArgumentException if {@code enable_position_increments} is set while
 *         {@code version} is on or after {@code LUCENE_4_4}
 */
@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);

    // The option is only rejected on 4.4+; the setting is looked up only after the version check passes.
    if (version.onOrAfter(Version.LUCENE_4_4)) {
        if (settings.get("enable_position_increments") != null) {
            throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                    + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
        }
    }

    this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}

Example #7

Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0

5 votes

/**
 * Looks up the default stop word set of the analyzer class named {@code clazzName}.
 * <p>
 * The class is resolved through {@code getAnalyzerClass} and its static
 * {@code getDefaultStopSet()} method is invoked reflectively. If that fails for any reason,
 * the constants of {@code StandardAnalyzer} and {@code StopAnalyzer} are used as a fallback
 * for those two classes.
 *
 * @param clazzName name of the analyzer class, as accepted by {@code getAnalyzerClass}
 * @return the stop word set for the analyzer class
 * @throws RuntimeException if the reflective lookup fails and no fallback applies; the
 *         original failure is attached as the cause
 */
protected Set<?> getStopWordsForClass(String clazzName) {
	Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
	try {
		// A null receiver means the method is expected to be static.
		return (Set<?>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
	} catch (Exception e) {
		// Deliberately broad: any reflective failure falls through to the hard-coded fallbacks.
		if (StandardAnalyzer.class.equals(analyzerClass)) {
			return StandardAnalyzer.STOP_WORDS_SET;
		}
		if (StopAnalyzer.class.equals(analyzerClass)) {
			return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
		}
		// Chain the reflective failure so the root cause is not lost.
		throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange, e);
	}
}

Example #8

Source File: GraphUtil.java From SciGraph with Apache License 2.0

5 votes

/**
 * Tells whether a property value should be ignored: it is a {@code String} that is either
 * made up entirely of whitespace or, once lower-cased, an English stop word.
 *
 * <p>TODO: This and every spot that uses it is a bit of a hack.
 * This should ideally be handled by the index.
 *
 * @param value the property value to inspect; non-{@code String} values are never ignored
 * @return {@code true} if the value should be ignored, {@code false} otherwise
 */
public static boolean ignoreProperty(Object value) {
  if (!(value instanceof String)) {
    return false;
  }
  String text = (String) value;
  // Lower-case with Locale.ROOT: the default-locale variant breaks the stop word lookup under
  // locales with special casing rules (e.g. Turkish maps "I" to a dotless "ı").
  return CharMatcher.WHITESPACE.matchesAllOf(text)
      || StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(text.toLowerCase(java.util.Locale.ROOT));
}

Example #9

Source File: LuceneUtils.java From SciGraph with Apache License 2.0

5 votes

/**
 * Checks whether {@code word} matches, ignoring case, any entry of
 * {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}.
 *
 * @param word the word to look up
 * @return {@code true} if some stop word equals {@code word} ignoring case
 */
public static boolean isStopword(String word) {
  // Entries are cast to char[] and converted to a String for the comparison.
  for (Object entry : StopAnalyzer.ENGLISH_STOP_WORDS_SET) {
    String candidate = new String((char[]) entry);
    if (candidate.equalsIgnoreCase(word)) {
      return true;
    }
  }
  return false;
}

Example #10

Source File: StopAnalyzerProvider.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Returns the {@code StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}

Example #11

Source File: StandardHtmlStripAnalyzer.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Creates the analyzer by passing {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} to the
 * superclass constructor.
 *
 * @deprecated use {@link StandardHtmlStripAnalyzer#StandardHtmlStripAnalyzer(CharArraySet)} instead
 */
@Deprecated
public StandardHtmlStripAnalyzer() {
    super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}

Example #12

Source File: DefaultAnalyzer.java From modernmt with Apache License 2.0

4 votes

/**
 * Creates the analyzer by forwarding {@code config} together with
 * {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET} to the superclass constructor.
 *
 * @param config the analyzer configuration passed through to the superclass
 */
protected DefaultAnalyzer(AnalyzerConfig config) {
    super(config, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}

Example #13

Source File: PreBuiltAnalyzersTest.java From stratio-cassandra with Apache License 2.0

4 votes

/**
 * Verifies that {@code PreBuiltAnalyzers.STOP} yields an analyzer whose runtime class is
 * exactly {@code StopAnalyzer}.
 */
@Test
public void testGetStop() {
    Class<?> actualType = PreBuiltAnalyzers.STOP.get().getClass();
    Assert.assertEquals(StopAnalyzer.class, actualType);
}

Example #14

Source File: LuceneAnalyzerIntegrationTest.java From tutorials with MIT License

4 votes

/**
 * Analyzes {@code SAMPLE_TEXT} with a default {@code StopAnalyzer} and checks the resulting
 * tokens, in order.
 */
@Test
public void whenUseStopAnalyzer_thenAnalyzed() throws IOException {
    final StopAnalyzer stopAnalyzer = new StopAnalyzer();

    final List<String> tokens = analyze(SAMPLE_TEXT, stopAnalyzer);

    assertThat(tokens, contains("baeldung", "com", "lucene", "analyzers", "test"));
}

Example #15

Source File: StopAnalyzerProvider.java From crate with Apache License 2.0

4 votes

/**
 * Builds the {@code StopAnalyzer} served by this provider, using the stop words resolved from
 * {@code settings} and defaulting to {@code EnglishAnalyzer.ENGLISH_STOP_WORDS_SET}.
 *
 * @param indexSettings forwarded to the superclass constructor
 * @param env           environment handed to {@code Analysis.parseStopWords}
 * @param name          forwarded to the superclass constructor
 * @param settings      analyzer settings from which the stop words are parsed
 */
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.stopAnalyzer = new StopAnalyzer(
            Analysis.parseStopWords(env, settings, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
    this.stopAnalyzer.setVersion(version);
}

Example #16

Source File: StopAnalyzerProvider.java From crate with Apache License 2.0

4 votes

/**
 * Returns the {@code StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}