org.apache.lucene.analysis.util.CharArraySet Java Examples

The following examples show how to use org.apache.lucene.analysis.util.CharArraySet. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SnowballAnalyzerBuilder.java    From stratio-cassandra with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a new {@link SnowballAnalyzerBuilder} for the specified language and stopwords.
 *
 * @param language  The language. The supported languages are English, French, Spanish, Portuguese, Italian,
 *                  Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian,
 *                  Turkish, Armenian, Basque and Catalan.
 * @param stopwords The comma separated stopwords {@code String}.
 */
@JsonCreator
public SnowballAnalyzerBuilder(@JsonProperty("language") final String language,
                               @JsonProperty("stopwords") String stopwords) {

    // Check language
    if (language == null || language.trim().isEmpty()) {
        throw new IllegalArgumentException("Language must be specified");
    }

    // Setup stopwords
    CharArraySet stops = stopwords == null ? getDefaultStopwords(language) : getStopwords(stopwords);

    // Setup analyzer
    this.analyzer = buildAnalyzer(language, stops);

    // Force analysis validation
    AnalysisUtils.analyzeAsText("test", analyzer);
}
 
Example #2
Source File: Analysis.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the stem exclusion set from the {@code stem_exclusion} setting.
 * <p>
 * The setting is first read as a single string: the literal {@code _none_} yields
 * {@link CharArraySet#EMPTY_SET}, any other value is split as a comma delimited list. If no single
 * string value exists, the setting is read as an array. If that is absent as well, the supplied
 * default is returned. Sets built here are created with {@code ignoreCase == false}.
 *
 * @param settings             the settings to read {@code stem_exclusion} from
 * @param defaultStemExclusion the value returned when the setting is not present in either form
 * @return the resolved stem exclusion set
 */
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
    final String raw = settings.get("stem_exclusion");
    if (raw != null) {
        // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
        return "_none_".equals(raw)
                ? CharArraySet.EMPTY_SET
                : new CharArraySet(Strings.commaDelimitedListToSet(raw), false);
    }

    final String[] entries = settings.getAsArray("stem_exclusion", null);
    if (entries == null) {
        return defaultStemExclusion;
    }
    // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
    return new CharArraySet(Arrays.asList(entries), false);
}
 
Example #3
Source File: QueryParserImpl.java    From AdSearch_Endpoints with Apache License 2.0 6 votes vote down vote up
@Override
  /**
   * Tokenizes the query string with a {@link StandardTokenizer} and drops the tokens contained in the
   * default English stop set.
   *
   * @param queryStr the raw query text
   * @return the remaining terms in the order they were produced; if an {@link IOException} occurs while
   *         reading tokens, the stack trace is printed and the terms collected so far are returned
   */
  public List<String> parseQuery(String queryStr) {
    List<String> tokens = new ArrayList<String>();

    Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();

    // try-with-resources guarantees the stream (and the tokenizer it wraps) is closed even when
    // reset()/incrementToken() throws; previously the stream leaked on the failure path.
    try (TokenStream tokenStream = new StopFilter(tokenizer, stopWords)) {
      // Filter and tokenizer share their attributes, so the term can be read through the filter.
      CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        tokens.add(termAttribute.toString());
      }
      tokenStream.end();
    } catch (IOException e) {
      // Best effort, as before: report the failure and return what was collected so far.
      e.printStackTrace();
    }
    return tokens;
  }
 
Example #4
Source File: StandardAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the provider and configures the wrapped {@link StandardAnalyzer}.
 * <p>
 * The default stop word set depends on the version the index was created with: indices created on or
 * after {@code V_1_0_0_Beta1} default to an empty set, older ones to the English stop words. The
 * {@code max_token_length} setting falls back to {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    final CharArraySet fallbackStopwords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet resolvedStopWords = Analysis.parseStopWords(env, settings, fallbackStopwords);
    final int tokenLengthLimit =
            settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(resolvedStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}
 
Example #5
Source File: PatternAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the provider and builds the wrapped {@link PatternAnalyzer}.
 * <p>
 * Reads the {@code lowercase} (default {@code true}), {@code pattern} (default {@code \W+}) and
 * {@code flags} settings. The default stop word set is empty for indices created on or after
 * {@code V_1_0_0_RC1} and the English stop words otherwise.
 *
 * @throws IllegalArgumentException if the resolved pattern is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet fallbackStopwords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final boolean lowercase = settings.getAsBoolean("lowercase", true);
    final CharArraySet stopWords = Analysis.parseStopWords(env, settings, fallbackStopwords);

    final String regex = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (regex == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }

    final Pattern compiled = Regex.compile(regex, settings.get("flags"));
    analyzer = new PatternAnalyzer(compiled, lowercase, stopWords);
}
 
Example #6
Source File: KuromojiUDF.java    From incubator-hivemall with Apache License 2.0 5 votes vote down vote up
/**
 * Analyzes the first argument as a line of text.
 * <p>
 * The {@link JapaneseAnalyzer} is built on the first call from the configured stop words, user
 * dictionary, mode and stop tags, and is reused afterwards.
 *
 * @param arguments the UDF arguments; only {@code arguments[0]} is read
 * @return {@code null} when the first argument is {@code null}, otherwise the result of
 *         {@code parseLine} (the three-argument form when {@code _returnPos} is set)
 */
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
    if (_analyzer == null) {
        final CharArraySet stops = stopWords(_stopWordsArray);

        // The user dictionary may be given as a String[] or as a single String; anything else means none
        final UserDictionary dictionary;
        if (_userDictObj instanceof String[]) {
            dictionary = userDictionary((String[]) _userDictObj);
        } else if (_userDictObj instanceof String) {
            dictionary = userDictionary((String) _userDictObj);
        } else {
            dictionary = null;
        }

        this._analyzer = new JapaneseAnalyzer(dictionary, _mode, stops, _stopTags);
    }

    final Object input = arguments[0].get();
    if (input == null) {
        return null;
    }

    final String text = input.toString();
    if (!_returnPos) {
        return parseLine(_analyzer, text);
    }
    return parseLine(_analyzer, text, _result);
}
 
Example #7
Source File: Analysis.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves a word set from the setting {@code name}.
 * <p>
 * An inline string value wins: {@code _none_} yields {@link CharArraySet#EMPTY_SET}, anything else is
 * split as a comma delimited list and passed through {@code resolveNamedWords}. Without an inline
 * value the list obtained from {@code getWordList} is used, and if that is {@code null} too the
 * supplied default is returned.
 *
 * @param env          the environment handed to {@code getWordList}
 * @param settings     the settings to read from
 * @param name         the setting name
 * @param defaultWords returned when neither an inline value nor a word list is available
 * @param namedWords   named word sets handed to {@code resolveNamedWords}
 * @param ignoreCase   case-sensitivity flag handed to {@code resolveNamedWords}
 * @return the resolved word set
 */
public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords, Map<String, Set<?>> namedWords, boolean ignoreCase) {
    final String inline = settings.get(name);
    if (inline == null) {
        final List<String> loaded = getWordList(env, settings, name);
        return loaded == null ? defaultWords : resolveNamedWords(loaded, namedWords, ignoreCase);
    }
    if ("_none_".equals(inline)) {
        return CharArraySet.EMPTY_SET;
    }
    return resolveNamedWords(Strings.commaDelimitedListToSet(inline), namedWords, ignoreCase);
}
 
Example #8
Source File: SnowballAnalyzerBuilder.java    From stratio-cassandra with Apache License 2.0 5 votes vote down vote up
/**
 * Splits a comma separated {@code String} into trimmed entries and wraps them in a
 * {@link CharArraySet} created with {@code ignoreCase == true}.
 *
 * @param stopwords A {@code String} comma separated stopwords list.
 * @return The stopwords {@link CharArraySet} built from the trimmed entries.
 */
private static CharArraySet getStopwords(String stopwords) {
    final String[] rawEntries = stopwords.split(",");
    final List<String> trimmedEntries = new ArrayList<>(rawEntries.length);
    for (int i = 0; i < rawEntries.length; i++) {
        trimmedEntries.add(rawEntries[i].trim());
    }
    return new CharArraySet(trimmedEntries, true);
}
 
Example #9
Source File: NeedsConfiguringAnalyzerFactory.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Picks the stop word set to use.
 * <p>
 * This is called only when we have already identified that
 * the class does support stopwords.
 *
 * @return {@link CharArraySet#EMPTY_SET} when stop words are switched off; otherwise the result of
 *         {@code getStopWordsForClass} for {@code className} (default stop words requested) or for
 *         {@code stopwords}
 */
public Set<?> getStopWords() {

	if (!doNotUseStopWords()) {
		if (useDefaultStopWords()) {
			return getStopWordsForClass(className);
		} else {
			return getStopWordsForClass(stopwords);
		}
	}

	return CharArraySet.EMPTY_SET;
}
 
Example #10
Source File: NeedsConfiguringAnalyzerFactory.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Hands the superclass the language range of {@code lro} together with two
 * {@code PatternAnalyzerImpl} instances built from the same pattern: one using the given
 * stop words and one using {@link CharArraySet#EMPTY_SET}.
 *
 * @param lro       supplies the {@code languageRange} passed to the superclass
 * @param pattern   the pattern given to both analyzer implementations
 * @param stopWords the stop words for the first analyzer implementation
 */
public PatternAnalyzer(ConfigOptionsToAnalyzer lro, Pattern pattern, CharArraySet stopWords) throws Exception {
	super(
		lro.languageRange,
		new PatternAnalyzerImpl(pattern, stopWords),
		new PatternAnalyzerImpl(pattern, CharArraySet.EMPTY_SET)
	);
}
 
Example #11
Source File: NeedsConfiguringAnalyzerFactory.java    From database with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Returns a new array of the same length as {@code params} in which every element that is a
 * {@link Set} has been replaced by {@link CharArraySet#EMPTY_SET}; all other elements are
 * carried over unchanged. The input array is not modified.
 */
private static Object[] useEmptyStopWordSet(Object[] params) {
	final Object[] replaced = new Object[params.length];
	int idx = 0;
	for (final Object param : params) {
		replaced[idx++] = (param instanceof Set) ? CharArraySet.EMPTY_SET : param;
	}
	return replaced;
}
 
Example #12
Source File: LanguageAnalyzer.java    From modernmt with Apache License 2.0 5 votes vote down vote up
protected LanguageAnalyzer(AnalyzerConfig config, CharArraySet defaultStopWordsSet) {
    super(config.stopWordsSet == null ? defaultStopWordsSet : config.stopWordsSet);

    if (config.enableStemming && config.stemmingExclusionSet != null)
        this.stemmingExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(config.stemmingExclusionSet));
    else
        this.stemmingExclusionSet = null;

    this.config = config;
}
 
Example #13
Source File: KuromojiUDF.java    From incubator-hivemall with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an optional array of stop words into a {@link CharArraySet}.
 *
 * @param array the stop words; {@code null} selects the default stop set of {@link JapaneseAnalyzer},
 *              an empty array selects {@link CharArraySet#EMPTY_SET}
 * @return the stop word set; for a non-empty array it is created with {@code ignoreCase == true}
 */
@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array)
        throws UDFArgumentException {
    if (array != null) {
        return array.length == 0
                ? CharArraySet.EMPTY_SET
                : new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
    }
    return JapaneseAnalyzer.getDefaultStopSet();
}
 
Example #14
Source File: CommonMMSeg4jSegmenter.java    From linden with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the stop words from a file, one entry per line, into {@code stopWords}.
 * Does nothing when no path is given.
 *
 * @param stopWordsPath path of the stop word file, or {@code null} to leave {@code stopWords} untouched
 * @throws RuntimeException if the file cannot be read; the underlying {@link IOException} is
 *                          attached as the cause
 */
private void initStopWords(String stopWordsPath) {
  if (stopWordsPath == null) {
    return;
  }
  try {
    List<String> lines = FileUtils.readLines(new File(stopWordsPath));
    // Going through a HashSet collapses duplicate lines before the copy
    Set<String> set = new HashSet<>(lines);
    stopWords = CharArraySet.copy(set);
  } catch (IOException e) {
    // Chain the original exception so the real reason (missing file, permissions, ...) is not lost
    throw new RuntimeException("Read stop words failed path : " + stopWordsPath, e);
  }
}
 
Example #15
Source File: KeywordMarkerTokenFilterFactory.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the factory and builds the keyword lookup set.
 * <p>
 * The keywords are obtained through {@code Analysis.getWordSet} for the {@code keywords} prefix;
 * {@code ignore_case} (default {@code false}) controls the case handling of the lookup set.
 *
 * @throws IllegalArgumentException if no keywords could be obtained
 */
@Inject
public KeywordMarkerTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final boolean caseInsensitive = settings.getAsBoolean("ignore_case", false);
    final Set<?> keywords = Analysis.getWordSet(env, settings, "keywords");
    if (keywords != null) {
        keywordLookup = new CharArraySet(keywords, caseInsensitive);
    } else {
        throw new IllegalArgumentException("keyword filter requires either `keywords` or `keywords_path` to be configured");
    }
}
 
Example #16
Source File: BasqueAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public BasqueAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #17
Source File: IrishAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public IrishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new IrishAnalyzer(Analysis.parseStopWords(env, settings, IrishAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #18
Source File: LindenStandardAnalyzerFactory.java    From linden with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@link StandardAnalyzer}.
 *
 * @param params factory parameters; when {@code STOPWORDS_EMPTY} is present and parses to
 *               {@code true}, the analyzer is created with {@link CharArraySet#EMPTY_SET}
 * @return an analyzer with an empty stop word set if requested, otherwise one created with the
 *         no-argument constructor
 */
@Override
public StandardAnalyzer getInstance(Map<String, String> params) throws IOException {
  final boolean emptyStopWords =
      params.containsKey(STOPWORDS_EMPTY) && Boolean.parseBoolean(params.get(STOPWORDS_EMPTY));
  return emptyStopWords
      ? new StandardAnalyzer(CharArraySet.EMPTY_SET)
      : new StandardAnalyzer();
}
 
Example #19
Source File: Analysis.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@link CharArraySet} from the word list obtained through {@code getWordList}.
 *
 * @param env            the environment handed to {@code getWordList}
 * @param settings       the settings to read from
 * @param settingsPrefix the setting prefix; {@code settingsPrefix + "_case"} (default {@code false})
 *                       is passed as the set's {@code ignoreCase} flag
 * @return the word set, or {@code null} when no word list is available
 */
public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix) {
    final List<String> words = getWordList(env, settings, settingsPrefix);
    if (words != null) {
        final boolean ignoreCase = settings.getAsBoolean(settingsPrefix + "_case", false);
        return new CharArraySet(words, ignoreCase);
    }
    return null;
}
 
Example #20
Source File: Analysis.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a word set in which every entry that is a key of {@code namedWords} is replaced by the
 * contents of the set it maps to; all other entries are added as they are.
 *
 * @param words      the entries to resolve
 * @param namedWords named word sets, or {@code null} to use {@code words} unchanged
 * @param ignoreCase the {@code ignoreCase} flag of the resulting set
 * @return the resolved word set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
    if (namedWords == null) {
        return new CharArraySet(words, ignoreCase);
    }

    final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
    for (final String entry : words) {
        if (!namedWords.containsKey(entry)) {
            resolved.add(entry);
            continue;
        }
        resolved.addAll(namedWords.get(entry));
    }
    return resolved;
}
 
Example #21
Source File: LatvianAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public LatvianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new LatvianAnalyzer(Analysis.parseStopWords(env, settings, LatvianAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #22
Source File: LithuanianAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public LithuanianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new LithuanianAnalyzer(Analysis.parseStopWords(env, settings, LithuanianAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #23
Source File: HindiAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public HindiAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new HindiAnalyzer(Analysis.parseStopWords(env, settings, HindiAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #24
Source File: SoraniAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public SoraniAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new SoraniAnalyzer(Analysis.parseStopWords(env, settings, SoraniAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #25
Source File: CatalanAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public CatalanAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new CatalanAnalyzer(Analysis.parseStopWords(env, settings, CatalanAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #26
Source File: TurkishAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public TurkishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new TurkishAnalyzer(Analysis.parseStopWords(env, settings, TurkishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #27
Source File: BulgarianAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public BulgarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BulgarianAnalyzer(Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()),
                                     Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #28
Source File: RussianAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public RussianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new RussianAnalyzer(Analysis.parseStopWords(env, settings, RussianAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #29
Source File: DutchAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public DutchAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new DutchAnalyzer(Analysis.parseStopWords(env, settings, DutchAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example #30
Source File: FinnishAnalyzerProvider.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Inject
public FinnishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new FinnishAnalyzer(Analysis.parseStopWords(env, settings, FinnishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}