Java Code Examples for org.apache.lucene.analysis.util.CharArraySet

The following examples show how to use org.apache.lucene.analysis.util.CharArraySet. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Elasticsearch   Source File: StandardAnalyzerProvider.java    License: Apache License 2.0 6 votes vote down vote up
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);
    final CharArraySet defaultStopwords;
    if (esVersion.onOrAfter(Version.V_1_0_0_Beta1)) {
        defaultStopwords = CharArraySet.EMPTY_SET;
    } else {
        defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }

    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    int maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
    standardAnalyzer = new StandardAnalyzer(stopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(maxTokenLength);
}
 
Example 2
Source Project: Elasticsearch   Source File: PatternAnalyzerProvider.java    License: Apache License 2.0 6 votes vote down vote up
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    Version esVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet defaultStopwords;
    if (esVersion.onOrAfter(Version.V_1_0_0_RC1)) {
        defaultStopwords = CharArraySet.EMPTY_SET;
    } else {
        defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
    boolean lowercase = settings.getAsBoolean("lowercase", true);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

    String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (sPattern == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }
    Pattern pattern = Regex.compile(sPattern, settings.get("flags"));

    analyzer = new PatternAnalyzer(pattern, lowercase, stopWords);
}
 
Example 3
Source Project: Elasticsearch   Source File: Analysis.java    License: Apache License 2.0 6 votes vote down vote up
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
    String value = settings.get("stem_exclusion");
    if (value != null) {
        if ("_none_".equals(value)) {
            return CharArraySet.EMPTY_SET;
        } else {
            // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
            return new CharArraySet(Strings.commaDelimitedListToSet(value), false);
        }
    }
    String[] stemExclusion = settings.getAsArray("stem_exclusion", null);
    if (stemExclusion != null) {
        // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
        return new CharArraySet(Arrays.asList(stemExclusion), false);
    } else {
        return defaultStemExclusion;
    }
}
 
Example 4
Source Project: AdSearch_Endpoints   Source File: QueryParserImpl.java    License: Apache License 2.0 6 votes vote down vote up
@Override
  public List<String> parseQuery(String queryStr) {
    // tokenize queryStr, remove stop word, stemming
	List<String> tokens = new ArrayList<String>();
	AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
	Tokenizer tokenizer = new StandardTokenizer(factory);
	tokenizer.setReader(new StringReader(queryStr));
	CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
//    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
    try {
    	tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            
            tokens.add(term);
//            sb.append(term + " ");
        }
        tokenStream.end();
        tokenStream.close();

        tokenizer.close();  
	} catch (IOException e) {
		e.printStackTrace();
	}
//	System.out.println("QU="+ sb.toString());
	return tokens;	
  }
 
Example 5
Source Project: stratio-cassandra   Source File: SnowballAnalyzerBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a new {@link SnowballAnalyzerBuilder} for the specified language and stopwords.
 *
 * @param language  The language. The supported languages are English, French, Spanish, Portuguese, Italian,
 *                  Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian,
 *                  Turkish, Armenian, Basque and Catalan.
 * @param stopwords The comma separated stopwords {@code String}.
 */
@JsonCreator
public SnowballAnalyzerBuilder(@JsonProperty("language") final String language,
                               @JsonProperty("stopwords") String stopwords) {

    // Check language
    if (language == null || language.trim().isEmpty()) {
        throw new IllegalArgumentException("Language must be specified");
    }

    // Setup stopwords
    CharArraySet stops = stopwords == null ? getDefaultStopwords(language) : getStopwords(stopwords);

    // Setup analyzer
    this.analyzer = buildAnalyzer(language, stops);

    // Force analysis validation
    AnalysisUtils.analyzeAsText("test", analyzer);
}
 
Example 6
Source Project: Elasticsearch   Source File: RomanianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public RomanianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new RomanianAnalyzer(Analysis.parseStopWords(env, settings, RomanianAnalyzer.getDefaultStopSet()),
                                    Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 7
Source Project: Elasticsearch   Source File: BasqueAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public BasqueAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 8
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,  @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet defaultStopwords;
    if (esVersion.onOrAfter(Version.V_1_0_0_RC1)) {
        defaultStopwords = CharArraySet.EMPTY_SET;
    } else {
        defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    analyzer = new StandardHtmlStripAnalyzer(stopWords);
    analyzer.setVersion(version);
}
 
Example 9
Source Project: Elasticsearch   Source File: IndonesianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public IndonesianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new IndonesianAnalyzer(Analysis.parseStopWords(env, settings, IndonesianAnalyzer.getDefaultStopSet()),
                                      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 10
Source Project: Elasticsearch   Source File: ArabicAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public ArabicAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    arabicAnalyzer = new ArabicAnalyzer(Analysis.parseStopWords(env, settings, ArabicAnalyzer.getDefaultStopSet()),
                                        Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    arabicAnalyzer.setVersion(version);
}
 
Example 11
Source Project: Elasticsearch   Source File: SnowballAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public SnowballAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    String language = settings.get("language", settings.get("name", "English"));
    CharArraySet defaultStopwords = defaultLanguageStopwords.containsKey(language) ? defaultLanguageStopwords.get(language) : CharArraySet.EMPTY_SET;
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

    analyzer = new SnowballAnalyzer(language, stopWords);
    analyzer.setVersion(version);
}
 
Example 12
Source Project: Elasticsearch   Source File: StopAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    this.stopAnalyzer = new StopAnalyzer(stopWords);
    this.stopAnalyzer.setVersion(version);
}
 
Example 13
Source Project: Elasticsearch   Source File: SwedishAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public SwedishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new SwedishAnalyzer(Analysis.parseStopWords(env, settings, SwedishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 14
Source Project: Elasticsearch   Source File: SpanishAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public SpanishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new SpanishAnalyzer(Analysis.parseStopWords(env, settings, SpanishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 15
@Inject
public WordDelimiterTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    // Sample Format for the type table:
    // $ => DIGIT
    // % => DIGIT
    // . => DIGIT
    // \u002C => DIGIT
    // \u200D => ALPHANUM
    List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
    if (charTypeTableValues == null) {
        this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
    } else {
        this.charTypeTable = parseTypes(charTypeTableValues);
    }
    int flags = 0;
    // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
    flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
    // If set, causes number subwords to be generated: "500-42" => "500" "42"
    flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
    // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
    flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
    // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
    flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
    // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
    flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
    // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
    flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
    // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
    flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
    // 1, causes "j2se" to be three tokens; "j" "2" "se"
    flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
    // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
    flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
    // If not null is the set of tokens to protect from being delimited
    Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words");
    this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
    this.flags = flags;
}
 
Example 16
Source Project: Elasticsearch   Source File: PortugueseAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public PortugueseAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new PortugueseAnalyzer(Analysis.parseStopWords(env, settings, PortugueseAnalyzer.getDefaultStopSet()),
                                      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 17
Source Project: Elasticsearch   Source File: DanishAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public DanishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new DanishAnalyzer(Analysis.parseStopWords(env, settings, DanishAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 18
Source Project: Elasticsearch   Source File: ArmenianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public ArmenianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new ArmenianAnalyzer(Analysis.parseStopWords(env, settings, ArmenianAnalyzer.getDefaultStopSet()),
                                    Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 19
Source Project: Elasticsearch   Source File: CjkAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public CjkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, CJKAnalyzer.getDefaultStopSet());

    analyzer = new CJKAnalyzer(stopWords);
    analyzer.setVersion(version);
}
 
Example 20
Source Project: Elasticsearch   Source File: GalicianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public GalicianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new GalicianAnalyzer(Analysis.parseStopWords(env, settings, GalicianAnalyzer.getDefaultStopSet()),
                                    Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 21
Source Project: Elasticsearch   Source File: GermanAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public GermanAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new GermanAnalyzer(Analysis.parseStopWords(env, settings, GermanAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 22
Source Project: Elasticsearch   Source File: EnglishAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public EnglishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new EnglishAnalyzer(Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 23
Source Project: Elasticsearch   Source File: BrazilianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public BrazilianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BrazilianAnalyzer(Analysis.parseStopWords(env, settings, BrazilianAnalyzer.getDefaultStopSet()),
                                     Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 24
Source Project: Elasticsearch   Source File: ItalianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public ItalianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new ItalianAnalyzer(Analysis.parseStopWords(env, settings, ItalianAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 25
Source Project: Elasticsearch   Source File: FrenchAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public FrenchAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new FrenchAnalyzer(Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 26
Source Project: Elasticsearch   Source File: HungarianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public HungarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new HungarianAnalyzer(Analysis.parseStopWords(env, settings, HungarianAnalyzer.getDefaultStopSet()),
                                     Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 27
Source Project: Elasticsearch   Source File: CzechAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public CzechAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new CzechAnalyzer(Analysis.parseStopWords(env, settings, CzechAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 28
Source Project: Elasticsearch   Source File: NorwegianAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public NorwegianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new NorwegianAnalyzer(Analysis.parseStopWords(env, settings, NorwegianAnalyzer.getDefaultStopSet()),
                                     Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 29
Source Project: Elasticsearch   Source File: FinnishAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public FinnishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new FinnishAnalyzer(Analysis.parseStopWords(env, settings, FinnishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
 
Example 30
Source Project: Elasticsearch   Source File: DutchAnalyzerProvider.java    License: Apache License 2.0 5 votes vote down vote up
@Inject
public DutchAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new DutchAnalyzer(Analysis.parseStopWords(env, settings, DutchAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}