// Copyright 2016 Xiaomi, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.xiaomi.linden.lucene.analyzer;

import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Analyzer that tokenizes input with a {@link StandardTokenizer}, passes the tokens through the
 * filter produced by a {@link WordDelimiterFilterFactory}, and then optionally lower-cases them
 * and removes English stop words.
 *
 * <p>Both optional steps are enabled unless switched off through the constructor parameters.
 */
public class LindenWordDelimiterAnalyzer extends Analyzer {

  /** Parameter key controlling stop-word removal; parsed with {@link Boolean#parseBoolean}. */
  private static final String SET_STOP_WORDS = "set.stopwords";
  /** Parameter key controlling lower-casing; parsed with {@link Boolean#parseBoolean}. */
  private static final String TO_LOWER_CASE = "lower.case";

  /** Whether a {@link LowerCaseFilter} is appended to the chain. */
  private final boolean toLowerCase;
  /** Whether a {@link StopFilter} using the English stop-word set is appended to the chain. */
  private final boolean setStopWords;

  /** Factory that builds the word-delimiter filter for every token stream. */
  private final WordDelimiterFilterFactory factoryDefault;

  /**
   * Creates the analyzer from a parameter map.
   *
   * <p>Two keys are interpreted by this class and are not forwarded to the factory:
   * <ul>
   *   <li>{@code set.stopwords} - {@code "true"} (case-insensitive) enables English stop-word
   *       removal, any other value disables it; enabled when the key is absent.</li>
   *   <li>{@code lower.case} - {@code "true"} (case-insensitive) enables lower-casing, any other
   *       value disables it; enabled when the key is absent.</li>
   * </ul>
   *
   * <p>All remaining entries are handed to {@link WordDelimiterFilterFactory}. The options below
   * are the word-delimiter settings this analyzer is meant to be configured with:
   * <ul>
   *   <li>{@code generateWordParts} - causes parts of words to be generated:
   *       {@code "PowerShot"} &rarr; {@code "Power" "Shot"}</li>
   *   <li>{@code generateNumberParts} - causes number subwords to be generated:
   *       {@code "500-42"} &rarr; {@code "500" "42"}</li>
   *   <li>{@code catenateWords} - causes maximum runs of word parts to be catenated:
   *       {@code "wi-fi"} &rarr; {@code "wifi"}</li>
   *   <li>{@code catenateNumbers} - causes maximum runs of number parts to be catenated:
   *       {@code "500-42"} &rarr; {@code "50042"}</li>
   *   <li>{@code catenateAll} - causes all subword parts to be catenated:
   *       {@code "wi-fi-4000"} &rarr; {@code "wifi4000"}</li>
   *   <li>{@code preserveOriginal} - causes original words to be preserved and added to the
   *       subword list (defaults to false): {@code "500-42"} &rarr;
   *       {@code "500" "42" "500-42"}</li>
   *   <li>{@code splitOnCaseChange} - if not set, causes case changes to be ignored (subwords
   *       will only be generated given SUBWORD_DELIM tokens)</li>
   *   <li>{@code splitOnNumerics} - if not set, causes numeric changes to be ignored (subwords
   *       will only be generated given SUBWORD_DELIM tokens)</li>
   *   <li>{@code stemEnglishPossessive} - causes trailing {@code "'s"} to be removed for each
   *       subword: {@code "O'Neil's"} &rarr; {@code "O", "Neil"}</li>
   * </ul>
   *
   * <p>The supplied map is copied and never modified, so it may be unmodifiable and may be
   * reused to construct further analyzers.
   *
   * @param params analyzer and word-delimiter settings; must not be {@code null}
   * @throws NullPointerException if {@code params} is {@code null}
   */
  public LindenWordDelimiterAnalyzer(Map<String, String> params) {
    // Work on a private copy: the keys owned by this class are stripped before the factory sees
    // the map, and doing that on the caller's map would fail for unmodifiable maps and would
    // lose these settings if the same map were used to build another analyzer.
    final Map<String, String> factoryArgs = new HashMap<String, String>(params);
    this.setStopWords = takeFlag(factoryArgs, SET_STOP_WORDS, true);
    this.toLowerCase = takeFlag(factoryArgs, TO_LOWER_CASE, true);
    // NOTE(review): the factory presumably rejects keys it does not recognize, which is why the
    // two analyzer-level keys are removed first - confirm against the Lucene version in use.
    this.factoryDefault = new WordDelimiterFilterFactory(factoryArgs);
  }

  /**
   * Removes {@code key} from {@code args} and returns its boolean value.
   *
   * @param args mutable argument map; the entry for {@code key} is removed if present
   * @param key name of the flag to consume
   * @param defaultValue value returned when {@code key} is absent
   * @return the parsed flag, or {@code defaultValue} when the key is not in the map
   */
  private static boolean takeFlag(Map<String, String> args, String key, boolean defaultValue) {
    if (!args.containsKey(key)) {
      return defaultValue;
    }
    // A key mapped to null parses as false, the same as any non-"true" value.
    return Boolean.parseBoolean(args.remove(key));
  }

  /**
   * Builds the analysis chain: standard tokenizer, word-delimiter filter, then the optional
   * lower-case and stop-word filters, in that order.
   *
   * @param s the first argument supplied to this override; not used by this analyzer
   * @param reader source of the text to tokenize
   * @return the tokenizer together with the fully assembled filter chain
   */
  @Override
  protected TokenStreamComponents createComponents(String s, Reader reader) {
    final Tokenizer source = new StandardTokenizer(reader);

    TokenStream ts = factoryDefault.create(source);
    if (this.toLowerCase) {
      ts = new LowerCaseFilter(ts);
    }
    if (this.setStopWords) {
      // Runs after lower-casing (when enabled), so the filter sees lower-cased tokens.
      ts = new StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }
    return new TokenStreamComponents(source, ts);
  }
}