java source code of SimpleSynonymMap

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ginobefunny.elasticsearch.plugins.synonym.service;

import com.ginobefunny.elasticsearch.plugins.synonym.service.utils.Monitor;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.logging.ESLoggerFactory;

import java.io.IOException;
import java.util.*;

/**
 * Created by ginozhang on 2017/1/12.
 * SEE: org.apache.lucene.analysis.synonym.SolrSynonymParser
 */
public class SimpleSynonymMap {

    private static final Logger LOGGER = ESLoggerFactory.getLogger(Monitor.class.getName());

    private Map<String, List<String>> ruleMap = new HashMap<String, List<String>>();

    private final Configuration configuration;

    public SimpleSynonymMap(Configuration cfg) {
        this.configuration = cfg;
    }

    public void addRule(String rule) {
        try {
            addInternal(rule);
        } catch (Throwable t) {
            LOGGER.error("Add synonym rule failed. rule: " + rule, t);
        }
    }

    private void addInternal(String line) throws IOException {
        String sides[] = split(line, "=>");
        if (sides.length > 1) { // explicit mapping
            if (sides.length != 2) {
                throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
            }

            List<String> inputList = new ArrayList<>();
            String inputStrings[] = split(sides[0], ",");
            for (int i = 0; i < inputStrings.length; i++) {
                inputList.addAll(analyze(process(inputStrings[i])));
            }

            List<String> outputList = new ArrayList<>();
            String outputStrings[] = split(sides[1], ",");
            for (int i = 0; i < outputStrings.length; i++) {
                outputList.addAll(analyze(process(outputStrings[i])));
            }

            // these mappings are explicit and never preserve original
            for (String input : inputList) {
                for (String output : outputList) {
                    addToRuleMap(input, output);
                }
            }
        } else {
            List<String> inputList = new ArrayList<>();
            String inputStrings[] = split(line, ",");
            for (int i = 0; i < inputStrings.length; i++) {
                inputList.addAll(analyze(process(inputStrings[i])));
            }

            if (configuration.isExpand()) {
                // all pairs
                for (String input : inputList) {
                    for (String output : inputList) {
                        addToRuleMap(input, output);
                    }
                }
            } else {
                // all subsequent inputs map to first one; we also add inputs[0] here
                // so that we "effectively" (because we remove the original input and
                // add back a synonym with the same text) change that token's type to
                // SYNONYM (matching legacy behavior):
                for (int i = 0; i < inputList.size(); i++) {
                    addToRuleMap(inputList.get(i), inputList.get(0));
                }
            }
        }
    }

    private Set<String> analyze(String text) throws IOException {
        Set<String> result = new HashSet<String>();
        Analyzer analyzer = configuration.getAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("", text)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                int length = termAtt.length();
                if (length == 0) {
                    throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
                }
                if (posIncAtt.getPositionIncrement() != 1) {
                    throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
                }

                result.add(new String(termAtt.buffer(), 0, termAtt.length()));
            }

            ts.end();
            return result;
        }
    }

    private void addToRuleMap(String inputString, String outputString) {
        List<String> outputs = ruleMap.get(inputString);
        if (outputs == null) {
            outputs = new ArrayList<String>();
            ruleMap.put(inputString, outputs);
        }

        if (!outputs.contains(outputString)) {
            outputs.add(outputString);
        }
    }

    private static String[] split(String s, String separator) {
        List<String> list = new ArrayList<String>(2);
        StringBuilder sb = new StringBuilder();
        int pos = 0, end = s.length();
        while (pos < end) {
            if (s.startsWith(separator, pos)) {
                if (sb.length() > 0) {
                    list.add(sb.toString());
                    sb = new StringBuilder();
                }
                pos += separator.length();
                continue;
            }

            char ch = s.charAt(pos++);
            if (ch == '\\') {
                sb.append(ch);
                if (pos >= end) break;  // ERROR, or let it go?
                ch = s.charAt(pos++);
            }

            sb.append(ch);
        }

        if (sb.length() > 0) {
            list.add(sb.toString());
        }

        return list.toArray(new String[list.size()]);
    }

    private String process(String input) {

        String inputStr = configuration.isIgnoreCase() ? input.trim().toLowerCase(Locale.getDefault()) : input;
        if (inputStr.indexOf("\\") >= 0) {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < inputStr.length(); i++) {
                char ch = inputStr.charAt(i);
                if (ch == '\\' && i < inputStr.length() - 1) {
                    sb.append(inputStr.charAt(++i));
                } else {
                    sb.append(ch);
                }
            }
            return sb.toString();
        }
        return inputStr;
    }

    public List<String> getSynonymWords(String input) {
        if (!ruleMap.containsKey(input)) {
            return null;
        }

        return ruleMap.get(input);
    }

}