package eu.modernmt.processing.tokenizer.corenlp;

import edu.stanford.nlp.international.arabic.process.ArabicTokenizer;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.HasOffset;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import eu.modernmt.lang.Language;
import eu.modernmt.lang.UnsupportedLanguageException;
import eu.modernmt.processing.tokenizer.BaseTokenizer;
import eu.modernmt.processing.tokenizer.TokenizedString;

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

/**
 * {@link BaseTokenizer.Annotator} that marks word boundaries on a {@link TokenizedString}
 * using the Stanford CoreNLP tokenizer registered for a given {@link Language}.
 * <p>
 * One {@link TokenizerFactory} per supported language is created at class-initialization time
 * and shared by every annotator instance built for that language. Because of this sharing,
 * factories are fully configured before being published and tokenizer creation is serialized
 * on the factory itself (not on the annotator instance).
 */
public class CoreNLPTokenAnnotator implements BaseTokenizer.Annotator {

    /** Shared, per-language tokenizer factories. Populated once in the static initializer and never modified afterwards. */
    private static final Map<Language, TokenizerFactory<?>> FACTORIES = new HashMap<>();

    static {
        // The English factory is configured here, once, before it becomes reachable by any
        // annotator: re-applying the options on every forLanguage() call would mutate a shared
        // object while other threads may be creating tokenizers from it.
        TokenizerFactory<?> english = PTBTokenizer.factory();
        english.setOptions("ptb3Escaping=false,asciiQuotes=true,normalizeSpace=false");

        FACTORIES.put(Language.ENGLISH, english);
        FACTORIES.put(Language.ARABIC, ArabicTokenizer.factory());
        FACTORIES.put(Language.FRENCH, FrenchTokenizer.factory());
        FACTORIES.put(Language.SPANISH, SpanishTokenizer.factory());
    }

    /** Factory used to create a fresh tokenizer for every annotated string; shared with other instances for the same language. */
    private final TokenizerFactory<?> factory;

    /**
     * Creates an annotator backed by the CoreNLP tokenizer registered for the given language.
     *
     * @param language the language of the text to tokenize
     * @return an annotator for {@code language}
     * @throws UnsupportedLanguageException if no CoreNLP tokenizer is registered for {@code language}
     */
    public static CoreNLPTokenAnnotator forLanguage(Language language) throws UnsupportedLanguageException {
        TokenizerFactory<?> factory = FACTORIES.get(language);
        if (factory == null)
            throw new UnsupportedLanguageException(language);

        return new CoreNLPTokenAnnotator(factory);
    }

    private CoreNLPTokenAnnotator(TokenizerFactory<?> factory) {
        this.factory = factory;
    }

    /**
     * Tokenizes the text of {@code string} and registers every token that exposes character
     * offsets as a word via {@link TokenizedString#setWord(int, int)}. Tokens that do not
     * implement {@link HasOffset} are ignored.
     *
     * @param string the string to annotate; it is modified in place
     */
    @Override
    public void annotate(TokenizedString string) {
        Reader reader = new StringReader(string.toString());
        edu.stanford.nlp.process.Tokenizer<?> tokenizer;

        // The factory is shared across annotator instances, so the lock must be the factory:
        // locking on 'this' would not exclude other annotators built for the same language.
        // NOTE(review): whether CoreNLP factories actually require this serialization is not
        // visible from this file; the original code serialized the call, so it is kept.
        synchronized (this.factory) {
            tokenizer = this.factory.getTokenizer(reader);
        }

        // Iteration happens outside the lock: the tokenizer instance is local to this call.
        while (tokenizer.hasNext()) {
            Object token = tokenizer.next();

            if (token instanceof HasOffset) {
                HasOffset hasOffset = (HasOffset) token;
                int begin = hasOffset.beginPosition();
                int end = hasOffset.endPosition();

                string.setWord(begin, end);
            }
        }
    }

}