/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universit├Ąt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.LanguageIdentifier;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

/**
 * @author Ivan Habernal
 */
public class CybozuLanguageIdentifier
        implements LanguageIdentifier
{
    private static final double PROBABILITY_THRESHOLD = 0.80;

    static final String[] PROFILES = { "af", "bn", "de", "es", "fi", "he", "hu", "ja", "lt", "ml",
            "nl", "pl", "ru", "so", "sw", "th", "uk", "zh-cn", "ar", "cs", "el", "et", "fr", "hi",
            "id", "kn", "lv", "mr", "no", "pt", "sk", "sq", "ta", "tl", "ur", "zh-tw", "bg", "da",
            "en", "fa", "gu", "hr", "it", "ko", "mk", "ne", "pa", "ro", "sl", "sv", "te", "tr",
            "vi" };

    public CybozuLanguageIdentifier()
    {
        List<String> jsonProfiles = new ArrayList<String>();
        for (String profile : PROFILES) {
            // locate the stream
            String resourceName = "profiles/" + profile;
            InputStream inputStream = CybozuLanguageIdentifier.class.getClassLoader()
                    .getResourceAsStream(resourceName);

            if (inputStream == null) {
                throw new RuntimeException(
                        "Cannot locate resource " + resourceName + " on the classpath.");
            }

            // read the profile to string
            StringWriter sw = new StringWriter();
            try {
                IOUtils.copy(inputStream, sw, "utf-8");
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }

            // add to all profiles
            jsonProfiles.add(sw.toString());
        }

        // and load all languages
        try {
            DetectorFactory.loadProfile(jsonProfiles);
        }
        catch (LangDetectException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public String identifyLanguage(String html)
            throws IOException
    {
        // extracting plain html text
        Document doc = Jsoup.parse(html);
        String text = doc.text();

        // we might have removed everything -> no lang
        if (text.isEmpty()) {
            return UNKNOWN_LANGUAGE;
        }

        try {
            Detector detector = DetectorFactory.create();
            detector.append(text);
            String detectedLang = detector.detect();

            ArrayList<Language> detectedProbabilities = detector.getProbabilities();

            if (detectedProbabilities.get(0).prob > PROBABILITY_THRESHOLD) {
                return detectedLang;
            }
            else {
                return UNKNOWN_LANGUAGE;
            }
        }
        catch (LangDetectException e) {
            return UNKNOWN_LANGUAGE;
        }
    }

}