package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.vi.VietnameseAnalyzer;
import org.apache.lucene.analysis.vi.VietnameseTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.plugin.analysis.vi.AnalysisVietnamesePlugin;
import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
import java.io.StringReader;

import static org.hamcrest.Matchers.*;
import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

/**
 * Created by duydo on 2/19/17.
 */
public class VietnameseAnalysisTest extends ESTestCase {

    public void testSimpleVietnameseAnalysis() throws IOException {
        TestAnalysis analysis = createTestAnalysis();
        assertNotNull(analysis);

        TokenizerFactory tokenizerFactory = analysis.tokenizer.get("vi_tokenizer");
        assertNotNull(tokenizerFactory);
        assertThat(tokenizerFactory, instanceOf(VietnameseTokenizerFactory.class));

        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
        assertNotNull(analyzer);
        assertThat(analyzer.analyzer(), instanceOf(VietnameseAnalyzer.class));

        analyzer = analysis.indexAnalyzers.get("my_analyzer");
        assertNotNull(analyzer);
        assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
        assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(VietnameseTokenizer.class));

    }


    public void testVietnameseTokenizer() throws IOException {
        TestAnalysis analysis = createTestAnalysis();
        TokenizerFactory tokenizerFactory = analysis.tokenizer.get("vi_tokenizer");
        assertNotNull(tokenizerFactory);

        Tokenizer tokenizer = tokenizerFactory.create();
        assertNotNull(tokenizer);

        tokenizer.setReader(new StringReader("công nghệ thông tin Việt Nam"));
        assertTokenStreamContents(tokenizer, new String[]{"công nghệ thông tin", "Việt", "Nam"});
    }

    public void testVietnameseAnalyzer() throws IOException {
        TestAnalysis analysis = createTestAnalysis();
        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
        assertNotNull(analyzer);

        TokenStream ts = analyzer.analyzer().tokenStream("test", "công nghệ thông tin Việt Nam");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        for (String expected : new String[]{"công nghệ thông tin", "việt", "nam"}) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term.toString(), equalTo(expected));
        }
        assertThat(ts.incrementToken(), equalTo(false));
    }

    public static TestAnalysis createTestAnalysis() throws IOException {
        String json = "/org/elasticsearch/index/analysis/vi_analysis.json";
        Settings settings = Settings.builder()
                .loadFromStream(json, VietnameseAnalysisTest.class.getResourceAsStream(json), true)
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .build();
        Settings nodeSettings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build();
        return createTestAnalysis(new Index("test", "_na_"), nodeSettings, settings, new AnalysisVietnamesePlugin());
    }

    public void testTokenOffset() throws IOException {

        TestAnalysis analysis = createTestAnalysis();
        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
        assertNotNull(analyzer);

        TokenStream ts = analyzer.analyzer().tokenStream("test", "Phụ tùng xe Mazda bán tải dưới 7 chỗ: ống dẫn gió tới két làm mát khí nạp- cao su lưu hóa, mới 100%, phục vụ BHBD. Ms:1D0013246A");
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
        ts.reset();
        String[] expected = new String[]{"phụ tùng", "xe", "mazda", "bán", "tải", "7", "chỗ", "ống", "dẫn", "gió", "tới", "két", "làm", "mát", "khí", "nạp", "cao su", "lưu hóa", "mới", "100%", "phục vụ", "bhbd", "ms", "1", "d0", "013246", "a"};
        int[] expectedOffset = new int[]{0, 9, 12, 18, 22, 31, 33, 38, 42, 46, 50, 54, 58, 62, 66, 70, 75, 82, 91, 95, 101, 109, 115, 118, 119, 121, 127};

        for (int i = 0; i < expected.length; i++) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term.toString(), equalTo(expected[i]));
            assertTrue(offset.startOffset() == expectedOffset[i]);
        }
        assertThat(ts.incrementToken(), equalTo(false));
    }
}