package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import junit.framework.TestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

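/**
 * Smoke tests for the HanLP Lucene {@link Tokenizer} and the pinyin token filter.
 * Each test prints [startOffset:endOffset positionIncrement] term/type tuples to stdout.
 */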
public class HanLPTokenizerTest extends TestCase
{
    Tokenizer tokenizer;

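    /**
     * Builds an index-mode tokenizer with Japanese name recognition enabled and
     * points it at a sample sentence; reset() readies it for incrementToken().
     */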
    @Override
    public void setUp() throws Exception
    {
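        // Index mode emits overlapping sub-tokens for finer-grained search; the null
        // argument means no stop-word filter and false disables Porter stemming
        // (assumed semantics of the HanLPTokenizer constructor parameters).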
        tokenizer = new HanLPTokenizer(HanLP.newSegment()
                                               .enableJapaneseNameRecognize(true)
                                               .enableIndexMode(true), null, false);
        tokenizer.setReader(new StringReader("林志玲亮相网友:确定不是波多野结衣?"));
        tokenizer.reset();
    }

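    /**
     * Drains the tokenizer prepared in {@link #setUp()} and prints each token's
     * offsets, position increment, term text and type.
     */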
    public void testIncrementToken() throws Exception
    {
        while (tokenizer.incrementToken())
        {
            CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
            // offset of the term in the original text
            OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
            // position increment relative to the previous token
            PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
            // token type (part of speech)
            TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
    }

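    /**
     * Verifies the Tokenizer reuse cycle across several inputs:
     * setReader() -> reset() -> incrementToken()* -> end() -> close().
     */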
    public void testMultiText() throws Exception
    {
        String[] sentences = new String[]{
                "中华人民共和国",
                "地大物博"
        };
        tokenizer = new HanLPTokenizer(HanLP.newSegment()
                                               .enableJapaneseNameRecognize(true)
                                               .enableIndexMode(true), null, false);
        for (String sentence : sentences)
        {
            tokenizer.setReader(new StringReader(sentence));
            tokenizer.reset();
            testIncrementToken();
            tokenizer.end(); // the TokenStream contract requires end() before close()
            tokenizer.close();
        }
    }

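    /**
     * Wraps the tokenizer in a pinyin filter built from factory arguments and
     * prints the resulting token stream; the underlying tokenizer was already
     * reset in {@link #setUp()}.
     */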
    public void testPinyinTokenFilter() throws Exception
    {
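        // Assumed semantics of the factory arguments: "original" keeps the surface
        // form, "pinyin" would add full-pinyin terms, and "pinyinFirstChar" adds
        // first-letter abbreviations (e.g. 林志玲 -> lzl).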
        Map<String, String> args = new HashMap<>();
        args.put("original", "true");
        args.put("pinyin", "false");
        args.put("pinyinFirstChar", "true");
        HanLPPinyinTokenFilterFactory factory = new HanLPPinyinTokenFilterFactory(args);
        TokenStream tokenStream = factory.create(tokenizer);
        while (tokenStream.incrementToken())
        {
            // read attributes from the filtered stream, not the underlying tokenizer
            CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
            // offset of the term in the original text
            OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
            // position increment relative to the previous token
            PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
            // token type (part of speech)
            TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
            System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(), positionAttr.getPositionIncrement(), attribute, typeAttr.type());
        }
        tokenStream.end();
        tokenStream.close();
    }
}