package com.jstarcraft.nlp.lucene;

import java.io.StringReader;
import java.util.Locale;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.junit.Assert;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.jstarcraft.core.utility.StringUtility;

/**
 * Base test case for Lucene {@link Tokenizer} implementations.
 *
 * <p>
 * Subclasses supply the tokenizer under test through {@link #getSegmenter()}.
 * The shared test feeds it a fixed mixed Chinese/English sentence and checks
 * that every emitted term equals (ignoring case) the slice of the input text
 * addressed by the token's start and end offsets.
 * </p>
 */
public abstract class NlpSegmenterTestCase {

    private static final Logger LOGGER = LoggerFactory.getLogger(NlpSegmenterTestCase.class);

    /**
     * Provides the tokenizer to be exercised by {@link #testSegmenter()}.
     *
     * @return the tokenizer under test
     */
    protected abstract Tokenizer getSegmenter();

    /**
     * Tokenizes a fixed sample sentence and asserts that each term matches the
     * text found at the offsets the tokenizer reports for it.
     *
     * @throws Exception if the tokenizer fails while reading or closing
     */
    @Test
    public void testSegmenter() throws Exception {
        String text = "中华人民共和国(People's Republic of China),简称'中国'";
        // try-with-resources guarantees close() even when an assertion fails, so
        // the tokenizer is left in a reusable state for a later setReader call.
        try (Tokenizer segmenter = getSegmenter()) {
            segmenter.setReader(new StringReader(text));
            segmenter.reset();

            // Attributes are looked up once after reset() and reused for every
            // token instead of being fetched again on each iteration.
            // Term text
            CharTermAttribute term = segmenter.getAttribute(CharTermAttribute.class);
            // Character offsets
            OffsetAttribute offset = segmenter.getAttribute(OffsetAttribute.class);
            // Position increment and token type are not inspected by this test,
            // but they are still requested: getAttribute throws if the tokenizer
            // does not expose them, so the test keeps failing in that case.
            segmenter.getAttribute(PositionIncrementAttribute.class);
            segmenter.getAttribute(TypeAttribute.class);

            while (segmenter.incrementToken()) {
                int begin = offset.startOffset();
                int end = offset.endOffset();
                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug(StringUtility.format("segmenter:term is {}, begin is {}, end is {}", term, begin, end));
                }
                // Locale.ROOT keeps the case-insensitive comparison independent of
                // the JVM default locale. The text slice is the expected value.
                String expected = text.substring(begin, end).toLowerCase(Locale.ROOT);
                String actual = term.toString().toLowerCase(Locale.ROOT);
                Assert.assertEquals(expected, actual);
            }
            // Signals end-of-stream once all tokens have been consumed.
            segmenter.end();
        }
    }

}