// Java source code of PartOfSpeechTagging

/**
 *
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, [email protected]
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.apdplat.word.tagging;

import org.apdplat.word.WordSegmenter;
import org.apdplat.word.recognition.RecognitionTool;
import org.apdplat.word.segmentation.PartOfSpeech;
import org.apdplat.word.segmentation.Word;
import org.apdplat.word.util.AutoDetector;
import org.apdplat.word.util.GenericTrie;
import org.apdplat.word.util.ResourceLoader;
import org.apdplat.word.util.WordConfTools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;

/**
 * Part-of-speech tagging.
 * <p>
 * Tags are looked up in a dictionary of {@code word:pos} lines held in a trie. Words that
 * are not in the dictionary fall back to pattern-based recognition (English, numbers,
 * decimals/fractions and quantifiers).
 *
 * @author 杨尚川
 */
public class PartOfSpeechTagging {
    private PartOfSpeechTagging(){}

    private static final Logger LOGGER = LoggerFactory.getLogger(PartOfSpeechTagging.class);
    private static final GenericTrie<String> GENERIC_TRIE = new GenericTrie<>();
    /** Separator between the word and its tag in a dictionary line. */
    private static final String FIELD_SEPARATOR = ":";
    static{
        reload();
    }

    /**
     * (Re)registers the dictionary loader for the path configured under
     * {@code part.of.speech.dic.path}. Does nothing except log a message when
     * {@code pos.tag.enable} is configured as {@code false}.
     */
    public static void reload(){
        if(!WordConfTools.getBoolean("pos.tag.enable", true)){
            LOGGER.info("未启用词性标注");
            return;
        }
        AutoDetector.loadAndWatch(new ResourceLoader() {

            @Override
            public void clear() {
                GENERIC_TRIE.clear();
            }

            @Override
            public void load(List<String> lines) {
                LOGGER.info("初始化词性标注器");
                int count = 0;
                for (String line : lines) {
                    if (putEntry(line)) {
                        count++;
                    }
                }
                LOGGER.info("词性标注器初始化完毕，词性数据条数：{}", count);
            }

            @Override
            public void add(String line) {
                putEntry(line);
            }

            @Override
            public void remove(String line) {
                removeEntry(line);
            }

        }, WordConfTools.get("part.of.speech.dic.path", "classpath:part_of_speech_dic.txt"));
    }

    /**
     * Parses one {@code word:pos} dictionary line and stores it in the trie.
     * Malformed lines are reported in the log and skipped; they never throw.
     *
     * @param line raw dictionary line
     * @return {@code true} if the entry was stored, {@code false} if the line was rejected
     */
    private static boolean putEntry(String line) {
        if (line == null) {
            LOGGER.error("错误的词性数据：{}", line);
            return false;
        }
        // Only the first two fields are used; any further ':'-separated fields are ignored.
        String[] attr = line.split(FIELD_SEPARATOR);
        if (attr.length < 2) {
            LOGGER.error("错误的词性数据：{}", line);
            return false;
        }
        try {
            GENERIC_TRIE.put(attr[0], attr[1]);
            return true;
        } catch (Exception e) {
            // Callback boundary: a bad entry must not abort loading, but keep the cause.
            LOGGER.error("错误的词性数据：" + line, e);
            return false;
        }
    }

    /**
     * Removes the word named by a dictionary line from the trie. Only the part before
     * the first separator is used, so a line without a tag is accepted here.
     *
     * @param line raw dictionary line
     */
    private static void removeEntry(String line) {
        if (line == null) {
            LOGGER.error("错误的词性数据：{}", line);
            return;
        }
        String[] attr = line.split(FIELD_SEPARATOR);
        // split() drops trailing empty fields, so a line made only of separators yields no key.
        if (attr.length == 0) {
            LOGGER.error("错误的词性数据：{}", line);
            return;
        }
        try {
            GENERIC_TRIE.remove(attr[0]);
        } catch (Exception e) {
            // Callback boundary: keep the cause instead of silently dropping it.
            LOGGER.error("错误的词性数据：" + line, e);
        }
    }

    /**
     * Tags every word in the list that does not already carry a part of speech.
     * The words are modified in place. A {@code null} list is ignored.
     *
     * @param words words to tag
     */
    public static void process(List<Word> words){
        if (words == null) {
            return;
        }
        for (Word word : words) {
            if (word.getPartOfSpeech() != null) {
                LOGGER.debug("忽略已经标注过的词：{}", word);
                continue;
            }
            String wordText = word.getText();
            String pos = GENERIC_TRIE.get(wordText);
            if (pos == null) {
                pos = recognize(wordText);
            }
            // NOTE(review): pos is still null when neither the dictionary nor any recognizer
            // matched; this relies on PartOfSpeech.valueOf accepting null - confirm.
            word.setPartOfSpeech(PartOfSpeech.valueOf(pos));
        }
    }

    /**
     * Pattern-based fallback for words missing from the dictionary. Every rule is
     * evaluated in order and a later match overrides an earlier one.
     *
     * @param wordText text of the word
     * @return the recognized tag, or {@code null} if no rule matched
     */
    private static String recognize(String wordText) {
        String pos = null;
        // English
        if (RecognitionTool.isEnglish(wordText)) {
            pos = "w";
        }
        // Number
        if (RecognitionTool.isNumber(wordText)) {
            pos = "m";
        }
        // Chinese numeral
        if (RecognitionTool.isChineseNumber(wordText)) {
            pos = "mh";
        }
        // Decimals and fractions; the fraction mark wins when both kinds of mark are present
        if (RecognitionTool.isFraction(wordText)) {
            if (containsAny(wordText, ".", "．", "·")) {
                pos = "mx";
            }
            if (containsAny(wordText, "/", "／")) {
                pos = "mf";
            }
        }
        // Quantifier
        if (RecognitionTool.isQuantifier(wordText)) {
            pos = quantifierTag(wordText);
        }
        return pos;
    }

    /**
     * Picks the tag for a word already recognized as a quantifier.
     *
     * @param wordText text of the word
     * @return {@code "mf"}, {@code "tq"}, {@code "tdq"} or {@code "mq"}
     */
    private static String quantifierTag(String wordText) {
        // Per-mille / percent: tagged like a fraction
        if (containsAny(wordText, "‰", "%", "％")) {
            return "mf";
        }
        // Time quantifier
        if (containsAny(wordText, "时", "分", "秒")) {
            return "tq";
        }
        // Date quantifier
        if (containsAny(wordText, "年", "月", "日", "天", "号")) {
            return "tdq";
        }
        // Plain quantifier
        return "mq";
    }

    /**
     * Returns whether the text contains at least one of the given markers.
     *
     * @param text    text to search
     * @param markers substrings to look for
     * @return {@code true} if any marker occurs in the text
     */
    private static boolean containsAny(String text, String... markers) {
        for (String marker : markers) {
            if (text.contains(marker)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Small demo: segments a sample sentence and prints it before and after tagging.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<Word> words = WordSegmenter.segWithStopWords("我爱中国，我爱杨尚川");
        System.out.println("未标注词性："+words);
        // Part-of-speech tagging
        PartOfSpeechTagging.process(words);
        System.out.println("标注词性："+words);
    }
}