/**
 * 
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, [email protected]
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package org.apdplat.qa.parser;

import java.io.File;
import java.util.List;

import org.apdplat.qa.util.Tools;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.Word;
import org.apdplat.word.tagging.PartOfSpeechTagging;
import org.apdplat.word.util.WordConfTools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * 分词器
 *
 * @author 杨尚川
 */
public class WordParser {

    private static final Logger LOG = LoggerFactory.getLogger(WordParser.class);

    static {
        String appPath = Tools.getAppPath(WordParser.class);
        String confFile = appPath + "/web/dic/word_v_1_3/word.local.conf";
        if(!new File(confFile).exists()){
            confFile = appPath + "/jar/dic/word_v_1_3/word.local.conf";
        }
        if(new File(confFile).exists()){
            LOG.info("word分词的自定义配置文件:"+confFile);
            WordConfTools.forceOverride(confFile);
        }else{
            LOG.info("不存在word分词的自定义配置文件:"+confFile);
        }
    }

    /**
     * 带词性标注(包括细分词性标注)的分析方法
     *
     * @param str 需要分词的文本
     * @return 分词结果
     */
    public static List<Word> parseWithoutStopWords(String str) {
        List<Word> words = WordSegmenter.seg(str, SegmentationAlgorithm.MaxNgramScore);
        //词性标注
        PartOfSpeechTagging.process(words);
        return words;
    }
    public static List<Word> parse(String str) {
        List<Word> words = WordSegmenter.segWithStopWords(str, SegmentationAlgorithm.MaxNgramScore);
        //词性标注
        PartOfSpeechTagging.process(words);
        return words;
    }

    public static void main(String[] args) {
        List<Word> parse = parse("在河边一排排梨树下面有许多的非洲象和熊猫,还有很多的桉树,红色的金鱼在水里游来游去,猎豹在绿色的草地上跑来跑去!");
        System.out.println(parse);
        parse = parse("布什是个什么样的人");
        System.out.println(parse);
        parse = parse("张三和");
        System.out.println(parse);
        parse = parse("哈雷彗星的发现者是六小龄童和伦琴,专访微软亚洲研究院院长洪小文");
        System.out.println(parse);
        String str = " 《创业邦》杂志记者对微软亚洲研究院院长洪小文进行了专访。 《创业邦》:微软亚洲  研究院 ... 从研发的角度来说,研究院是一个战略性的部门。因为一家公司最后成功与   ...";
        parse = parse(str);
        System.out.println(parse);
    }
}