org.apdplat.word.util.WordConfTools Java Examples

The following examples show how to use org.apdplat.word.util.WordConfTools. They are drawn from several open-source projects; the source file, originating project, and license are noted with each example.
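Before working through the individual examples, here is a minimal, self-contained sketch of the WordConfTools calls these snippets rely on: get, getInt, set, and forceOverride, all of which appear in the examples below. The configuration keys and the conf path used here are illustrative assumptions, not required values.

import org.apdplat.word.util.WordConfTools;

public class WordConfToolsDemo {
    public static void main(String[] args) {
        // Read a value, falling back to the given default when the key is not configured
        String ngram = WordConfTools.get("ngram", "bigram");
        int combineMaxLength = WordConfTools.getInt("word.refine.combine.max.length", 3);

        // Override a setting programmatically; subsequent lookups see the new value
        WordConfTools.set("keep.whitespace", "true");

        // Force-override the default configuration from a custom conf resource
        // (the path below is a hypothetical example)
        WordConfTools.forceOverride("classpath:custom.conf");

        System.out.println("ngram=" + ngram + ", combineMaxLength=" + combineMaxLength);
    }
}
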
Example #1
Source File: ChineseWordAnalyzerTest.java    From jstarcraft-nlp with Apache License 2.0
@Test
public void test1() {
    try {
        Analyzer analyzer = new WordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        Assert.assertEquals(expResult, words.toString());
    } catch (IOException e) {
        Assert.fail("分词出错" + e.getMessage());
    }
}
 
Example #2
Source File: ChineseWordAnalyzerTest.java    From word with Apache License 2.0
@Test
public void test1() {
    try{
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while(tokenStream.incrementToken()){
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[杨尚川, 是, apdplat, 应用级, 产品, 开发, 平台, 的, 作者]";
        if("bigram".equals(WordConfTools.get("ngram", "bigram"))){
            expResult = "[杨尚川, 是, apdplat, 应用, 级, 产品, 开发, 平台, 的, 作者]";
        }
        assertEquals(expResult, words.toString());
    }catch(IOException e){
        fail("分词出错"+e.getMessage());
    }
}
 
Example #3
Source File: WordTokenizerFactory.java    From jstarcraft-nlp with Apache License 2.0
public WordTokenizerFactory(Map<String, String> configuration) {
    super(configuration);
    if (configuration != null) {
        String conf = configuration.get("conf");
        if (conf != null) {
            // Force-override the default configuration
            WordConfTools.forceOverride(conf);
        } else {
            LOGGER.info("没有指定conf参数");
        }
        String algorithm = configuration.get("segAlgorithm");
        if (algorithm != null) {
            try {
                SegmentationAlgorithm segmentationAlgorithm = SegmentationAlgorithm.valueOf(algorithm);
                segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
                LOGGER.info("使用指定分词算法:" + algorithm);
            } catch (Exception e) {
                LOGGER.error("参数segAlgorithm指定的值错误:" + algorithm);
                LOGGER.error("参数segAlgorithm可指定的值有:");
                for (SegmentationAlgorithm sa : SegmentationAlgorithm.values()) {
                    LOGGER.error("\t" + sa.name());
                }
            }
        } else {
            LOGGER.info("没有指定segAlgorithm参数");
        }
    }
    if (segmentation == null) {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
        LOGGER.info("使用默认分词算法:" + SegmentationAlgorithm.BidirectionalMaximumMatching);
    }
}
 
Example #4
Source File: WordSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0
@Override
public Segmentation build(Map<String, String> configurations) {
    for (Entry<String, String> keyValue : configurations.entrySet()) {
        String key = keyValue.getKey();
        String value = keyValue.getValue();
        WordConfTools.set(key, value);
    }

    String algorithm = get(configurations, "algorithm", "FullSegmentation");
    Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(algorithm));
    return segmentation;
}
 
Example #5
Source File: WordSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Override
protected Tokenizer getSegmenter() {
    // These settings can also be placed in word.local.conf
    // Keep punctuation
    WordConfTools.set("keep.punctuation", "true");
    // Keep whitespace
    WordConfTools.set("keep.whitespace", "true");
    Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    WordTokenizer tokenizer = new WordTokenizer(segmentation);
    return tokenizer;
}
 
Example #6
Source File: WordTokenizerTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Override
protected NlpTokenizer<? extends NlpToken> getTokenizer() {
    // Keep punctuation
    WordConfTools.set("keep.punctuation", "true");
    // Keep whitespace
    WordConfTools.set("keep.whitespace", "true");
    Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    return new WordTokenizer(segmentation);
}
 
Example #7
Source File: wordTest.java    From Doctor with Apache License 2.0
@Test
public void fenci() {
    String word = "有低热乏力倦怠体重下降";
    // Set the dictionary path
    WordConfTools.set("dic.path", "classpath:wordLink.txt");
    // Set the maximum number of characters allowed in a single word
    WordConfTools.set("intercept.length", "10");
    // Reload the dictionary after the dictionary path has been changed
    DictionaryFactory.reload();
    List<Word> words = WordSegmenter.seg(word, SegmentationAlgorithm.MaximumMatching);
    System.out.println(words);
}
 
Example #8
Source File: DictionaryFactory.java    From word with Apache License 2.0
private static Dictionary constructDictionary(){  
    try{
        // Choose the dictionary implementation; a different implementation can be selected via configuration
        String dicClass = WordConfTools.get("dic.class", "org.apdplat.word.dictionary.impl.TrieV4");
        LOGGER.info("dic.class="+dicClass);
        return (Dictionary)Class.forName(dicClass.trim()).newInstance();
    } catch (ClassNotFoundException | IllegalAccessException | InstantiationException ex) {
        System.err.println("词典装载失败:"+ex.getMessage());
        throw new RuntimeException(ex);
    }
}
 
Example #9
Source File: DictionaryFactory.java    From word with Apache License 2.0
private static void test(String dicClass) throws Exception{
    WordConfTools.set("dic.class", dicClass);
    System.gc();
    Thread.sleep(60000);
    Dictionary dictionary = DictionaryFactory.getDictionary();
    System.gc();
    Thread.sleep(60000);
    AtomicInteger h = new AtomicInteger();
    AtomicInteger e = new AtomicInteger();
    // Read the words into a List so they can be iterated repeatedly
    // (a Stream could only be consumed once, but the loop below runs 100 times)
    List<String> words = Files.readAllLines(Paths.get("src/test/resources/dic.txt"));
    System.gc();
    Thread.sleep(60000);
    long start = System.currentTimeMillis();
    for(int i=0; i<100; i++){
        words.forEach(word -> {
            for (int j = 0; j < word.length(); j++) {
                String sw = word.substring(0, j + 1);
                for (int k = 0; k < sw.length(); k++) {
                    if (dictionary.contains(sw, k, sw.length() - k)) {
                        h.incrementAndGet();
                    } else {
                        e.incrementAndGet();
                    }
                }
            }
        });
    }
    long cost = System.currentTimeMillis() - start;
    LOGGER.info(dicClass + " 未查询到的次数:"+ e.get() + ", 查询到的次数:" +  h.get() + " 耗时:" + cost + " 毫秒");
    System.gc();
    Thread.sleep(60000);
    LOGGER.info("test finish");
    System.exit(0);
}
 
Example #10
Source File: ChineseWordTokenizerFactory.java    From word with Apache License 2.0
public ChineseWordTokenizerFactory(Map<String, String> args){
    super(args);
    if(args != null){
        String conf = args.get("conf");
        if(conf != null){
            // Force-override the default configuration
            WordConfTools.forceOverride(conf);
        }else{
            LOGGER.info("没有指定conf参数");
        }
        String algorithm = args.get("segAlgorithm");
        if(algorithm != null){
            try{
                SegmentationAlgorithm segmentationAlgorithm = SegmentationAlgorithm.valueOf(algorithm);
                segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
                LOGGER.info("使用指定分词算法:"+algorithm);
            }catch(Exception e){
                LOGGER.error("参数segAlgorithm指定的值错误:"+algorithm);
                LOGGER.error("参数segAlgorithm可指定的值有:");
                for(SegmentationAlgorithm sa : SegmentationAlgorithm.values()){
                    LOGGER.error("\t"+sa.name());
                }
            }
        }else{
            LOGGER.info("没有指定segAlgorithm参数");
        }
    }
    if(segmentation == null){
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
        LOGGER.info("使用默认分词算法:"+SegmentationAlgorithm.BidirectionalMaximumMatching);
    }
}
 
Example #11
Source File: WordRefiner.java    From word with Apache License 2.0
/**
 * First split words apart, then recombine them
 * @param words the segmentation result to refine
 * @return the refined list of words
 */
public static List<Word> refine(List<Word> words){
    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("对分词结果进行refine之前:{}", words);
    }
    List<Word> result = new ArrayList<>(words.size());
    // Step 1: split words
    for(Word word : words){
        List<Word> splitWords = WordRefiner.split(word);
        if(splitWords==null){
            result.add(word);
        }else{
            if(LOGGER.isDebugEnabled()) {
                LOGGER.debug("词: " + word.getText() + " 被拆分为:" + splitWords);
            }
            result.addAll(splitWords);
        }
    }
    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("对分词结果进行refine阶段的拆词之后:{}", result);
    }
    // Step 2: combine words
    if(result.size()<2){
        return result;
    }
    int combineMaxLength = WordConfTools.getInt("word.refine.combine.max.length", 3);
    if(combineMaxLength < 2){
        combineMaxLength = 2;
    }
    List<Word> finalResult = new ArrayList<>(result.size());
    for(int i=0; i<result.size(); i++){
        List<Word> toCombineWords = null;
        Word combinedWord = null;
        for(int j=2; j<=combineMaxLength; j++){
            int to = i+j;
            if(to > result.size()){
                to = result.size();
            }
            toCombineWords = result.subList(i, to);
            combinedWord = WordRefiner.combine(toCombineWords);
            if(combinedWord != null){
                i += j;
                i--;
                break;
            }
        }
        if(combinedWord == null){
            finalResult.add(result.get(i));
        }else{
            if(LOGGER.isDebugEnabled()) {
                LOGGER.debug("词: " + toCombineWords + " 被合并为:" + combinedWord);
            }
            finalResult.add(combinedWord);
        }
    }
    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("对分词结果进行refine阶段的组词之后:{}", finalResult);
    }
    return finalResult;
}