org.apdplat.word.segmentation.SegmentationFactory Java Examples

The following examples show how to use org.apdplat.word.segmentation.SegmentationFactory. They are drawn from two open-source projects, word and jstarcraft-nlp; the source file, project, and license are listed above each example.
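All of the examples follow the same basic pattern: obtain a Segmentation implementation from SegmentationFactory for a chosen SegmentationAlgorithm, then call its seg method on the text to be segmented. Below is a minimal sketch of that pattern, condensed from the examples on this page; the class name SegmentationFactoryDemo is only illustrative, and the imports assume that Segmentation, SegmentationAlgorithm, and Word live in the same org.apdplat.word.segmentation package as the factory.

import java.util.List;
import org.apdplat.word.segmentation.Segmentation;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.SegmentationFactory;
import org.apdplat.word.segmentation.Word;

public class SegmentationFactoryDemo {
    public static void main(String[] args) {
        // ask the factory for the implementation of the chosen algorithm
        Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
        // segment a sample sentence (taken from Example #2 below)
        List<Word> words = segmentation.seg("5月初有哪些电影值得观看");
        System.out.println(words);
    }
}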
Example #1
Source File: TextSimilarity.java    From word with Apache License 2.0
/**
 * Segment the given text into words
 * @param text the text to segment
 * @return the segmentation result
 */
private List<Word> seg(String text){
    if(text == null){
        return Collections.emptyList();
    }
    if(segmentation == null){
        //lazy initialization
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore);
    }
    List<Word> words = segmentation.seg(text);
    if(filterStopWord) {
        //filter out stop words
        StopWord.filterStopWords(words);
    }
    return words;
}
 
Example #2
Source File: AntonymTagging.java    From word with Apache License 2.0
public static void main(String[] args) {
    List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg("5月初有哪些电影值得观看");
    System.out.println(words);
    AntonymTagging.process(words);
    System.out.println(words);
    words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg("由于工作不到位、服务不完善导致顾客在用餐时发生不愉快的事情,餐厅方面应该向顾客作出真诚的道歉,而不是敷衍了事。");
    System.out.println(words);
    AntonymTagging.process(words);
    System.out.println(words);
}
 
Example #3
Source File: ChineseWordTokenizerFactory.java    From word with Apache License 2.0
public ChineseWordTokenizerFactory(Map<String, String> args){
    super(args);
    if(args != null){
        String conf = args.get("conf");
        if(conf != null){
            //force-override the default configuration
            WordConfTools.forceOverride(conf);
        }else{
            LOGGER.info("没有指定conf参数");
        }
        String algorithm = args.get("segAlgorithm");
        if(algorithm != null){
            try{
                SegmentationAlgorithm segmentationAlgorithm = SegmentationAlgorithm.valueOf(algorithm);
                segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
                LOGGER.info("使用指定分词算法:"+algorithm);
            }catch(Exception e){
                LOGGER.error("参数segAlgorithm指定的值错误:"+algorithm);
                LOGGER.error("参数segAlgorithm可指定的值有:");
                for(SegmentationAlgorithm sa : SegmentationAlgorithm.values()){
                    LOGGER.error("\t"+sa.name());
                }
            }
        }else{
            LOGGER.info("没有指定segAlgorithm参数");
        }
    }
    if(segmentation == null){
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
        LOGGER.info("使用默认分词算法:"+SegmentationAlgorithm.BidirectionalMaximumMatching);
    }
}
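For reference, here is a hedged sketch of constructing this factory programmatically; in practice it is normally declared in a Solr schema, and the map below simply mirrors the two attributes the constructor reads, conf and segAlgorithm. This is a fragment (imports and the enclosing method are omitted), and the configuration file name word.local.conf is only an illustrative value, not taken from the example above.

Map<String, String> args = new HashMap<>();
args.put("conf", "word.local.conf");                       // optional: force-override the default configuration (illustrative path)
args.put("segAlgorithm", "BidirectionalMaximumMatching");  // must match a SegmentationAlgorithm name, otherwise the default is used
ChineseWordTokenizerFactory factory = new ChineseWordTokenizerFactory(args);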
 
Example #4
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0
public ChineseWordTokenizer(String segmentationAlgorithm) {
    try{
        SegmentationAlgorithm sa = SegmentationAlgorithm.valueOf(segmentationAlgorithm);
        this.segmentation = SegmentationFactory.getSegmentation(sa);
    }catch(Exception e){
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #5
Source File: ChineseWordAnalyzer.java    From word with Apache License 2.0
public ChineseWordAnalyzer(String segmentationAlgorithm) {
    try{
        SegmentationAlgorithm sa = SegmentationAlgorithm.valueOf(segmentationAlgorithm);
        this.segmentation = SegmentationFactory.getSegmentation(sa);
    }catch(Exception e){
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #6
Source File: SynonymTagging.java    From word with Apache License 2.0
public static void main(String[] args) {
    List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg("楚离陌千方百计为无情找回记忆");
    System.out.println(words);
    SynonymTagging.process(words);
    System.out.println(words);
    SynonymTagging.process(words, false);
    System.out.println(words);
    words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg("手劲大的老人往往更长寿");
    System.out.println(words);
    SynonymTagging.process(words);
    System.out.println(words);
    SynonymTagging.process(words, false);
    System.out.println(words);
}
 
Example #7
Source File: WordSegmenter.java    From word with Apache License 2.0
/**
 * Segment the text into words and remove stop words,
 * using the MaxNgramScore algorithm
 * @param text the text to segment
 * @return the segmentation result
 */
public static List<Word> seg(String text){
    List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore).seg(text);
    //filter out stop words
    StopWord.filterStopWords(words);
    return words;
}
 
Example #8
Source File: WordTokenizerTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Override
protected NlpTokenizer<? extends NlpToken> getTokenizer() {
    // keep punctuation
    WordConfTools.set("keep.punctuation", "true");
    // keep whitespace
    WordConfTools.set("keep.whitespace", "true");
    Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    return new WordTokenizer(segmentation);
}
 
Example #9
Source File: WordSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0
@Override
public Segmentation build(Map<String, String> configurations) {
    for (Entry<String, String> keyValue : configurations.entrySet()) {
        String key = keyValue.getKey();
        String value = keyValue.getValue();
        WordConfTools.set(key, value);
    }

    String algorithm = get(configurations, "algorithm", "FullSegmentation");
    Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(algorithm));
    return segmentation;
}
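A hedged usage sketch for this factory: every entry in the map is forwarded to WordConfTools.set, and the algorithm key (read through the parent class's get helper, with FullSegmentation as the default) selects the SegmentationAlgorithm. How a WordSegmentFactory instance is obtained is not shown in the example, so the constructor call below is an assumption.

Map<String, String> configurations = new HashMap<>();
configurations.put("keep.punctuation", "true");             // forwarded to WordConfTools.set
configurations.put("algorithm", "BidirectionalMaximumMatching");
WordSegmentFactory factory = new WordSegmentFactory();      // assumes a no-argument constructor
Segmentation segmentation = factory.build(configurations);
List<Word> words = segmentation.seg("5月初有哪些电影值得观看");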
 
Example #10
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0
public WordTokenizer(String segmentationAlgorithm) {
    try {
        SegmentationAlgorithm sa = SegmentationAlgorithm.valueOf(segmentationAlgorithm);
        this.segmentation = SegmentationFactory.getSegmentation(sa);
    } catch (Exception e) {
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #11
Source File: WordTokenizerFactory.java    From jstarcraft-nlp with Apache License 2.0
public WordTokenizerFactory(Map<String, String> configuration) {
    super(configuration);
    if (configuration != null) {
        String conf = configuration.get("conf");
        if (conf != null) {
            // force-override the default configuration
            WordConfTools.forceOverride(conf);
        } else {
            LOGGER.info("没有指定conf参数");
        }
        String algorithm = configuration.get("segAlgorithm");
        if (algorithm != null) {
            try {
                SegmentationAlgorithm segmentationAlgorithm = SegmentationAlgorithm.valueOf(algorithm);
                segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
                LOGGER.info("使用指定分词算法:" + algorithm);
            } catch (Exception e) {
                LOGGER.error("参数segAlgorithm指定的值错误:" + algorithm);
                LOGGER.error("参数segAlgorithm可指定的值有:");
                for (SegmentationAlgorithm sa : SegmentationAlgorithm.values()) {
                    LOGGER.error("\t" + sa.name());
                }
            }
        } else {
            LOGGER.info("没有指定segAlgorithm参数");
        }
    }
    if (segmentation == null) {
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
        LOGGER.info("使用默认分词算法:" + SegmentationAlgorithm.BidirectionalMaximumMatching);
    }
}
 
Example #12
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0
public WordAnalyzer(String segmentationAlgorithm) {
    try {
        SegmentationAlgorithm sa = SegmentationAlgorithm.valueOf(segmentationAlgorithm);
        this.segmentation = SegmentationFactory.getSegmentation(sa);
    } catch (Exception e) {
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #13
Source File: WordSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0
@Override
protected Tokenizer getSegmenter() {
    // these settings can also be configured in word.local.conf
    // keep punctuation
    WordConfTools.set("keep.punctuation", "true");
    // keep whitespace
    WordConfTools.set("keep.whitespace", "true");
    Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    WordTokenizer tokenizer = new WordTokenizer(segmentation);
    return tokenizer;
}
 
Example #14
Source File: ChineseWordAnalyzer.java    From word with Apache License 2.0
public ChineseWordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #15
Source File: Utils.java    From word with Apache License 2.0
/**
 * Segment a file into words
 * @param input the input file
 * @param output the output file
 * @param removeStopWords whether to remove stop words
 * @param segmentationAlgorithm the segmentation algorithm to use
 * @param fileSegmentationCallback callback invoked for every segmented word
 * @throws Exception
 */
public static void seg(File input, File output, boolean removeStopWords, SegmentationAlgorithm segmentationAlgorithm, FileSegmentationCallback fileSegmentationCallback) throws Exception{
    LOGGER.info("开始对文件进行分词:"+input.toString());
    Segmentation segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    float max=(float)Runtime.getRuntime().maxMemory()/1000000;
    float total=(float)Runtime.getRuntime().totalMemory()/1000000;
    float free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String pre="执行之前剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    //make sure the output directory exists
    if(!output.getParentFile().exists()){
        output.getParentFile().mkdirs();
    }
    try(BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(input),"utf-8"));
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output),"utf-8"))){
        long size = Files.size(input.toPath());
        LOGGER.info("size:"+size);
        LOGGER.info("文件大小:"+(float)size/1024/1024+" MB");
        int textLength=0;
        int progress=0;
        long start = System.currentTimeMillis();
        String line = null;
        while((line = reader.readLine()) != null){
            if("".equals(line.trim())){
                writer.write("\n");
                continue;
            }
            textLength += line.length();
            List<Word> words = segmentation.seg(line);
            if(words == null){
                continue;
            }
            if(removeStopWords){
                //filter out stop words
                StopWord.filterStopWords(words);
            }
            for(Word word : words){
                if(fileSegmentationCallback != null) {
                    fileSegmentationCallback.callback(word);
                }
                writer.write(word.getText()+" ");
            }
            writer.write("\n");
            progress += line.length();
            if( progress > 500000){
                progress = 0;
                LOGGER.info("分词进度:"+(int)((float)textLength*2/size*100)+"%");
            }
        }
        long cost = System.currentTimeMillis() - start;
        float rate = (float)textLength/cost;
        LOGGER.info("字符数目:"+textLength);
        LOGGER.info("分词耗时:"+getTimeDes(cost)+" 毫秒");
        LOGGER.info("分词速度:"+rate+" 字符/毫秒");
    }
    max=(float)Runtime.getRuntime().maxMemory()/1000000;
    total=(float)Runtime.getRuntime().totalMemory()/1000000;
    free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String post="执行之后剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    LOGGER.info(pre);
    LOGGER.info(post);
    LOGGER.info("将文件 "+input.toString()+" 的分词结果保存到文件 "+output);
}
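A hedged example of calling this utility. The file names are placeholders, and the lambda assumes FileSegmentationCallback is a single-method interface whose callback(Word) is the method invoked in the loop above.

public static void main(String[] args) throws Exception {
    File input = new File("input.txt");    // placeholder input file, one sentence per line
    File output = new File("output.txt");  // placeholder output file for the space-separated words
    Utils.seg(input, output, true, SegmentationAlgorithm.MaxNgramScore,
            word -> System.out.println(word.getText()));
}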
 
Example #16
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0
public ChineseWordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #17
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0
public WordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #18
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0
public ChineseWordTokenizer() {
    segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
 
Example #19
Source File: ChineseWordAnalyzer.java    From word with Apache License 2.0
public ChineseWordAnalyzer(){
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
 
Example #20
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0
public WordAnalyzer() {
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
 
Example #21
Source File: TextSimilarity.java    From word with Apache License 2.0
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm){
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    LOGGER.info("设置分词算法为:"+segmentationAlgorithm.getDes());
}
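A hedged sketch of how this setter might be used. CosineTextSimilarity as a concrete TextSimilarity implementation and the similarScore method are assumptions drawn from the wider word project, not from the example above.

TextSimilarity similarity = new CosineTextSimilarity();   // assumed implementation class
similarity.setSegmentationAlgorithm(SegmentationAlgorithm.MaxNgramScore);
// similarScore is assumed to return a similarity value for the two texts
double score = similarity.similarScore("手劲大的老人往往更长寿", "手劲大的老年人一般更长寿");
System.out.println(score);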
 
Example #22
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0
public WordTokenizer() {
    segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
 
Example #23
Source File: PinyinTagging.java    From word with Apache License 2.0
public static void main(String[] args) {
    List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg("《速度与激情7》的中国内地票房自4月12日上映以来,在短短两周内突破20亿人民币");
    System.out.println(words);
    PinyinTagging.process(words);
    System.out.println(words);
}
 
Example #24
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0
public WordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #25
Source File: WordSegmenter.java    From word with Apache License 2.0
/**
 * Segment the text into words and remove stop words;
 * a segmentation algorithm can be specified
 * @param text the text to segment
 * @param segmentationAlgorithm the segmentation algorithm to use
 * @return the segmentation result
 */
public static List<Word> seg(String text, SegmentationAlgorithm segmentationAlgorithm){        
    List<Word> words = SegmentationFactory.getSegmentation(segmentationAlgorithm).seg(text);
    //filter out stop words
    StopWord.filterStopWords(words);
    return words;
}
 
Example #26
Source File: WordSegmenter.java    From word with Apache License 2.0
/**
 * Segment the text into words, keeping stop words;
 * a segmentation algorithm can be specified
 * @param text the text to segment
 * @param segmentationAlgorithm the segmentation algorithm to use
 * @return the segmentation result
 */
public static List<Word> segWithStopWords(String text, SegmentationAlgorithm segmentationAlgorithm){
    return SegmentationFactory.getSegmentation(segmentationAlgorithm).seg(text);
}
 
Example #27
Source File: WordSegmenter.java    From word with Apache License 2.0
/**
 * Segment the text into words, keeping stop words,
 * using the MaxNgramScore algorithm
 * @param text the text to segment
 * @return the segmentation result
 */
public static List<Word> segWithStopWords(String text){
    return SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore).seg(text);
}
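Taken together, Examples #7 and #25 through #27 give WordSegmenter four static entry points. The short fragment below shows how they relate; the sample sentence is reused from Example #6.

String text = "楚离陌千方百计为无情找回记忆";
// default algorithm, stop words removed (Example #7)
List<Word> words1 = WordSegmenter.seg(text);
// default algorithm, stop words kept (Example #27)
List<Word> words2 = WordSegmenter.segWithStopWords(text);
// explicit algorithm, stop words removed (Example #25)
List<Word> words3 = WordSegmenter.seg(text, SegmentationAlgorithm.BidirectionalMaximumMatching);
// explicit algorithm, stop words kept (Example #26)
List<Word> words4 = WordSegmenter.segWithStopWords(text, SegmentationAlgorithm.BidirectionalMaximumMatching);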
 
Example #28
Source File: WordFrequencyStatistics.java    From word with Apache License 2.0
/**
 * Constructor
 * @param resultPath path where the word frequency statistics are saved
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public WordFrequencyStatistics(String resultPath, SegmentationAlgorithm segmentationAlgorithm){
    this.resultPath = resultPath;
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #29
Source File: WordFrequencyStatistics.java    From word with Apache License 2.0
/**
 * Constructor
 * @param resultPath path where the word frequency statistics are saved
 * @param segmentationAlgorithm the segmentation algorithm name; must be one of the values defined in org.apdplat.word.segmentation.SegmentationAlgorithm
 */
public WordFrequencyStatistics(String resultPath, String segmentationAlgorithm){
    this.resultPath = resultPath;
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
}
 
Example #30
Source File: WordFrequencyStatistics.java    From word with Apache License 2.0
/**
 * Set the segmentation algorithm
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public void setSegmentationAlgorithm(SegmentationAlgorithm segmentationAlgorithm) {
    this.segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
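A hedged end-to-end sketch for this class, built on the constructors shown above. The seg and dump calls are assumptions about the rest of the class's API (accumulate counts, then write them to resultPath) and may not match the actual method names; the result file name is only a placeholder.

WordFrequencyStatistics statistics =
        new WordFrequencyStatistics("word-frequency.txt", SegmentationAlgorithm.MaxNgramScore);
statistics.seg("楚离陌千方百计为无情找回记忆");  // assumed: segments the text and accumulates word counts
statistics.dump();                                // assumed: writes the statistics to resultPath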