org.apdplat.word.segmentation.SegmentationAlgorithm Java Examples

The following examples show how to use org.apdplat.word.segmentation.SegmentationAlgorithm. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TextSimilarity.java    From word with Apache License 2.0 6 votes vote down vote up
/**
 * Segments the given text into words.
 * <p>
 * A {@code null} text yields an empty list. The {@code segmentation} member is
 * created on first use with the {@code MaxNgramScore} algorithm, and stop words
 * are filtered out of the result when {@code filterStopWord} is set.
 *
 * @param text the text to segment
 * @return the segmentation result
 */
private List<Word> seg(String text){
    if(text == null){
        return Collections.emptyList();
    }
    // Lazy initialisation: only build the segmenter when it is first needed.
    if(segmentation == null){
        segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore);
    }
    List<Word> segmented = segmentation.seg(text);
    // Stop-word filtering is optional and works on the segmented list itself.
    if(filterStopWord) {
        StopWord.filterStopWords(segmented);
    }
    return segmented;
}
 
Example #2
Source File: ChineseWordTokenizerFactory.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the factory from the supplied arguments.
 * <p>
 * Recognised keys: {@code conf} (passed to {@code WordConfTools.forceOverride})
 * and {@code segAlgorithm} (the name of a {@link SegmentationAlgorithm} constant).
 * When no usable algorithm is supplied, {@code BidirectionalMaximumMatching} is used.
 *
 * @param args the factory arguments, may be {@code null}
 */
public ChineseWordTokenizerFactory(Map<String, String> args){
    super(args);
    if(args != null){
        String confPath = args.get("conf");
        if(confPath == null){
            LOGGER.info("没有指定conf参数");
        }else{
            // Force-override the default configuration.
            WordConfTools.forceOverride(confPath);
        }
        String algorithmName = args.get("segAlgorithm");
        if(algorithmName == null){
            LOGGER.info("没有指定segAlgorithm参数");
        }else{
            try{
                segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(algorithmName));
                LOGGER.info("使用指定分词算法:"+algorithmName);
            }catch(Exception e){
                // Report the bad value and list every accepted algorithm name;
                // the default below is then applied because segmentation is still unset.
                LOGGER.error("参数segAlgorithm指定的值错误:"+algorithmName);
                LOGGER.error("参数segAlgorithm可指定的值有:");
                for(SegmentationAlgorithm candidate : SegmentationAlgorithm.values()){
                    LOGGER.error("\t"+candidate.name());
                }
            }
        }
    }
    if(segmentation == null){
        SegmentationAlgorithm fallback = SegmentationAlgorithm.BidirectionalMaximumMatching;
        segmentation = SegmentationFactory.getSegmentation(fallback);
        LOGGER.info("使用默认分词算法:"+fallback);
    }
}
 
Example #3
Source File: WordSegmenterTestCase.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the tokenizer under test, backed by the {@code FullSegmentation} algorithm.
 * <p>
 * The two settings applied here could alternatively be placed in word.local.conf.
 *
 * @return a {@code WordTokenizer} wrapping the full-segmentation segmenter
 */
@Override
protected Tokenizer getSegmenter() {
    // Keep punctuation in the output.
    WordConfTools.set("keep.punctuation", "true");
    // Keep whitespace in the output.
    WordConfTools.set("keep.whitespace", "true");
    Segmentation fullSegmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    return new WordTokenizer(fullSegmentation);
}
 
Example #4
Source File: WordTokenizerFactory.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the factory from the supplied configuration.
 * <p>
 * Recognised keys: {@code conf} (passed to {@code WordConfTools.forceOverride})
 * and {@code segAlgorithm} (the name of a {@link SegmentationAlgorithm} constant).
 * When no usable algorithm is supplied, {@code BidirectionalMaximumMatching} is used.
 *
 * @param configuration the factory configuration, may be {@code null}
 */
public WordTokenizerFactory(Map<String, String> configuration) {
    super(configuration);
    if (configuration != null) {
        String confPath = configuration.get("conf");
        if (confPath == null) {
            LOGGER.info("没有指定conf参数");
        } else {
            // Force-override the default configuration.
            WordConfTools.forceOverride(confPath);
        }
        String algorithmName = configuration.get("segAlgorithm");
        if (algorithmName == null) {
            LOGGER.info("没有指定segAlgorithm参数");
        } else {
            try {
                segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(algorithmName));
                LOGGER.info("使用指定分词算法:" + algorithmName);
            } catch (Exception e) {
                // Report the bad value and list every accepted algorithm name;
                // the default below is then applied because segmentation is still unset.
                LOGGER.error("参数segAlgorithm指定的值错误:" + algorithmName);
                LOGGER.error("参数segAlgorithm可指定的值有:");
                for (SegmentationAlgorithm candidate : SegmentationAlgorithm.values()) {
                    LOGGER.error("\t" + candidate.name());
                }
            }
        }
    }
    if (segmentation == null) {
        SegmentationAlgorithm fallback = SegmentationAlgorithm.BidirectionalMaximumMatching;
        segmentation = SegmentationFactory.getSegmentation(fallback);
        LOGGER.info("使用默认分词算法:" + fallback);
    }
}
 
Example #5
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an analyzer for the segmentation algorithm with the given name.
 * <p>
 * If the name cannot be resolved, or the segmenter cannot be obtained,
 * {@code BidirectionalMinimumMatching} is used instead.
 *
 * @param segmentationAlgorithm the name of a {@link SegmentationAlgorithm} constant
 */
public WordAnalyzer(String segmentationAlgorithm) {
    try {
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Deliberate best-effort: any failure falls back to the default algorithm.
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #6
Source File: WordTokenizerTestCase.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the tokenizer under test, backed by the {@code FullSegmentation} algorithm.
 *
 * @return a {@code WordTokenizer} wrapping the full-segmentation segmenter
 */
@Override
protected NlpTokenizer<? extends NlpToken> getTokenizer() {
    // Keep punctuation in the output.
    WordConfTools.set("keep.punctuation", "true");
    // Keep whitespace in the output.
    WordConfTools.set("keep.whitespace", "true");
    Segmentation fullSegmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.FullSegmentation);
    WordTokenizer tokenizer = new WordTokenizer(fullSegmentation);
    return tokenizer;
}
 
Example #7
Source File: WordSegmentFactory.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a segmenter from the given configuration.
 * <p>
 * Every entry is first pushed into {@code WordConfTools}; the algorithm is then
 * read from the {@code algorithm} key, with {@code FullSegmentation} passed as
 * the default to the lookup helper.
 *
 * @param configurations the configuration entries
 * @return the segmenter for the selected algorithm
 */
@Override
public Segmentation build(Map<String, String> configurations) {
    for (Entry<String, String> setting : configurations.entrySet()) {
        WordConfTools.set(setting.getKey(), setting.getValue());
    }

    String algorithmName = get(configurations, "algorithm", "FullSegmentation");
    SegmentationAlgorithm selected = SegmentationAlgorithm.valueOf(algorithmName);
    return SegmentationFactory.getSegmentation(selected);
}
 
Example #8
Source File: WordAnalyzer.java    From hugegraph with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an analyzer for the segmentation algorithm named by {@code mode}.
 *
 * @param mode the name of a {@link SegmentationAlgorithm} constant
 * @throws ConfigException if {@code mode} cannot be resolved to an algorithm
 */
public WordAnalyzer(String mode) {
    try {
        this.algorithm = SegmentationAlgorithm.valueOf(mode);
    } catch (Exception cause) {
        // Surface the offending value together with the supported ones.
        throw new ConfigException(
                  "Unsupported segment mode '%s' for word analyzer, " +
                  "the available values are %s",
                  cause, mode, SUPPORT_MODES);
    }
}
 
Example #9
Source File: wordTest.java    From Doctor with Apache License 2.0 5 votes vote down vote up
/**
 * Segments a sample sentence with the {@code MaximumMatching} algorithm
 * using a custom dictionary, and prints the result.
 */
@Test
public void fenci() {
    String sentence="有低热乏力倦怠体重下降";
    // Point the segmenter at the custom dictionary.
    WordConfTools.set("dic.path", "classpath:wordLink.txt");
    // Maximum number of characters that may form a single word.
    WordConfTools.set("intercept.length","10");
    // The dictionary must be reloaded after its path has been changed.
    DictionaryFactory.reload();
    List<Word> segmented = WordSegmenter.seg(sentence, SegmentationAlgorithm.MaximumMatching);
    System.out.println(segmented);
}
 
Example #10
Source File: wordTest.java    From Doctor with Apache License 2.0 5 votes vote down vote up
/**
 * Runs word-frequency statistics over {@code s} with the {@code MaxNgramScore}
 * algorithm and dumps the result.
 */
@Test
public void cipin(){
    // Configure the word-frequency statistics.
    WordFrequencyStatistics statistics = new WordFrequencyStatistics();
    statistics.setRemoveStopWord(true);
    statistics.setResultPath("util-frequency-statistics.txt");
    statistics.setSegmentationAlgorithm(SegmentationAlgorithm.MaxNgramScore);
    // Segment the text.
    statistics.seg(s);
    // Output the word-frequency result.
    statistics.dump();
}
 
Example #11
Source File: TextSearcher.java    From jsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a searcher, stores the given settings and then calls {@code init()}.
 *
 * @param index the value stored in the {@code index} field
 * @param indexText the value stored in the {@code indexText} field
 * @param pageSize the value stored in the {@code pageSize} field
 * @param segmentationAlgorithm the segmentation algorithm to use
 */
public TextSearcher(String index, String indexText, int pageSize, SegmentationAlgorithm segmentationAlgorithm){
    this.index = index;
    this.indexText = indexText;
    this.pageSize = pageSize;
    this.segmentationAlgorithm = segmentationAlgorithm;
    // All fields are assigned before init() runs.
    init();
}
 
Example #12
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer for the segmentation algorithm with the given name.
 * <p>
 * If the name cannot be resolved, or the segmenter cannot be obtained,
 * {@code BidirectionalMinimumMatching} is used instead.
 *
 * @param segmentationAlgorithm the name of a {@link SegmentationAlgorithm} constant
 */
public WordTokenizer(String segmentationAlgorithm) {
    try {
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    } catch (Exception ignored) {
        // Deliberate best-effort: any failure falls back to the default algorithm.
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #13
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer for the segmentation algorithm with the given name.
 * <p>
 * If the name cannot be resolved, or the segmenter cannot be obtained,
 * {@code BidirectionalMinimumMatching} is used instead.
 *
 * @param segmentationAlgorithm the name of a {@link SegmentationAlgorithm} constant
 */
public ChineseWordTokenizer(String segmentationAlgorithm) {
    try{
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    }catch(Exception ignored){
        // Deliberate best-effort: any failure falls back to the default algorithm.
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #14
Source File: ChineseWordAnalyzer.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an analyzer for the segmentation algorithm with the given name.
 * <p>
 * If the name cannot be resolved, or the segmenter cannot be obtained,
 * {@code BidirectionalMinimumMatching} is used instead.
 *
 * @param segmentationAlgorithm the name of a {@link SegmentationAlgorithm} constant
 */
public ChineseWordAnalyzer(String segmentationAlgorithm) {
    try{
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.valueOf(segmentationAlgorithm));
    }catch(Exception ignored){
        // Deliberate best-effort: any failure falls back to the default algorithm.
        this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
    }
}
 
Example #15
Source File: SynonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: segments two sample sentences with {@code BidirectionalMaximumMatching}
 * and prints the words before tagging, after {@code process(words)} and after
 * {@code process(words, false)}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String[] samples = {
        "楚离陌千方百计为无情找回记忆",
        "手劲大的老人往往更长寿"
    };
    for (String sample : samples) {
        List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg(sample);
        System.out.println(words);
        SynonymTagging.process(words);
        System.out.println(words);
        SynonymTagging.process(words, false);
        System.out.println(words);
    }
}
 
Example #16
Source File: AntonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: segments two sample sentences with {@code BidirectionalMaximumMatching}
 * and prints the words before and after antonym tagging.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String[] samples = {
        "5月初有哪些电影值得观看",
        "由于工作不到位、服务不完善导致顾客在用餐时发生不愉快的事情,餐厅方面应该向顾客作出真诚的道歉,而不是敷衍了事。"
    };
    for (String sample : samples) {
        List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg(sample);
        System.out.println(words);
        AntonymTagging.process(words);
        System.out.println(words);
    }
}
 
Example #17
Source File: WordSegmenter.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Segments the text and removes stop words from the result.
 * <p>
 * The segmenter is obtained for the {@code MaxNgramScore} algorithm.
 *
 * @param text the text to segment
 * @return the segmentation result after stop-word filtering
 */
public static List<Word> seg(String text){
    List<Word> segmented = SegmentationFactory
            .getSegmentation(SegmentationAlgorithm.MaxNgramScore)
            .seg(text);
    // Stop-word filtering works on the segmented list itself.
    StopWord.filterStopWords(segmented);
    return segmented;
}
 
Example #18
Source File: WordParser.java    From QuestionAnsweringSystem with Apache License 2.0 5 votes vote down vote up
/**
 * Segments the text with the {@code MaxNgramScore} algorithm and applies
 * part-of-speech tagging (including fine-grained tags) to the resulting words.
 *
 * @param str the text to segment
 * @return the tagged segmentation result
 */
public static List<Word> parseWithoutStopWords(String str) {
    List<Word> tagged = WordSegmenter.seg(str, SegmentationAlgorithm.MaxNgramScore);
    // Part-of-speech tagging is applied to the list in place.
    PartOfSpeechTagging.process(tagged);
    return tagged;
}
 
Example #19
Source File: DefinitionSimilarRule.java    From superword with Apache License 2.0 5 votes vote down vote up
/**
 * Ranks the stored definitions of the given words by their similarity to
 * {@code wordDefinition} and converts the best hits into {@code Result} objects.
 * <p>
 * The segmentation algorithm is chosen per dictionary: {@code PureEnglish} for
 * OXFORD/WEBSTER and {@code MaxNgramScore} for ICIBA/YOUDAO.
 * <p>
 * Each hit text is split at the first underscore into a word and its definition
 * (presumably the storage format is {@code word_definition} — verify against
 * {@code MySQLUtils.getAllWordDefinition}). Underscores inside the definition
 * are preserved, and a hit without any underscore yields an empty definition.
 *
 * @param dictionary the dictionary whose definitions are compared
 * @param words the words whose definitions are loaded
 * @param wordDefinition the definition to compare against
 * @param count the count passed to the similarity ranking
 * @return one result per hit, in the order returned by the ranking
 */
public static List<Result> run(Dictionary dictionary, Set<Word> words, String wordDefinition, int count) {
    List<String> allWordDefinition = MySQLUtils.getAllWordDefinition(dictionary.name(), words);

    TextSimilarity textSimilarity = new CosineTextSimilarity();

    if (dictionary == Dictionary.OXFORD || dictionary == Dictionary.WEBSTER) {
        textSimilarity.setSegmentationAlgorithm(SegmentationAlgorithm.PureEnglish);
    }
    if (dictionary == Dictionary.ICIBA || dictionary == Dictionary.YOUDAO) {
        textSimilarity.setSegmentationAlgorithm(SegmentationAlgorithm.MaxNgramScore);
    }

    List<Result> results = new ArrayList<>();

    for (Hit hit : textSimilarity.rank(wordDefinition, allWordDefinition, count).getHits()) {
        // Split only at the first underscore so the definition keeps its own
        // underscores exactly as stored, instead of being re-joined incorrectly.
        String[] attrs = hit.getText().split("_", 2);
        String word = attrs[0];
        // Guard against a hit that carries no definition part at all.
        String definition = attrs.length > 1 ? attrs[1] : "";

        Result result = new Result();
        result.setWord(word);
        result.setDefinition(definition);
        result.setUrl(WordLinker.toLink(word));
        result.setScore(hit.getScore());

        results.add(result);
    }
    return results;
}
 
Example #20
Source File: PinyinTagging.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Demo: segments a sample sentence with {@code BidirectionalMaximumMatching}
 * and prints the words before and after pinyin tagging.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String sample = "《速度与激情7》的中国内地票房自4月12日上映以来,在短短两周内突破20亿人民币";
    List<Word> segmented = SegmentationFactory
            .getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching)
            .seg(sample);
    System.out.println(segmented);
    PinyinTagging.process(segmented);
    System.out.println(segmented);
}
 
Example #21
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer that uses the {@code BidirectionalMinimumMatching} algorithm.
 */
public WordTokenizer() {
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
 
Example #22
Source File: WordAnalyzer.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates an analyzer that uses the given segmentation algorithm.
 *
 * @param segmentationAlgorithm the algorithm whose segmenter is obtained from the factory
 */
public WordAnalyzer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #23
Source File: BidirectionalMaximumMatching.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Identifies the algorithm implemented by this segmenter.
 *
 * @return always {@code SegmentationAlgorithm.BidirectionalMaximumMatching}
 */
@Override
public SegmentationAlgorithm getSegmentationAlgorithm() {
    final SegmentationAlgorithm implemented = SegmentationAlgorithm.BidirectionalMaximumMatching;
    return implemented;
}
 
Example #24
Source File: Evaluation.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates every {@link SegmentationAlgorithm} against a manually annotated corpus
 * and logs a sorted report.
 * <p>
 * Steps: extract text from the corpus, generate the test and standard data sets,
 * segment the test set with each algorithm while timing it, evaluate the output
 * against the standard, and finally log all results.
 * All files live under {@code target/evaluation}, which is created (together with
 * any missing parent directory) if necessary.
 *
 * @param args unused
 * @throws Exception if any file operation or evaluation step fails
 */
public static void main(String[] args) throws Exception{
    // Manually annotated, already segmented text; words are separated by spaces.
    String corpusText = "target/evaluation/corpus-text.txt";
    // Test text: corpus-text.txt split into multiple lines at punctuation.
    String testText = "target/evaluation/test-text.txt";
    // Manually annotated text matching the test text; the standard for correctness.
    String standardText = "target/evaluation/standard-text.txt";
    // Prefix of the segmentation result files.
    String resultText = "target/evaluation/result-text-";
    // Prefix of the files holding lines that match the standard exactly.
    String perfectResult = "target/evaluation/perfect-result-";
    // Prefix of the files holding lines that differ from the standard.
    String wrongResult = "target/evaluation/wrong-result-";
    // Evaluation output goes to target/evaluation. createDirectories also creates
    // a missing "target" parent and is a no-op when the directory already exists,
    // whereas createDirectory fails if the parent is absent.
    Path path = Paths.get("target/evaluation");
    Files.createDirectories(path);
    // 1. Extract text.
    ExtractText.extractFromCorpus(corpusText, " ", false);
    // 2. Generate the test data set and the standard data set.
    int textCharCount = generateDataset(corpusText, testText, standardText);
    List<EvaluationResult> result = new ArrayList<>();
    for(SegmentationAlgorithm segmentationAlgorithm : SegmentationAlgorithm.values()){
        String suffix = segmentationAlgorithm.name()+".txt";
        long start = System.currentTimeMillis();
        // 3. Segment the test data set.
        WordSegmenter.segWithStopWords(new File(testText), new File(resultText+suffix), segmentationAlgorithm);
        long cost = System.currentTimeMillis() - start;
        // Characters per millisecond; float division, so a zero cost does not throw.
        float rate = textCharCount/(float)cost;
        // 4. Evaluate the segmentation quality.
        EvaluationResult evaluationResult = evaluation(resultText+suffix, standardText, perfectResult+suffix, wrongResult+suffix);
        evaluationResult.setSegmentationAlgorithm(segmentationAlgorithm);
        evaluationResult.setSegSpeed(rate);
        result.add(evaluationResult);
    }
    // 5. Output the test report.
    LOGGER.info("*************************************************************************************************************");
    Collections.sort(result);
    for(int i=0; i<result.size(); i++){
        LOGGER.info(result.get(i).toString());
        // Blank line between entries, but not after the last one.
        if(i < result.size()-1){
            LOGGER.info("");
        }
    }
    LOGGER.info("*************************************************************************************************************");
}
 
Example #25
Source File: EvaluationResult.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the segmentation algorithm stored in this result.
 *
 * @return the current value of the {@code segmentationAlgorithm} field
 */
public SegmentationAlgorithm getSegmentationAlgorithm() {
    return this.segmentationAlgorithm;
}
 
Example #26
Source File: EvaluationResult.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the segmentation algorithm in this result.
 *
 * @param segmentationAlgorithm the value assigned to the {@code segmentationAlgorithm} field
 */
public void setSegmentationAlgorithm(final SegmentationAlgorithm segmentationAlgorithm) {
    this.segmentationAlgorithm = segmentationAlgorithm;
}
 
Example #27
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer that uses the given segmentation algorithm.
 *
 * @param segmentationAlgorithm the algorithm whose segmenter is obtained from the factory
 */
public ChineseWordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #28
Source File: WordTokenizer.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer that uses the given segmentation algorithm.
 *
 * @param segmentationAlgorithm the algorithm whose segmenter is obtained from the factory
 */
public WordTokenizer(SegmentationAlgorithm segmentationAlgorithm) {
    segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
}
 
Example #29
Source File: ChineseWordTokenizer.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a tokenizer that uses the {@code BidirectionalMinimumMatching} algorithm.
 */
public ChineseWordTokenizer() {
    this.segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMinimumMatching);
}
 
Example #30
Source File: Utils.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Segments a file line by line and writes the result to another file.
 * <p>
 * Both files are handled as UTF-8. Every input line produces one output line whose
 * words are each followed by a single space; blank input lines produce an empty
 * output line. Lines for which the segmenter returns {@code null} are skipped.
 * Progress, throughput and memory figures are logged along the way.
 *
 * @param input the input file
 * @param output the output file; its parent directory is created if missing
 * @param removeStopWords whether stop words are removed from each line's result
 * @param segmentationAlgorithm the segmentation algorithm to use
 * @param fileSegmentationCallback invoked for every written word, may be {@code null}
 * @throws Exception if reading, writing or segmentation fails
 */
public static void seg(File input, File output, boolean removeStopWords, SegmentationAlgorithm segmentationAlgorithm, FileSegmentationCallback fileSegmentationCallback) throws Exception{
    LOGGER.info("开始对文件进行分词:"+input.toString());
    Segmentation segmentation = SegmentationFactory.getSegmentation(segmentationAlgorithm);
    float max=(float)Runtime.getRuntime().maxMemory()/1000000;
    float total=(float)Runtime.getRuntime().totalMemory()/1000000;
    float free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String pre="执行之前剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    // Prepare the output directory. getParentFile() is null for a bare file name,
    // in which case there is nothing to create.
    File outputDir = output.getParentFile();
    if(outputDir != null && !outputDir.exists()){
        outputDir.mkdirs();
    }
    try(BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(input),"utf-8"));
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output),"utf-8"))){
        long size = Files.size(input.toPath());
        LOGGER.info("size:"+size);
        LOGGER.info("文件大小:"+(float)size/1024/1024+" MB");
        // long so that very large inputs cannot overflow the character counter.
        long textLength=0;
        int progress=0;
        long start = System.currentTimeMillis();
        String line = null;
        while((line = reader.readLine()) != null){
            if("".equals(line.trim())){
                writer.write("\n");
                continue;
            }
            textLength += line.length();
            List<Word> words = segmentation.seg(line);
            // Check for a missing result before touching the list in any way.
            if(words == null){
                continue;
            }
            if(removeStopWords){
                // Stop-word filtering.
                StopWord.filterStopWords(words);
            }
            for(Word word : words){
                if(fileSegmentationCallback != null) {
                    fileSegmentationCallback.callback(word);
                }
                writer.write(word.getText()+" ");
            }
            writer.write("\n");
            progress += line.length();
            // Log progress roughly every 500000 characters.
            if( progress > 500000){
                progress = 0;
                // NOTE(review): the factor 2 presumably approximates bytes per
                // character; the percentage is only an estimate — confirm.
                LOGGER.info("分词进度:"+(int)((float)textLength*2/size*100)+"%");
            }
        }
        long cost = System.currentTimeMillis() - start;
        // Floating-point division, with the elapsed time clamped to at least 1 ms:
        // the previous integer division truncated the rate and threw
        // ArithmeticException when the run finished in under a millisecond.
        float rate = textLength/(float)Math.max(cost, 1L);
        LOGGER.info("字符数目:"+textLength);
        LOGGER.info("分词耗时:"+getTimeDes(cost)+" 毫秒");
        LOGGER.info("分词速度:"+rate+" 字符/毫秒");
    }
    max=(float)Runtime.getRuntime().maxMemory()/1000000;
    total=(float)Runtime.getRuntime().totalMemory()/1000000;
    free=(float)Runtime.getRuntime().freeMemory()/1000000;
    String post="执行之后剩余内存:"+max+"-"+total+"+"+free+"="+(max-total+free);
    LOGGER.info(pre);
    LOGGER.info(post);
    LOGGER.info("将文件 "+input.toString()+" 的分词结果保存到文件 "+output);
}