org.apdplat.word.WordSegmenter Java Examples

The following examples show how to use org.apdplat.word.WordSegmenter. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: GenericWebPageSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Scores the similarity of two web pages by segmenting each page's
 * title and content and comparing the resulting word-frequency maps.
 *
 * @param webPage1 first page to compare
 * @param webPage2 second page to compare
 * @return similarity score from the simple shared-word metric
 */
private double score(WebPage webPage1, WebPage webPage2){
    // Segment title + content of each page into words
    List<Word> firstWords = WordSegmenter.seg(webPage1.getTitle() + "\n" + webPage1.getContent());
    List<Word> secondWords = WordSegmenter.seg(webPage2.getTitle() + "\n" + webPage2.getContent());
    // Count occurrences of each word
    Map<Word, AtomicInteger> firstFrequencies = frequence(firstWords);
    Map<Word, AtomicInteger> secondFrequencies = frequence(secondWords);
    // Emit detailed diagnostics only when debug logging is enabled
    if (LOGGER.isDebugEnabled()) {
        showDetail(webPage1, firstWords, firstFrequencies);
        showDetail(webPage2, secondWords, secondFrequencies);
    }
    // Judge similarity via the simple shared-word metric
    return simpleScore(firstFrequencies, secondFrequencies);
    // Alternative: cosine similarity
    //return cosScore(firstFrequencies, secondFrequencies);
}
 
Example #2
Source File: ITEYEBlogSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Scores the similarity of two blog posts by segmenting each post's
 * title and content and comparing the resulting word-frequency maps.
 *
 * @param blog1 first blog post to compare
 * @param blog2 second blog post to compare
 * @return similarity score from the simple shared-word metric
 */
private double score(Blog blog1, Blog blog2){
    // Segment title + content of each post into words
    List<Word> firstWords = WordSegmenter.seg(blog1.getTitle() + "\n" + blog1.getContent());
    List<Word> secondWords = WordSegmenter.seg(blog2.getTitle() + "\n" + blog2.getContent());
    // Count occurrences of each word
    Map<Word, AtomicInteger> firstFrequencies = frequence(firstWords);
    Map<Word, AtomicInteger> secondFrequencies = frequence(secondWords);
    // Emit detailed diagnostics only when debug logging is enabled
    if (LOGGER.isDebugEnabled()) {
        showDetail(blog1, firstWords, firstFrequencies);
        showDetail(blog2, secondWords, secondFrequencies);
    }
    // Judge similarity via the simple shared-word metric
    return simpleScore(firstFrequencies, secondFrequencies);
    // Alternative: cosine similarity
    //return cosScore(firstFrequencies, secondFrequencies);
}
 
Example #3
Source File: WordAnalyzer.java    From hugegraph with Apache License 2.0 5 votes vote down vote up
/**
 * Tokenizes the given text (stop words retained) and returns the
 * distinct token strings as a set preserving first-seen order.
 *
 * @param text text to tokenize
 * @return insertion-ordered set of distinct token strings
 */
@Override
public Set<String> segment(String text) {
    List<Word> words = WordSegmenter.segWithStopWords(text, this.algorithm);
    Set<String> tokens = InsertionOrderUtil.newSet();
    words.forEach(w -> tokens.add(w.getText()));
    return tokens;
}
 
Example #4
Source File: wordTest.java    From Doctor with Apache License 2.0 5 votes vote down vote up
/**
 * Demonstrates reconfiguring the dictionary and then segmenting a
 * short phrase with the maximum-matching algorithm.
 */
@Test
public void fenci() {
    String text = "有低热乏力倦怠体重下降";
    // Point the segmenter at a custom dictionary file
    WordConfTools.set("dic.path", "classpath:wordLink.txt");
    // Cap the maximum number of characters per word at 10
    WordConfTools.set("intercept.length","10");
    // Reload the dictionary after changing its path
    DictionaryFactory.reload();
    List<Word> result = WordSegmenter.seg(text, SegmentationAlgorithm.MaximumMatching);
    System.out.println(result);
}
 
Example #5
Source File: TextSearcher.java    From jsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Searches the index for documents matching the keyword.
 * The keyword is segmented into words; for multi-word queries the
 * per-word hit sets are combined by intersection or union according to
 * {@code searchMode}. Hits are then scored in parallel and returned in
 * descending score order.
 *
 * @param keyword    raw query string
 * @param searchMode INTERSECTION or UNION combination of per-word hits
 * @return matching documents sorted by descending score
 */
public List<Doc> hit(String keyword, SearchMode searchMode){
    long start = System.currentTimeMillis();
    LOGGER.info("search keyword: " + keyword);
    List<Word> words = WordSegmenter.seg(keyword, segmentationAlgorithm);
    LOGGER.info("seg result: "+words);
    // Documents matched so far (thread-safe, sorted)
    Set<Doc> result = new ConcurrentSkipListSet<>();
    if(!words.isEmpty()){
        // Seed with the hits of the first word; for a single-word query
        // the loop below is a no-op and this is the whole result.
        result.addAll(term(words.get(0).getText()));
        // Fold in the remaining words according to the search mode
        for(int i=1; i<words.size(); i++){
            if(searchMode==SearchMode.INTERSECTION) {
                SearchMode.intersection(result, term(words.get(i).getText()));
            }else {
                SearchMode.union(result, term(words.get(i).getText()));
            }
        }
    }
    // Hoisted out of the scoring lambda: the original rebuilt this list
    // once per document inside the parallel stream.
    List<String> queryTerms = words.stream().map(Word::getText).collect(Collectors.toList());
    // Score each hit, then sort by descending score
    List<Doc> finalResult = result.parallelStream()
            .map(doc -> {
                doc.setScore(score.score(doc, queryTerms));
                return doc;
            })
            .sorted((a, b) -> b.getScore().compareTo(a.getScore()))
            .collect(Collectors.toList());
    long cost = System.currentTimeMillis()-start;
    LOGGER.info("hit count: "+result.size());
    LOGGER.info("query index cost: "+cost+" ms");
    return finalResult;
}
 
Example #6
Source File: WordParser.java    From QuestionAnsweringSystem with Apache License 2.0 5 votes vote down vote up
/**
 * Segments the given text and applies part-of-speech tagging
 * (including fine-grained tags) to the result. Uses {@code seg},
 * which — unlike the sibling {@code parse} method's
 * {@code segWithStopWords} — presumably excludes stop words, as the
 * method name indicates.
 *
 * @param str text to segment
 * @return tagged segmentation result
 */
public static List<Word> parseWithoutStopWords(String str) {
    List<Word> words = WordSegmenter.seg(str, SegmentationAlgorithm.MaxNgramScore);
    // Tag each word with its part of speech (in place)
    PartOfSpeechTagging.process(words);
    return words;
}
 
Example #7
Source File: PartOfSpeechTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Demo entry point: segments a sample sentence (stop words kept) and
 * prints it before and after part-of-speech tagging.
 */
public static void main(String[] args) {
    List<Word> segmented = WordSegmenter.segWithStopWords("我爱中国,我爱杨尚川");
    System.out.println("未标注词性:" + segmented);
    // Attach a part-of-speech tag to every word, in place
    PartOfSpeechTagging.process(segmented);
    System.out.println("标注词性:" + segmented);
}
 
Example #8
Source File: WordParser.java    From QuestionAnsweringSystem with Apache License 2.0 4 votes vote down vote up
/**
 * Segments the given text (stop words retained) with the MaxNgramScore
 * algorithm and tags each word with its part of speech.
 *
 * @param str text to segment
 * @return tagged segmentation result, stop words included
 */
public static List<Word> parse(String str) {
    List<Word> tagged = WordSegmenter.segWithStopWords(str, SegmentationAlgorithm.MaxNgramScore);
    // Part-of-speech tagging mutates the word list in place
    PartOfSpeechTagging.process(tagged);
    return tagged;
}
 
Example #9
Source File: Evaluation.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * End-to-end evaluation of every segmentation algorithm: builds test
 * and gold-standard datasets from an annotated corpus, segments the
 * test set with each algorithm, measures speed, scores accuracy, and
 * logs a sorted report. All artifacts live under target/evaluation.
 */
public static void main(String[] args) throws Exception{
    // Human-annotated corpus, words separated by spaces
    String corpusText = "target/evaluation/corpus-text.txt";
    // Test text: corpus-text.txt split into lines at punctuation
    String testText = "target/evaluation/test-text.txt";
    // Gold-standard annotation matching the test text, used to judge correctness
    String standardText = "target/evaluation/standard-text.txt";
    // Prefix for per-algorithm segmentation output files
    String resultText = "target/evaluation/result-text-";
    // Prefix for files holding lines that exactly match the gold standard
    String perfectResult = "target/evaluation/perfect-result-";
    // Prefix for files holding lines that differ from the gold standard
    String wrongResult = "target/evaluation/wrong-result-";
    // All evaluation output goes under target/evaluation
    Path path = Paths.get("target/evaluation");
    if(!Files.exists(path)){
        Files.createDirectory(path);
    }
    // 1. Extract text from the corpus
    ExtractText.extractFromCorpus(corpusText, " ", false);
    // 2. Generate the test dataset and the gold-standard dataset
    int textCharCount = generateDataset(corpusText, testText, standardText);
    List<EvaluationResult> result = new ArrayList<>();
    for(SegmentationAlgorithm segmentationAlgorithm : SegmentationAlgorithm.values()){
        long start = System.currentTimeMillis();
        // 3. Segment the test dataset with this algorithm
        WordSegmenter.segWithStopWords(new File(testText), new File(resultText+segmentationAlgorithm.name()+".txt"), segmentationAlgorithm);
        long cost = System.currentTimeMillis() - start;
        // Segmentation speed in characters per millisecond
        float rate = textCharCount/(float)cost;
        // 4. Evaluate segmentation quality against the gold standard
        EvaluationResult evaluationResult = evaluation(resultText+segmentationAlgorithm.name()+".txt", standardText, perfectResult+segmentationAlgorithm.name()+".txt", wrongResult+segmentationAlgorithm.name()+".txt");
        evaluationResult.setSegmentationAlgorithm(segmentationAlgorithm);
        evaluationResult.setSegSpeed(rate);
        result.add(evaluationResult);
    }
    // 5. Output the test report, best result first
    LOGGER.info("*************************************************************************************************************");
    Collections.sort(result);
    for(int i=0; i<result.size(); i++){
        LOGGER.info(result.get(i).toString());
        if(i < result.size()-1){
            LOGGER.info("");
        }
    }
    LOGGER.info("*************************************************************************************************************");
}
 
Example #10
Source File: SentenceIdentify.java    From word with Apache License 2.0 4 votes vote down vote up
/**
 * Scores how plausible the given sentence is under the bigram model.
 * The sentence is segmented (stop words kept) and the resulting word
 * sequence is evaluated with Bigram.sentenceScore.
 *
 * @param sentence sentence to evaluate
 * @return bigram-based plausibility score
 */
public static float identify(String sentence){
    List<Word> segmented = WordSegmenter.segWithStopWords(sentence);
    System.out.println("随机单词: " + segmented);
    System.out.println("生成句子: " + sentence);
    return Bigram.sentenceScore(segmented);
}