Java Code Examples for org.apdplat.word.WordSegmenter#seg()

The following examples show how to use org.apdplat.word.WordSegmenter#seg(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: GenericWebPageSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Scores how similar two web pages are.
 *
 * Each page's title and content are joined with a newline and segmented into
 * words, word frequencies are counted, and the two frequency maps are handed
 * to {@code simpleScore}.
 *
 * @param webPage1 first page to compare
 * @param webPage2 second page to compare
 * @return the value produced by {@code simpleScore} for the two frequency maps
 */
private double score(WebPage webPage1, WebPage webPage2){
    // Word segmentation
    String firstText = webPage1.getTitle() + "\n" + webPage1.getContent();
    List<Word> firstWords = WordSegmenter.seg(firstText);
    String secondText = webPage2.getTitle() + "\n" + webPage2.getContent();
    List<Word> secondWords = WordSegmenter.seg(secondText);

    // Word-frequency statistics
    Map<Word, AtomicInteger> firstFrequencies = frequence(firstWords);
    Map<Word, AtomicInteger> secondFrequencies = frequence(secondWords);

    // Emit detailed information only when debug logging is on
    if(LOGGER.isDebugEnabled()){
        showDetail(webPage1, firstWords, firstFrequencies);
        showDetail(webPage2, secondWords, secondFrequencies);
    }

    // Judge similarity by simple shared words.
    // (Alternative noted by the original author: cosine similarity via cosScore.)
    double similarity = simpleScore(firstFrequencies, secondFrequencies);
    return similarity;
}
 
Example 2
Source File: ITEYEBlogSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Scores how similar two blogs are.
 *
 * Each blog's title and content are joined with a newline and segmented into
 * words, word frequencies are counted, and the two frequency maps are handed
 * to {@code simpleScore}.
 *
 * @param blog1 first blog to compare
 * @param blog2 second blog to compare
 * @return the value produced by {@code simpleScore} for the two frequency maps
 */
private double score(Blog blog1, Blog blog2){
    // Word segmentation
    String firstText = blog1.getTitle() + "\n" + blog1.getContent();
    List<Word> firstWords = WordSegmenter.seg(firstText);
    String secondText = blog2.getTitle() + "\n" + blog2.getContent();
    List<Word> secondWords = WordSegmenter.seg(secondText);

    // Word-frequency statistics
    Map<Word, AtomicInteger> firstFrequencies = frequence(firstWords);
    Map<Word, AtomicInteger> secondFrequencies = frequence(secondWords);

    // Emit detailed information only when debug logging is on
    if(LOGGER.isDebugEnabled()){
        showDetail(blog1, firstWords, firstFrequencies);
        showDetail(blog2, secondWords, secondFrequencies);
    }

    // Judge similarity by simple shared words.
    // (Alternative noted by the original author: cosine similarity via cosScore.)
    double similarity = simpleScore(firstFrequencies, secondFrequencies);
    return similarity;
}
 
Example 3
Source File: wordTest.java    From Doctor with Apache License 2.0 5 votes vote down vote up
/**
 * Segments a sample sentence with the MaximumMatching algorithm after
 * pointing the segmenter at a custom dictionary, and prints the result.
 */
@Test
public void fenci() {
    final String text = "有低热乏力倦怠体重下降";

    // Set the dictionary
    WordConfTools.set("dic.path", "classpath:wordLink.txt");
    // Set the maximum number of characters that may form one word
    WordConfTools.set("intercept.length", "10");
    // The dictionary path was changed, so reload the dictionary
    DictionaryFactory.reload();

    final List<Word> segmented =
            WordSegmenter.seg(text, SegmentationAlgorithm.MaximumMatching);
    System.out.println(segmented);
}
 
Example 4
Source File: TextSearcher.java    From jsearch with Apache License 2.0 5 votes vote down vote up
/**
 * Searches for documents matching a keyword and returns them ordered by score.
 *
 * The keyword is segmented into words with the configured segmentation
 * algorithm. The documents for the first word seed the result set; the
 * documents for each further word are combined into it by intersection when
 * {@code searchMode} is {@code SearchMode.INTERSECTION}, otherwise by union.
 * Every remaining document is then scored and the list is sorted by score,
 * highest first.
 *
 * @param keyword    the text to search for
 * @param searchMode how to combine the per-word results of a multi-word query
 * @return the matching documents sorted by descending score; empty when
 *         segmentation yields no words
 */
public List<Doc> hit(String keyword, SearchMode searchMode){
    long start = System.currentTimeMillis();
    LOGGER.info("search keyword: " + keyword);
    List<Word> words = WordSegmenter.seg(keyword, segmentationAlgorithm);
    LOGGER.info("seg result: "+words);
    // The term texts are identical for every document, so build the list once
    // here instead of re-collecting it inside the per-document scoring lambda.
    // NOTE(review): assumes score.score only reads the list it is given — confirm.
    List<String> terms = words.stream()
            .map(Word::getText)
            .collect(Collectors.toList());
    // Documents matched by the query
    Set<Doc> result = new ConcurrentSkipListSet<>();
    if(!terms.isEmpty()){
        // The first term seeds the result set (this alone is a single-word query)
        result.addAll(term(terms.get(0)));
        // Each further term narrows or widens the set according to the search mode
        for(int i=1; i<terms.size(); i++){
            if(searchMode==SearchMode.INTERSECTION) {
                SearchMode.intersection(result, term(terms.get(i)));
            }else {
                SearchMode.union(result, term(terms.get(i)));
            }
        }
    }
    // Score each document, then sort by score
    List<Doc> finalResult = result.parallelStream()
            // Scoring: stores the score on the document itself
            .map(doc -> {
                doc.setScore(score.score(doc, terms));
                return doc;
            })
            // Sorting: highest score first
            .sorted((a, b) -> b.getScore().compareTo(a.getScore()))
            .collect(Collectors.toList());
    long cost = System.currentTimeMillis()-start;
    LOGGER.info("hit count: "+result.size());
    LOGGER.info("query index cost: "+cost+" ms");
    return finalResult;
}
 
Example 5
Source File: WordParser.java    From QuestionAnsweringSystem with Apache License 2.0 5 votes vote down vote up
/**
 * Analysis method with part-of-speech tagging (including fine-grained
 * part-of-speech tagging).
 *
 * The text is segmented with the MaxNgramScore algorithm and the resulting
 * words are then passed to {@code PartOfSpeechTagging.process}.
 *
 * @param str the text to segment
 * @return the segmentation result
 */
public static List<Word> parseWithoutStopWords(String str) {
    final List<Word> segmented =
            WordSegmenter.seg(str, SegmentationAlgorithm.MaxNgramScore);
    // Part-of-speech tagging, applied to the segmented list
    PartOfSpeechTagging.process(segmented);
    return segmented;
}