org.apdplat.word.segmentation.Word Java Examples

The following examples show how to use org.apdplat.word.segmentation.Word. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SynonymTagging.java    From word with Apache License 2.0 6 votes vote down vote up
/**
 * Tags each word of a segmentation result with its synonyms, in place.
 *
 * @param words  segmentation result to annotate
 * @param direct true: direct synonym lookup only;
 *               false: also follow indirect (transitive) synonym relations
 */
public static void process(List<Word> words, boolean direct){
    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("对分词结果进行同义标注之前:{}", words);
    }
    // synonym tagging
    for(Word word : words){
        if(direct){
            if(LOGGER.isDebugEnabled()) {
                LOGGER.debug("直接模式");
            }
            processDirectSynonym(word);
        }else{
            if(LOGGER.isDebugEnabled()) {
                // fixed log message: original read "间接接模式" (duplicated 接)
                LOGGER.debug("间接模式");
            }
            processIndirectSynonym(word);
        }
    }
    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("对分词结果进行同义标注之后:{}", words);
    }
}
 
Example #2
Source File: SynonymTagging.java    From word with Apache License 2.0 6 votes vote down vote up
/**
 * Recursively expands {@code allSynonym} with the transitive closure of
 * synonyms reachable from {@code word} via the GENERIC_TRIE lookup table.
 *
 * NOTE(review): the for-loop iterates {@code allSynonym} while the recursive
 * calls add to it — safe only when the set's iterator tolerates concurrent
 * modification (the caller passes a ConcurrentSkipListSet; confirm).
 *
 * @param word       word whose synonyms are looked up
 * @param allSynonym accumulator of all synonyms found so far
 */
private static void indirectSynonym(Word word, Set<Word> allSynonym){
    String[] synonym = GENERIC_TRIE.get(word.getText());
    if(synonym!=null && synonym.length>1){
        int len = allSynonym.size();
        // the word has synonyms
        List<Word> synonymList = toWord(synonym);
        allSynonym.addAll(synonymList);
        // if any new synonym entered the set, keep checking for indirect synonyms
        if(allSynonym.size()>len) {
            // indirect relations: if A~B, A~C, B~D, C~E,
            // then A B C D E form one synonym group
            for (Word item : allSynonym) {
                indirectSynonym(item, allSynonym);
            }
        }
    }
}
 
Example #3
Source File: ITEYEBlogSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Logs a blog's metadata, segmentation result and word-frequency table
 * (sorted by frequency, highest first) at DEBUG level.
 *
 * @param blog         blog being inspected
 * @param blogWords    segmentation result of the blog
 * @param blogWordsFre word-frequency map of the blog
 */
private void showDetail(Blog blog, List<Word> blogWords, Map<Word, AtomicInteger> blogWordsFre){
    LOGGER.debug("博文URL:");
    LOGGER.debug("\t"+blog.getUrl());
    LOGGER.debug("博文标题:");
    LOGGER.debug("\t"+blog.getTitle());
    LOGGER.debug("博文内容:");
    LOGGER.debug("\t"+blog.getContent());
    LOGGER.debug("博文长度:"+blog.getContent().length());
    LOGGER.debug("博文分词结果:");
    LOGGER.debug("\t" + blogWords);
    LOGGER.debug("博文词频统计:");
    // running rank counter for the log lines below
    AtomicInteger rank = new AtomicInteger();
    blogWordsFre.entrySet()
                .stream()
                .sorted((x, y) -> y.getValue().get() - x.getValue().get())
                .forEach(entry -> LOGGER.debug("\t" + rank.incrementAndGet() + "、" + entry.getKey() + "=" + entry.getValue()));
}
 
Example #4
Source File: ITEYEBlogSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Similarity method 1: simple shared-word count.
 * Score = |shared words| / min(|words of blog 1|, |words of blog 2|).
 *
 * @param blog1WordsFre word-frequency map of blog 1
 * @param blog2WordsFre word-frequency map of blog 2
 * @return similarity score in [0, 1]; 0 when either map is empty
 */
private double simpleScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){
    // guard: Math.min(...) would be 0 and the division below would yield NaN
    if (blog1WordsFre.isEmpty() || blog2WordsFre.isEmpty()) {
        return 0;
    }
    // count how many words the two blogs share
    AtomicInteger intersectionLength = new AtomicInteger();
    blog1WordsFre.keySet().forEach(word -> {
        if (blog2WordsFre.containsKey(word)) {
            intersectionLength.incrementAndGet();
        }
    });
    LOGGER.info("网页1有的词数:" + blog1WordsFre.size());
    LOGGER.info("网页2有的词数:" + blog2WordsFre.size());
    LOGGER.info("网页1和2共有的词数:" + intersectionLength.get());
    double score = intersectionLength.get()/(double)Math.min(blog1WordsFre.size(), blog2WordsFre.size());
    LOGGER.info("相似度分值="+intersectionLength.get()+"/(double)Math.min("+blog1WordsFre.size()+", "+blog2WordsFre.size()+")="+score);
    return score;
}
 
Example #5
Source File: GenericWebPageSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Similarity method 1: simple shared-word count.
 * Score = |shared words| / min(|words of page 1|, |words of page 2|).
 *
 * @param webPage1WordsFre word-frequency map of web page 1
 * @param webPage2WordsFre word-frequency map of web page 2
 * @return similarity score in [0, 1]; 0 when either map is empty
 */
private double simpleScore(Map<Word, AtomicInteger> webPage1WordsFre, Map<Word, AtomicInteger> webPage2WordsFre){
    // guard: Math.min(...) would be 0 and the division below would yield NaN
    if (webPage1WordsFre.isEmpty() || webPage2WordsFre.isEmpty()) {
        return 0;
    }
    // count how many words the two pages share
    AtomicInteger intersectionLength = new AtomicInteger();
    webPage1WordsFre.keySet().forEach(word -> {
        if (webPage2WordsFre.containsKey(word)) {
            intersectionLength.incrementAndGet();
        }
    });
    LOGGER.info("网页1有的词数:" + webPage1WordsFre.size());
    LOGGER.info("网页2有的词数:" + webPage2WordsFre.size());
    LOGGER.info("网页1和2共有的词数:" + intersectionLength.get());
    double score = intersectionLength.get()/(double)Math.min(webPage1WordsFre.size(), webPage2WordsFre.size());
    LOGGER.info("相似度分值="+intersectionLength.get()+"/(double)Math.min("+webPage1WordsFre.size()+", "+webPage2WordsFre.size()+")="+score);
    return score;
}
 
Example #6
Source File: GenericWebPageSimilarChecker.java    From rank with Apache License 2.0 6 votes vote down vote up
/**
 * Computes a similarity score between two web pages:
 * segment, count word frequencies, then compare by shared words.
 *
 * @param webPage1 first page
 * @param webPage2 second page
 * @return similarity score
 */
private double score(WebPage webPage1, WebPage webPage2){
    // segment title + content of each page
    List<Word> words1 = WordSegmenter.seg(webPage1.getTitle() + "\n" + webPage1.getContent());
    List<Word> words2 = WordSegmenter.seg(webPage2.getTitle() + "\n" + webPage2.getContent());
    // word-frequency statistics
    Map<Word, AtomicInteger> frequency1 = frequence(words1);
    Map<Word, AtomicInteger> frequency2 = frequence(words2);
    // dump details when debugging
    if (LOGGER.isDebugEnabled()) {
        showDetail(webPage1, words1, frequency1);
        showDetail(webPage2, words2, frequency2);
    }
    // judge by simple shared-word count
    return simpleScore(frequency1, frequency2);
    // alternative: cosine similarity
    //return cosScore(frequency1, frequency2);
}
 
Example #7
Source File: MainPartExtracter.java    From QuestionAnsweringSystem with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the part-of-speech pattern (POS tags joined by "/") for the
 * main-part words of a question.
 *
 * @param question question text to segment
 * @param mainPart space-separated main-part words of the question
 * @return slash-separated POS pattern, trimmed
 */
public String getQuestionMainPartNaturePattern(String question, String mainPart) {
    Map<String, String> map = new HashMap<>();
    // segment the question and record the POS of each word
    List<Word> words = WordParser.parse(question);
    words.forEach(word -> map.put(word.getText(), word.getPartOfSpeech().getPos()));
    StringBuilder patterns = new StringBuilder();
    boolean first = true;
    for (String item : mainPart.split(" ")) {
        if (!first) {
            patterns.append("/");
        }
        first = false;
        // unknown words look up to null, which appends "null" (original behavior)
        patterns.append(map.get(item));
    }
    return patterns.toString().trim();
}
 
Example #8
Source File: EditDistanceTextSimilarity.java    From word with Apache License 2.0 6 votes vote down vote up
/**
 * Computes a similarity score between two word lists using edit distance:
 * score = 1 - editDistance / maxTextLength.
 *
 * @param words1 first word list
 * @param words2 second word list
 * @return similarity score in [0, 1]
 */
@Override
protected double scoreImpl(List<Word> words1, List<Word> words2){
    // concatenate each word list into plain text
    StringBuilder text1 = new StringBuilder();
    for (Word word : words1) {
        text1.append(word.getText());
    }
    StringBuilder text2 = new StringBuilder();
    for (Word word : words2) {
        text2.append(word.getText());
    }
    int maxTextLength = Math.max(text1.length(), text2.length());
    if (maxTextLength == 0) {
        // both texts are empty strings: treat as identical
        return 1.0;
    }
    // edit distance between the two texts
    int editDistance = editDistance(text1.toString(), text2.toString());
    double score = 1 - editDistance / (double) maxTextLength;
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("文本1:"+text1.toString());
        LOGGER.debug("文本2:"+text2.toString());
        LOGGER.debug("文本1和文本2的编辑距离:"+editDistance);
        LOGGER.debug("文本1和文本2的最大长度:"+maxTextLength);
        LOGGER.debug("文本1和文本2的相似度分值:1 - "+editDistance+" / (double)"+maxTextLength+"="+score);
    }
    return score;
}
 
Example #9
Source File: SimHashPlusHammingDistanceTextSimilarity.java    From word with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the SimHash fingerprint of a word list: each word's hash votes
 * (weighted) on every bit; positive totals become '1', negative '0'.
 *
 * @param words word list
 * @return SimHash value as a string of '0'/'1' of length hashBitCount
 */
private String simHash(List<Word> words) {
    float[] hashBit = new float[hashBitCount];
    words.forEach(word -> {
        // words without an explicit weight count as 1
        float weight = word.getWeight()==null?1:word.getWeight();
        BigInteger hash = hash(word.getText());
        for (int i = 0; i < hashBitCount; i++) {
            // testBit(i) is equivalent to and(ONE.shiftLeft(i)).signum() != 0,
            // without allocating a mask BigInteger per bit
            if (hash.testBit(i)) {
                hashBit[i] += weight;
            } else {
                hashBit[i] -= weight;
            }
        }
    });
    // StringBuilder: no synchronization needed (was StringBuffer)
    StringBuilder fingerprint = new StringBuilder(hashBitCount);
    for (int i = 0; i < hashBitCount; i++) {
        fingerprint.append(hashBit[i] >= 0 ? "1" : "0");
    }
    return fingerprint.toString();
}
 
Example #10
Source File: JaroDistanceTextSimilarity.java    From word with Apache License 2.0 6 votes vote down vote up
/**
 * Computes a similarity score between two word lists using the Jaro distance
 * of their concatenated texts (the Jaro distance is itself the score).
 *
 * @param words1 first word list
 * @param words2 second word list
 * @return similarity score
 */
@Override
protected double scoreImpl(List<Word> words1, List<Word> words2){
    // concatenate each word list into plain text
    StringBuilder text1 = new StringBuilder();
    for (Word word : words1) {
        text1.append(word.getText());
    }
    StringBuilder text2 = new StringBuilder();
    for (Word word : words2) {
        text2.append(word.getText());
    }
    // the Jaro distance doubles as the similarity score
    double score = jaroDistance(text1.toString(), text2.toString());
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("文本1:"+text1.toString());
        LOGGER.debug("文本2:"+text2.toString());
        LOGGER.debug("文本1和文本2的相似度分值:"+score);
    }
    return score;
}
 
Example #11
Source File: PinyinTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Annotates each word, in place, with its full pinyin and acronym pinyin.
 *
 * @param words words to annotate
 */
public static void process(List<Word> words){
    words.forEach(word -> {
        String text = word.getText();
        word.setFullPinYin(getFullPinYin(text));
        word.setAcronymPinYin(getAcronymPinYin(text));
    });
}
 
Example #12
Source File: AntonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: segment two sample texts and print each before and after
 * antonym tagging.
 */
public static void main(String[] args) {
    String[] texts = {
            "5月初有哪些电影值得观看",
            "由于工作不到位、服务不完善导致顾客在用餐时发生不愉快的事情,餐厅方面应该向顾客作出真诚的道歉,而不是敷衍了事。"
    };
    for (String text : texts) {
        List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg(text);
        System.out.println(words);
        // antonym tagging mutates the words in place
        AntonymTagging.process(words);
        System.out.println(words);
    }
}
 
Example #13
Source File: WordSegmenter.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Segments text and removes stop words.
 *
 * NOTE(review): the original Javadoc claimed "uses bidirectional maximum
 * matching" (双向最大匹配算法), but the code selects
 * SegmentationAlgorithm.MaxNgramScore — comment corrected to match the code;
 * confirm which algorithm is actually intended.
 *
 * @param text text to segment
 * @return segmentation result without stop words
 */
public static List<Word> seg(String text){
    List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore).seg(text);
    // stop-word filtering — appears to filter the list in place
    // (its return value, if any, is unused)
    StopWord.filterStopWords(words);
    return words;
}
 
Example #14
Source File: PartOfSpeechTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: segment a sample sentence (keeping stop words) and print it
 * before and after part-of-speech tagging.
 */
public static void main(String[] args) {
    final List<Word> segmented = WordSegmenter.segWithStopWords("我爱中国,我爱杨尚川");
    System.out.println("未标注词性:" + segmented);
    // POS tagging annotates the words in place
    PartOfSpeechTagging.process(segmented);
    System.out.println("标注词性:" + segmented);
}
 
Example #15
Source File: AntonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Tags each word of a segmentation result with its antonyms, in place.
 *
 * @param words segmentation result to annotate
 */
public static void process(List<Word> words){
    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("对分词结果进行反义标注之前:{}", words);
    }
    // antonym tagging; Iterable.forEach + method reference instead of
    // the redundant stream().forEach(lambda)
    words.forEach(AntonymTagging::process);
    if(LOGGER.isDebugEnabled()) {
        LOGGER.debug("对分词结果进行反义标注之后:{}", words);
    }
}
 
Example #16
Source File: AntonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the antonyms of a single word in GENERIC_TRIE and attaches
 * them to the word when present.
 *
 * @param word word to annotate
 */
private static void process(Word word){
    String[] antonym = GENERIC_TRIE.get(word.getText());
    if (antonym == null || antonym.length <= 1) {
        // no antonym entry for this word
        return;
    }
    // the word has antonyms
    word.setAntonym(toWord(antonym));
}
 
Example #17
Source File: MaxNgramScore.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Walks backwards from the terminal virtual node to build the segmentation
 * result in forward order.
 *
 * @param node terminal virtual node
 * @return segmentation result
 */
private List<Word> toWords(Node node){
    // collect texts in reverse order, then read them back-to-front;
    // avoids the legacy synchronized java.util.Stack
    List<String> texts = new ArrayList<>();
    while ((node = node.getPrevious()) != null) {
        // "S" marks the virtual start node; skip it
        if (!"S".equals(node.getText())) {
            texts.add(node.getText());
        }
    }
    List<Word> list = new ArrayList<>(texts.size());
    for (int i = texts.size() - 1; i >= 0; i--) {
        list.add(new Word(texts.get(i)));
    }
    return list;
}
 
Example #18
Source File: MaximumMatching.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Forward maximum matching segmentation: at each position, try the longest
 * slice (up to the intercept length) that is a dictionary word or a
 * recognizable special token, shrinking by one character until a match or
 * a single character remains.
 *
 * @param text text to segment
 * @return segmentation result
 */
@Override
public List<Word> segImpl(String text) {
    List<Word> result = new ArrayList<>();
    // text length
    final int textLen=text.length();
    // length of the slice taken from the not-yet-segmented text
    int len=getInterceptLength();
    // start index of the remaining unsegmented text
    int start=0;
    // keep going while any text remains unsegmented
    while(start<textLen){
        if(len>textLen-start){
            // remaining text is shorter than the slice length:
            // shrink the slice to fit
            len=textLen-start;
        }
        // look up the len-long slice in the dictionary, plus special-case
        // recognition (RecognitionTool — presumably numbers/English/etc.; confirm)
        while(!getDictionary().contains(text, start, len) && !RecognitionTool.recog(text, start, len)){
            // length one and still no match:
            // cut a single character
            if(len==1){
                break;
            }
            // no match: shorten by one and retry
            len--;
        }
        addWord(result, text, start, len);
        // advance past the text just segmented
        start+=len;
        // reset the slice length after every successful cut
        len=getInterceptLength();
    }
    return result;
}
 
Example #19
Source File: SynonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the direct synonyms of a word in GENERIC_TRIE and attaches
 * them (excluding the word itself) when present.
 *
 * @param word word to annotate
 */
private static void processDirectSynonym(Word word){
    String[] synonym = GENERIC_TRIE.get(word.getText());
    if (synonym == null || synonym.length <= 1) {
        // no synonym entry for this word
        return;
    }
    // the word has synonyms; a word is not its own synonym
    List<Word> synonymList = toWord(synonym);
    synonymList.remove(word);
    word.setSynonym(synonymList);
}
 
Example #20
Source File: Trigram.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Computes trigram-model scores for several candidate segmentations at once.
 *
 * NOTE(review): map keys are mutable Lists — correct only if callers do not
 * mutate a segmentation after passing it here.
 *
 * @param sentences candidate segmentation results
 * @return map from each distinct segmentation to its trigram score
 */
@SafeVarargs
public static Map<List<Word>, Float> trigram(List<Word>... sentences){
    Map<List<Word>, Float> map = new HashMap<>();
    // score every candidate segmentation
    for(List<Word> sentence : sentences){
        // identical segmentations are scored only once
        // (containsKey instead of get() != null; values are never null)
        if(map.containsKey(sentence)){
            continue;
        }
        float score=0;
        // a trigram needs at least three words
        if(sentence.size() > 2){
            for(int i=0; i<sentence.size()-2; i++){
                String first = sentence.get(i).getText();
                String second = sentence.get(i+1).getText();
                String third = sentence.get(i+2).getText();
                float trigramScore = getScore(first, second, third);
                // only positive trigram evidence contributes
                if(trigramScore > 0){
                    score += trigramScore;
                }
            }
        }
        map.put(sentence, score);
    }

    return map;
}
 
Example #21
Source File: AntonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps an array of strings into a list of Word objects.
 *
 * @param words word texts
 * @return list of Word objects in the same order
 */
private static List<Word> toWord(String[] words){
    List<Word> result = new ArrayList<>(words.length);
    for (int i = 0; i < words.length; i++) {
        result.add(new Word(words[i]));
    }
    return result;
}
 
Example #22
Source File: FullSegmentation.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Builds every full-segmentation result by walking backwards from each
 * leaf node.
 *
 * @param leaf leaf node collection
 * @return array of full-segmentation results, one per leaf
 */
private List<Word>[] toWords(List<Node> leaf){
    List<Word>[] result = new ArrayList[leaf.size()];
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("全切分结果:");
    }
    int index = 0;
    for (Node node : leaf) {
        result[index] = toWords(node);
        index++;
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("\t" + index + ":" + result[index - 1]);
        }
    }
    return result;
}
 
Example #23
Source File: FullSegmentation.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Full segmentation with ngram-based disambiguation; falls back to reverse
 * maximum matching for long texts.
 *
 * @param text text to segment
 * @return segmentation result
 */
@Override
public List<Word> segImpl(String text) {
    // overly long text falls back to reverse maximum matching
    if (text.length() > PROCESS_TEXT_LENGTH_LESS_THAN) {
        return RMM.segImpl(text);
    }
    // every possible segmentation of the text
    List<Word>[] candidates = fullSeg(text);
    // score each candidate with the ngram model
    Map<List<Word>, Float> scored = ngram(candidates);
    // disambiguate: higher ngram score first, then fewer words
    return disambiguity(scored);
}
 
Example #24
Source File: MinimalWordCount.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Walks backwards from the terminal virtual node to build the segmentation
 * result in forward order.
 *
 * @param node terminal virtual node
 * @return segmentation result
 */
private List<Word> toWords(Node node){
    // collect texts in reverse order, then read them back-to-front;
    // avoids the legacy synchronized java.util.Stack
    List<String> texts = new ArrayList<>();
    while ((node = node.getPrevious()) != null) {
        // "S" marks the virtual start node; skip it
        if (!"S".equals(node.getText())) {
            texts.add(node.getText());
        }
    }
    List<Word> list = new ArrayList<>(texts.size());
    for (int i = texts.size() - 1; i >= 0; i--) {
        list.add(new Word(texts.get(i)));
    }
    return list;
}
 
Example #25
Source File: SynonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps an array of strings into a list of Word objects.
 *
 * @param words word texts
 * @return list of Word objects in the same order
 */
private static List<Word> toWord(String[] words){
    List<Word> result = new ArrayList<>(words.length);
    for (String text : words) {
        result.add(new Word(text));
    }
    return result;
}
 
Example #26
Source File: SynonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Demo: segment two sample texts, then print each after direct and
 * after indirect synonym tagging.
 */
public static void main(String[] args) {
    String[] texts = {"楚离陌千方百计为无情找回记忆", "手劲大的老人往往更长寿"};
    for (String text : texts) {
        List<Word> words = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching).seg(text);
        System.out.println(words);
        // direct synonym tagging
        SynonymTagging.process(words);
        System.out.println(words);
        // indirect (transitive) synonym tagging
        SynonymTagging.process(words, false);
        System.out.println(words);
    }
}
 
Example #27
Source File: MinimumMatching.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Forward minimum matching segmentation: at each position, try the shortest
 * slice that is a dictionary word or a recognizable special token, growing
 * by one character until a match; fall back to a single character when the
 * slice reaches the intercept length or the end of the text.
 *
 * @param text text to segment
 * @return segmentation result
 */
@Override
public List<Word> segImpl(String text) {
    List<Word> result = new ArrayList<>();
    // text length
    final int textLen=text.length();
    // length of the slice taken from the not-yet-segmented text
    int len=1;
    // start index of the remaining unsegmented text
    int start=0;
    // keep going while any text remains unsegmented
    while(start<textLen){
        // look up the len-long slice in the dictionary, plus special-case
        // recognition (RecognitionTool — presumably numbers/English/etc.; confirm)
        while(!getDictionary().contains(text, start, len) && !RecognitionTool.recog(text, start, len)){
            // slice reached the dictionary's maximum word length,
            // or the remaining text is exhausted, with no match:
            // cut a single character
            if(len==getInterceptLength() || len==textLen-start){
                // reset the slice length to one
                len=1;
                break;
            }
            // no match: lengthen by one and retry
            len++;
        }
        addWord(result, text, start, len);
        // advance past the text just segmented
        start+=len;
        // reset the slice length after every successful cut
        len=1;
    }
    return result;
}
 
Example #28
Source File: Bigram.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Fraction of adjacent word pairs that the bigram model recognizes
 * (pairs with positive score / total pairs).
 *
 * @param words segmentation result
 * @return fraction in [0, 1]; 0 for lists with fewer than two words
 */
public static float sentenceScore(List<Word> words){
    int pairs = words.size() - 1;
    if (pairs < 1) {
        // fewer than two words: no bigram to evaluate
        return 0;
    }
    float match = 0;
    for (int i = 0; i < pairs; i++) {
        if (getScore(words.get(i).getText(), words.get(i + 1).getText()) > 0) {
            match++;
        }
    }
    return match / pairs;
}
 
Example #29
Source File: Bigram.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Computes the bigram-model score of a segmentation result: the sum of
 * bigram scores over all adjacent word pairs.
 *
 * @param words segmentation result
 * @return bigram score; 0 for lists with fewer than two words
 */
public static float bigram(List<Word> words){
    float score = 0;
    // sum the bigram score of every adjacent pair
    for (int i = 0; i + 1 < words.size(); i++) {
        score += getScore(words.get(i).getText(), words.get(i + 1).getText());
    }
    return score;
}
 
Example #30
Source File: SynonymTagging.java    From word with Apache License 2.0 5 votes vote down vote up
/**
 * Tags a word with its indirect (transitive-closure) synonyms.
 *
 * @param word word to annotate
 */
private static void processIndirectSynonym(Word word){
    // concurrent set: indirectSynonym iterates the set while adding to it
    Set<Word> synonyms = new ConcurrentSkipListSet<>();
    indirectSynonym(word, synonyms);
    if (synonyms.isEmpty()) {
        return;
    }
    // a word is not its own synonym
    synonyms.remove(word);
    word.setSynonym(new ArrayList<>(synonyms));
}