Python jieba.analyse.extract_tags() Examples

The following are 5 code examples of jieba.analyse.extract_tags(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module jieba.analyse.
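Before the project examples, here is a minimal self-contained sketch of the call they all build on (the sample sentence and topK value are illustrative only):

import jieba.analyse

text = "自然语言处理是人工智能领域的一个重要方向"
# Top 5 keywords ranked by TF-IDF, together with their weights
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, weight)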
Example #1
Source File: gensim_jb.py    From nlp_learning with MIT License
def xz_keywords():
    """
    Keyword extraction
    """
    key_words = extract_tags(xz_text, topK=300, withWeight=True, allowPOS=())
    # Stop words
    stopwords = pd.read_csv("data/stop_words.txt", index_col=False,
                            quoting=3, sep="\n", names=['stopword'], encoding='utf-8')
    words = [word for word, weight in key_words]
    keywords_df = pd.DataFrame({'keywords': words})    

    # Remove stop words
    keywords_df = keywords_df[~keywords_df.keywords.isin(stopwords.stopword.tolist())]

    # Recover the TF-IDF weight for each keyword that survived the filter
    word_freq = []
    for word in keywords_df.keywords.tolist():
        for w, k in key_words:
            if word == w:
                word_freq.append((word, k))
    print(word_freq)
    show_wordCloud(word_freq) 
Example #2
Source File: jieba_segment.py    From nlp_learning with MIT License
def jieba_keywords():
    """
    Keyword extraction
    """
    
    key_words = extract_tags(st_text, topK=300, withWeight=True, allowPOS=())
    # Stop words
    stopwords = pd.read_csv("data/origin/stop_words.txt", index_col=False,
                            quoting=3, sep="\n", names=['stopword'], encoding='utf-8')
    words = [word for word, weight in key_words]
    keywords_df = pd.DataFrame({'keywords': words})    

    # Remove stop words
    keywords_df = keywords_df[~keywords_df.keywords.isin(stopwords.stopword.tolist())]

    word_freq = []
    for word in keywords_df.keywords.tolist():
        for w, k in key_words:
            if word == w:
                word_freq.append((word, k))
    print("================去掉停用词之后================")
    print(word_freq)

    show_wordCloud(word_freq) 
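Examples #1 and #2 filter stop words after extraction with a pandas pass. jieba.analyse also has a built-in mechanism: register a stop-word file (one word per line) and extract_tags will skip those words itself. A minimal sketch, reusing the stop-word file and st_text variable from Example #2:

import jieba.analyse

# Register the stop-word list once; later extract_tags calls skip these words
jieba.analyse.set_stop_words("data/origin/stop_words.txt")
key_words = jieba.analyse.extract_tags(st_text, topK=300, withWeight=True)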
Example #3
Source File: semantic.py    From chat with MIT License
def get_tag(sentence, config):
    """Get semantic tag of sentence. 获取句子语义标签。
    """
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # list of (word, flag) tuples
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag 
Example #4
Source File: textSimilarity.py    From text-similarity with BSD 2-Clause "Simplified" License
def countIDF(self, text, topK):
    '''
    text: a string. topK: number of top TF-IDF keywords whose in-text
    frequencies are returned, used for computing similarity.
    return: frequency vector
    '''
    tfidf = analyse.extract_tags

    cipin = {}  # word frequencies after segmentation

    fenci = jieba.cut(text)

    # Count each word's frequency
    for word in fenci:
        if word not in cipin:
            cipin[word] = 0
        cipin[word] += 1

    # Extract the top-topK keywords by TF-IDF, with each term's weight
    keywords = tfidf(text, topK, withWeight=True)

    ans = []
    # Look up the in-text frequency of each extracted keyword
    for keyword in keywords:
        ans.append(cipin[keyword[0]])  # frequency of each top-topK term

    return ans
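countIDF returns a plain list of frequencies; this excerpt does not show how the surrounding class compares two texts, but a typical next step is a cosine similarity over two equal-length frequency vectors. A minimal sketch of that step (the helper name is illustrative, not from the project):

import math

def cosine_similarity(v1, v2):
    # Cosine of the angle between two equal-length frequency vectors
    dot = sum(a * b for a, b in zip(v1, v2))
    norm = math.sqrt(sum(a * a for a in v1)) * math.sqrt(sum(b * b for b in v2))
    return dot / norm if norm else 0.0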
Example #5
Source File: semantic.py    From chat with MIT License
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.
    将句子切分为同义词向量标签。

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with default value of the word segmentation tool.
    如果同义词词典中没有则标注为切词工具默认的词性。

    Args:
        pattern: 'w'-分词, 'k'-唯一关键词,'t'-关键词列表, 'wf'-分词标签, 'tf-关键词标签'。
    """
    # Strip sentence-final punctuation
    sentence = sentence.rstrip(''.join(punctuation_all))
    # Strip sentence-final modal particles
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        synonym_vector = [item for item in jieba.cut(sentence) if item not in filter_characters]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        # if item.word not in filter_characters]
        # Modified 2017-04-27:
        for item in result:
            if item.word not in filter_characters:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
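The patterns above either take extract_tags output as-is or derive part-of-speech flags separately via posseg. extract_tags can also filter by part of speech directly through its allowPOS parameter, which Examples #1 and #2 pass as an empty tuple (no filter). A short illustrative sketch (the sentence is made up; the codes follow jieba's POS tag set):

import jieba.analyse

# Keep only nouns, place names, verbal nouns and verbs among the keywords
tags = jieba.analyse.extract_tags("我爱北京天安门", topK=5, allowPOS=('n', 'ns', 'vn', 'v'))
print(tags)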