# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time     :2019/8/21 22:01
# @author   :Mo
# @function : textrank using tfidf of sklearn, pagerank of networkx


from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import jieba
import re


def cut_sentence(sentence):
    """
        Split a document into sentences on Chinese/English terminators.
    :param sentence: str, raw document text
    :return: list, non-empty sentence fragments in original order
    """
    re_sen = re.compile('[.。?!?!\n\r]')
    sentences = re_sen.split(sentence)
    # Consecutive or trailing delimiters produce empty fragments; drop them
    # so downstream ranking never scores blank "sentences".
    return [sen for sen in sentences if sen.strip()]


def tdidf_sim(sentences):
    """
        Build a TF-IDF row vector for each sentence.
    :param sentences: list of str, one sentence per entry
    :return: scipy sparse matrix, l2-normalized tf-idf rows (one per sentence)
    """
    # TfidfVectorizer already applies idf weighting and l2 normalization, so
    # the original second pass through TfidfTransformer (which re-applied idf
    # to tf-idf values as if they were raw counts) was redundant double
    # weighting and has been removed.
    # token_pattern is ignored (with a warning) when a custom tokenizer is
    # supplied, so it is omitted here.
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2),            # unigrams + bigrams
                            stop_words=[' ', '\t', '\n'],  # whitespace stopwords
                            max_features=10000,
                            min_df=1,
                            max_df=0.9,
                            use_idf=True,        # idf weighting
                            smooth_idf=True,     # smooth idf, avoids div-by-zero
                            sublinear_tf=True)   # 1 + log(tf) scaling
    return model.fit_transform(sentences)


def textrank_tfidf(sentences, topk=6):
    """
        Extractive summary: pagerank over a tf-idf sentence-similarity graph.
    :param sentences: str, docs of text
    :param topk: int, maximum number of sentences to return
    :return: list of (score, sentence) tuples, highest score first
    """
    # split the document into sentences
    sentences = list(cut_sentence(sentences))
    # one l2-normalized tf-idf row per sentence
    matrix_norm = tdidf_sim(sentences)
    # rows are l2-normalized, so matrix_norm * matrix_norm.T is the pairwise
    # cosine-similarity adjacency matrix.
    # networkx >= 3.0 removed from_scipy_sparse_matrix (renamed to
    # from_scipy_sparse_array); pick whichever this installation provides.
    from_sparse = getattr(nx, "from_scipy_sparse_array", None) \
        or nx.from_scipy_sparse_matrix
    tfidf_sim = from_sparse(matrix_norm * matrix_norm.T)
    # pagerank centrality = sentence importance
    sens_scores = nx.pagerank(tfidf_sim)
    # sort sentence indices by score, descending
    sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True)
    # clamp topk so slicing never exceeds the sentence count
    topk = min(len(sentences), topk)
    # map node indices back to the original sentences
    return [(score, sentences[idx]) for idx, score in sen_rank][:topk]


if __name__ == '__main__':
    # Demo document (Chinese text about the PageRank algorithm).
    doc = "是上世纪90年代末提出的一种计算网页权重的算法。" \
          "当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \
          "业界急需一种相对比较准确的网页重要性计算方法," \
          "是人们能够从海量互联网世界中找出自己需要的信息。" \
          "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \
          "Google把从A页面到B页面的链接解释为A页面给B页面投票," \
          "Google根据投票来源甚至来源的来源,即链接到A页面的页面" \
          "和投票目标的等级来决定新的等级。简单的说," \
          "一个高等级的页面可以使其他低等级页面的等级提升。" \
          "PageRank The PageRank Citation Ranking: Bringing Order to the Web,"\
          "具体说来就是,PageRank有两个基本思想,也可以说是假设," \
          "即数量假设:一个网页被越多的其他页面链接,就越重);" \
          "质量假设:一个网页越是被高质量的网页链接,就越重要。" \
          "总的来说就是一句话,从全局角度考虑,获取重要的信息。"
    # NOTE: the original doc.encode('utf-8').decode('utf-8') round-trip was a
    # no-op on a str that is already valid text and has been removed.
    for score_sen in textrank_tfidf(doc, 32):
        print(score_sen)