python source code of textrank

# -*- coding: utf-8 -*-
from __future__ import division
import networkx
from .baseclass import BaseSummarizer


class TextRankSummarizer(BaseSummarizer):

    def summarize(self, text, length=5, weighting='frequency', norm=None):
        """
        Implements the TextRank summarization algorithm, which follows closely to the PageRank algorithm for ranking
        web pages.

        :param text: a string of text to be summarized, path to a text file, or URL starting with http
        :param length: the length of the output summary; either a number of sentences (e.g. 5) or a percentage
        of the original document (e.g. 0.5)
        :param weighting: 'frequency', 'binary' or 'tfidf' weighting of sentence terms ('frequency' by default)
        :param norm: if 'l1' or 'l2', normalizes words by the length of their associated sentence to "down-weight"
        the voting power of long sentences (None by default)
        :return: list of sentences for the summary
        """

        text = self._parse_input(text)

        sentences, unprocessed_sentences = self._tokenizer.tokenize_sentences(text)

        length = self._parse_summary_length(length, len(sentences))
        if length == len(sentences):
            return unprocessed_sentences

        # Compute the word frequency matrix. If norm is set to 'l1' or 'l2' then words are normalized
        # by the length of their associated sentences (such that each vector of sentence terms sums to 1).
        word_matrix = self._compute_matrix(sentences, weighting=weighting, norm=norm)

        # Build the similarity graph by calculating the number of overlapping words between all
        # combinations of sentences.
        similarity_matrix = (word_matrix * word_matrix.T)

        similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
        scores = networkx.pagerank(similarity_graph)

        ranked_sentences = sorted(
            ((score, ndx) for ndx, score in scores.items()), reverse=True
        )

        top_sentences = [ranked_sentences[i][1] for i in range(length)]
        top_sentences.sort()

        return [unprocessed_sentences[i] for i in top_sentences]