""" Clean text, make it readable and obtain metadata from it. """ import functools import re import unicodedata from collections import Counter from urllib.parse import urlparse import cld2 import numpy import spacy import spacy.matcher import textacy import textacy.keyterms import textacy.text_utils from bs4 import BeautifulSoup from datasketch import MinHash from gensim.models.keyedvectors import KeyedVectors from gensim.summarization.summarizer import summarize from textpipe.data.emoji import EMOJI_TO_UNICODE_NAME, EMOJI_TO_SENTIMENT from textpipe.wrappers import RedisKeyedVectors from textpipe.util import getattr_ class TextpipeMissingModelException(Exception): """Raised when the requested model is missing""" class RedisIDFWeightingMismatchException(Exception): """Raised when an idf weighting scheme is specified that does not match the specified weighting scheme in RedisKeyedVector""" class Doc: """ Create a doc instance of text, obtain cleaned, readable text and metadata from this doc. Properties: raw: incoming, unedited text language: 2-letter code for the language of the text is_detected_language: is the language detected or specified beforehand is_reliable_language: is the language specified or was it reliably detected hint_language: language you expect your text to be _spacy_nlps: nested dictionary {lang: {model_id: model}} with loaded spacy language modules """ # pylint: disable=too-many-instance-attributes # pylint: disable=too-many-arguments # pylint: disable=too-many-public-methods def __init__(self, raw, language=None, hint_language='en', spacy_nlps=None, gensim_vectors=None): self.raw = raw self._language = language self.hint_language = hint_language self._spacy_nlps = spacy_nlps if spacy_nlps is not None else dict() self._gensim_vectors = gensim_vectors if gensim_vectors is not None else dict() self.is_detected_language = language is None self._is_reliable_language = True if language else None self._text_stats = {} self.nr_train_tokens = 0 @property def language(self): """ Provided or detected language of a text >>> from textpipe.doc import Doc >>> Doc('Test sentence for testing text').language 'en' >>> Doc('Test sentence for testing text', language='en').language 'en' >>> Doc('Test', hint_language='nl').language 'nl' """ if not self._language: self._is_reliable_language, self._language = self.detect_language(self.hint_language) return self._language @property def is_reliable_language(self): """ True if the language was specified or if was it reliably detected >>> from textpipe.doc import Doc >>> Doc('...').is_reliable_language False >>> Doc('Test', hint_language='nl').is_reliable_language True """ if self._is_reliable_language is None: self._is_reliable_language, self._language = self.detect_language(self.hint_language) return self._is_reliable_language @functools.lru_cache() def detect_language(self, hint_language=None): """ Detected the language of a text if no language was provided along with the text Args: hint_language: language you expect your text to be Returns: is_reliable: is the top language is much better than 2nd best language? 
    """

    # pylint: disable=too-many-instance-attributes
    # pylint: disable=too-many-arguments
    # pylint: disable=too-many-public-methods

    def __init__(self, raw, language=None, hint_language='en', spacy_nlps=None,
                 gensim_vectors=None):
        self.raw = raw
        self._language = language
        self.hint_language = hint_language
        self._spacy_nlps = spacy_nlps if spacy_nlps is not None else dict()
        self._gensim_vectors = gensim_vectors if gensim_vectors is not None else dict()
        self.is_detected_language = language is None
        self._is_reliable_language = True if language else None
        self._text_stats = {}
        self.nr_train_tokens = 0

    @property
    def language(self):
        """
        Provided or detected language of a text

        >>> from textpipe.doc import Doc
        >>> Doc('Test sentence for testing text').language
        'en'
        >>> Doc('Test sentence for testing text', language='en').language
        'en'
        >>> Doc('Test', hint_language='nl').language
        'nl'
        """
        if not self._language:
            self._is_reliable_language, self._language = self.detect_language(self.hint_language)
        return self._language

    @property
    def is_reliable_language(self):
        """
        True if the language was specified or if it was reliably detected

        >>> from textpipe.doc import Doc
        >>> Doc('...').is_reliable_language
        False
        >>> Doc('Test', hint_language='nl').is_reliable_language
        True
        """
        if self._is_reliable_language is None:
            self._is_reliable_language, self._language = self.detect_language(self.hint_language)
        return self._is_reliable_language

    @functools.lru_cache()
    def detect_language(self, hint_language=None):
        """
        Detect the language of a text if no language was provided along with the text

        Args:
        hint_language: language you expect your text to be

        Returns:
        is_reliable: whether the top language is much more likely than the 2nd best guess
        language: 2-letter code for the language of the text

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Test')
        >>> doc.detect_language()
        (True, 'en')
        >>> doc.detect_language('nl')
        (True, 'nl')
        >>> Doc('...').detect_language()
        (False, 'un')
        """
        # Drop unicode mark (M) and control/other (C) characters that confuse cld2
        non_printable_chars_removed = ''.join([l for l in self.clean
                                               if unicodedata.category(l)[0] not in {'M', 'C'}])
        is_reliable, _, best_guesses = cld2.detect(non_printable_chars_removed,
                                                   hintLanguage=hint_language,
                                                   bestEffort=True)

        if not best_guesses or len(best_guesses[0]) != 4 or best_guesses[0][1] == 'un':
            return False, 'un'

        return is_reliable, best_guesses[0][1]

    @property
    def _spacy_doc(self):
        """
        Loads the default spacy doc or creates one if necessary

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Test sentence for testing text')
        >>> type(doc._spacy_doc)
        <class 'spacy.tokens.doc.Doc'>
        """
        lang = self.language if self.is_reliable_language else self.hint_language
        return self._load_spacy_doc(lang)

    @functools.lru_cache()
    def _load_spacy_doc(self, lang, model_name=None):
        """ Loads a spacy doc or creates one if necessary """
        # Load the default spacy model for the language if it is not loaded already
        if lang not in self._spacy_nlps or (model_name is None
                                            and model_name not in self._spacy_nlps[lang]):
            if lang not in self._spacy_nlps:
                self._spacy_nlps[lang] = {}
            self._spacy_nlps[lang][None] = self._get_default_nlp(lang)
        if model_name not in self._spacy_nlps[lang] and model_name is not None:
            raise TextpipeMissingModelException(f'Custom model {model_name} '
                                                f'is missing.')
        nlp = self._spacy_nlps[lang][model_name]
        doc = nlp(self.clean_text())
        return doc

    @staticmethod
    @functools.lru_cache()
    def _get_default_nlp(lang):
        """ Loads the spacy default language module for the Doc's language """
        try:
            return spacy.load('{}_core_{}_sm'.format(lang, 'web' if lang == 'en' else 'news'))
        except IOError:
            raise TextpipeMissingModelException(f'Default model for language "{lang}" '
                                                f'is not available.')

    @property
    def clean(self):
        """
        Cleaned text with sensible defaults.

        >>> from textpipe.doc import Doc
        >>> doc = Doc('“Please clean this piece… of text</b>„')
        >>> doc.clean
        '"Please clean this piece... of text"'
        """
        return self.clean_text()

    @functools.lru_cache()
    def clean_text(self, remove_html=True, clean_dots=True, clean_quotes=True,
                   clean_whitespace=True):
        """
        Clean HTML and normalise punctuation.

        >>> from textpipe.doc import Doc
        >>> doc = Doc('“Please clean this piece… of text</b>„')
        >>> doc.clean_text(False, False, False, False) == doc.raw
        True
        """
        text = self.raw
        if remove_html:
            text = BeautifulSoup(text, 'html.parser').get_text()  # remove HTML

        # Three regexes below adapted from Blendle cleaner.py
        # https://github.com/blendle/research-summarization/blob/master/enrichers/cleaner.py#L29
        if clean_dots:
            text = re.sub(r'…', '...', text)
        if clean_quotes:
            text = re.sub(r'[`‘’‛⸂⸃⸌⸍⸜⸝]', "'", text)
            text = re.sub(r'[„“]|(\'\')|(,,)', '"', text)
        if clean_whitespace:
            text = re.sub(r'\s+', ' ', text).strip()

        return text

    @property
    def ents(self):
        """
        A list of the named entities with sensible defaults.

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Sentence for testing Google text')
        >>> doc.ents
        [('Google', 'ORG')]
        """
        return self.find_ents()

    @functools.lru_cache()
    def find_ents(self, model_name=None, ent_attributes=('text', 'label_')):
        """
        Extract a list of the named entities in text, with the possibility of using a custom model.

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Sentence for testing Google text')
        >>> doc.find_ents()
        [('Google', 'ORG')]
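
        A sketch of requesting extra attributes per entity; start_char is a
        standard spaCy Span attribute (skipped because the exact output
        depends on the loaded model):

        >>> doc.find_ents(ent_attributes=('text', 'label_', 'start_char'))  # doctest: +SKIP
        [('Google', 'ORG', 21)]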
        """
        lang = self.language if self.is_reliable_language else self.hint_language
        return list({tuple(getattr_(ent, attr) for attr in ent_attributes)
                     for ent in self._load_spacy_doc(lang, model_name).ents})

    def match(self, matcher):
        """
        Run a SpaCy matcher over the cleaned content

        >>> import spacy.matcher
        >>> from textpipe.doc import Doc
        >>> matcher = spacy.matcher.Matcher(spacy.lang.en.English().vocab)
        >>> matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
        >>> Doc('Test with #hashtag').match(matcher)
        [('#hashtag', 'HASHTAG')]
        """
        return [(self._spacy_doc[start:end].text, matcher.vocab.strings[match_id])
                for match_id, start, end in matcher(self._spacy_doc)]

    @property
    def emojis(self):
        """
        Emojis detected using a SpaCy matcher over the cleaned content, with
        unicode name and sentiment score.

        >>> from pprint import pprint
        >>> from textpipe.doc import Doc
        >>> pprint(Doc('Test with emoji 😀 😋 ').emojis)
        [('😀', 'GRINNING FACE', 0.571753986332574),
         ('😋', 'FACE SAVOURING DELICIOUS FOOD', 0.6335149863760218)]
        """
        detected_emojis = []
        matcher = spacy.matcher.Matcher(self._spacy_doc.vocab)
        for emoji, unicode_name in EMOJI_TO_UNICODE_NAME.items():
            matcher.add(unicode_name, None, [{'ORTH': emoji}])
        for emoji, unicode_name in self.match(matcher):
            detected_emojis.append((emoji, unicode_name, EMOJI_TO_SENTIMENT[emoji]))
        return detected_emojis

    @property
    def nsents(self):
        """
        Extract the number of sentences from text

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Test sentence for testing text. And another sentence for testing!')
        >>> doc.nsents
        2
        """
        return len(list(self._spacy_doc.sents))

    @property
    def sents(self):
        """
        Extract the text and character offset (begin) of sentences from text

        >>> from pprint import pprint
        >>> from textpipe.doc import Doc
        >>> doc = Doc('Test sentence for testing text. '
        ...           'And another one with, some, punctuation! And stuff.')
        >>> pprint(doc.sents)
        [('Test sentence for testing text.', 0),
         ('And another one with, some, punctuation!', 32),
         ('And stuff.', 73)]
        """
        return [(span.text, span.start_char) for span in self._spacy_doc.sents]

    @property
    def nwords(self):
        """
        Extract the number of words from text

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Test sentence for testing text')
        >>> doc.nwords
        5
        """
        return len(self.words)

    @property
    def words(self):
        """
        Extract the text and character offset (begin) of words from text

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Test sentence for testing text.')
        >>> doc.words
        [('Test', 0), ('sentence', 5), ('for', 14), ('testing', 18), ('text', 26), ('.', 30)]
        """
        return [(token.text, token.idx) for token in self._spacy_doc]

    @property
    def word_counts(self):
        """
        Extract words with their counts

        >>> from pprint import pprint
        >>> from textpipe.doc import Doc
        >>> pprint(Doc('Test sentence for testing vectorisation of a sentence.').word_counts)
        {'.': 1,
         'Test': 1,
         'a': 1,
         'for': 1,
         'of': 1,
         'sentence': 2,
         'testing': 1,
         'vectorisation': 1}
        """
        return dict(Counter(word for word, _ in self.words))

    @property
    def complexity(self):
        """
        Determine the complexity of text using the Flesch reading ease test,
        ranging from 0.0 to 100.0 with 0.0 being the most difficult to read.
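
        For reference, the standard (English) Flesch reading-ease formula is
        206.835 - 1.015 * (n_words / n_sents) - 84.6 * (n_syllables / n_words);
        texts without any syllables score 100 here by convention.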

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Test sentence for testing text')
        >>> doc.complexity
        83.32000000000004
        """
        if not self._text_stats:
            self._text_stats = textacy.TextStats(self._spacy_doc)
        if self._text_stats.n_syllables == 0:
            return 100
        return self._text_stats.flesch_reading_ease

    @property
    def sentiment(self):
        """
        Returns polarity score (-1 to 1) and a subjectivity score (0 to 1)

        Currently only English, Dutch, French and Italian are supported

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Dit is een leuke zin.')
        >>> doc.sentiment
        (0.6, 0.9666666666666667)
        """
        if self.language == 'en':
            from pattern.text.en import sentiment as sentiment_en  # pylint: disable=import-outside-toplevel
            return sentiment_en(self.clean)
        if self.language == 'nl':
            from pattern.text.nl import sentiment as sentiment_nl  # pylint: disable=import-outside-toplevel
            return sentiment_nl(self.clean)
        if self.language == 'fr':
            from pattern.text.fr import sentiment as sentiment_fr  # pylint: disable=import-outside-toplevel
            return sentiment_fr(self.clean)
        if self.language == 'it':
            from pattern.text.it import sentiment as sentiment_it  # pylint: disable=import-outside-toplevel
            return sentiment_it(self.clean)

        raise TextpipeMissingModelException(f'No sentiment model for {self.language}')

    @functools.lru_cache()
    def extract_keyterms(self, ranker='textrank', n_terms=10, **kwargs):
        """
        Extract and rank key terms in the document by proxying to
        `textacy.keyterms`. Returns a list of (term, score) tuples. Depending
        on the ranking algorithm used, terms can consist of multiple words.
        Available rankers are TextRank ('textrank'), SingleRank ('singlerank')
        and SGRank ('sgrank').

        >>> from pprint import pprint
        >>> from textpipe.doc import Doc
        >>> doc = Doc('Amsterdam is the awesome capital of the Netherlands.')
        >>> pprint(doc.extract_keyterms(n_terms=3))
        [('awesome', 0.32456160227748454),
         ('capital', 0.32456160227748454),
         ('Amsterdam', 0.17543839772251532)]
        >>> pprint(doc.extract_keyterms(ranker='sgrank'))
        [('awesome capital', 0.5638711013322963),
         ('Netherlands', 0.22636566128805719),
         ('Amsterdam', 0.20976323737964653)]
        >>> pprint(doc.extract_keyterms(ranker='sgrank', ngrams=(1)))
        [('Netherlands', 0.4020557546031188),
         ('capital', 0.29395103364295216),
         ('awesome', 0.18105611227666252),
         ('Amsterdam', 0.12293709947726655)]
        """
        if self.nwords < 1:
            return []
        rankers = ['textrank', 'sgrank', 'singlerank']
        if ranker not in rankers:
            raise ValueError(f'ranker "{ranker}" not available; use one '
                             f'of {rankers}')
        ranking_fn = getattr(textacy.keyterms, ranker)
        return ranking_fn(self._spacy_doc, n_keyterms=n_terms, **kwargs)

    @property
    def keyterms(self):
        """
        Return textranked keyterms for the document.

        >>> from pprint import pprint
        >>> from textpipe.doc import Doc
        >>> doc = Doc('Amsterdam is the awesome capital of the Netherlands.')
        >>> pprint(doc.extract_keyterms(n_terms=3))
        [('awesome', 0.32456160227748454),
         ('capital', 0.32456160227748454),
         ('Amsterdam', 0.17543839772251532)]
        """
        return self.extract_keyterms()

    @property
    def minhash(self):
        """
        A cheap way to compute a hash for finding similarity of docs
        Source: https://ekzhu.github.io/datasketch/minhash.html

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Sentence for computing the minhash')
        >>> doc.minhash[:5]
        [407326892, 814360600, 1099082245, 1176349439, 1735256]
        """
        return self.find_minhash()

    @functools.lru_cache()
    def find_minhash(self, num_perm=128):
        """
        Compute a minhash of the document's words, cached.
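
        More permutations give a closer Jaccard estimate at the cost of speed
        and memory: the expected error is roughly 1 / sqrt(num_perm), so the
        default of 128 permutations gives an error on the order of 0.09.

        >>> from textpipe.doc import Doc
        >>> len(Doc('Sentence for computing the minhash').find_minhash(num_perm=16))
        16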
""" words = self.words doc_hash = MinHash(num_perm=num_perm) for word, _ in words: doc_hash.update(word.encode('utf8')) return list(doc_hash.digest()) def similarity(self, other_doc, metric='jaccard', hash_method='minhash'): """ Computes similarity for two documents. Only minhash Jaccard similarity is implemented. >>> from textpipe.doc import Doc >>> doc1 = Doc('Sentence for computing the minhash') >>> doc2 = Doc('Sentence for computing the similarity') >>> doc1.similarity(doc2) 0.7265625 """ if hash_method == 'minhash' and metric == 'jaccard': hash1 = MinHash(hashvalues=self.minhash) hash2 = MinHash(hashvalues=other_doc.minhash) return hash1.jaccard(hash2) raise NotImplementedError(f'Metric/hash method combination {metric}' f'/{hash_method} is not implemented as similarity metric') @property def word_vectors(self): """ Returns word embeddings for the words in the document. """ return self.generate_word_vectors() @functools.lru_cache() def generate_word_vectors(self, model_name=None): """ Returns word embeddings for the words in the document. The default spacy models don't have "true" word vectors but only context-sensitive tensors that are within the document. Returns: A dictionary mapping words from the document to a dict with the corresponding values of the following variables: has vector: Does the token have a vector representation? vector norm: The L2 norm of the token's vector (the square root of the sum of the values squared) OOV: Out-of-vocabulary (This variable always gets the value True since there are no vectors included in the model) vector: The vector representation of the word >>> from textpipe.doc import Doc >>> doc = Doc('Test sentence') >>> doc.word_vectors['Test']['is_oov'] True >>> len(doc.word_vectors['Test']['vector']) 96 >>> doc.word_vectors['Test']['vector_norm'] == doc.word_vectors['sentence']['vector_norm'] False """ lang = self.language if self.is_reliable_language else self.hint_language return {token.text: {'has_vector': token.has_vector, 'vector_norm': token.vector_norm, 'is_oov': token.is_oov, 'vector': token.vector.tolist()} for token in self._load_spacy_doc(lang, model_name)} @property def doc_vector(self): """ Returns document embeddings based on the words in the document. >>> import numpy >>> from textpipe.doc import Doc >>> numpy.array_equiv(Doc('a b').doc_vector, Doc('a b').doc_vector) True >>> numpy.array_equiv(Doc('a b').doc_vector, Doc('a a b').doc_vector) False """ return self.aggregate_word_vectors() @functools.lru_cache() def aggregate_word_vectors(self, model_name=None, aggregation='mean', normalize=False, exclude_oov=False): """ Returns document embeddings based on the words in the document. >>> import numpy >>> from textpipe.doc import Doc >>> doc1 = Doc('a b') >>> doc2 = Doc('a a b') >>> numpy.array_equiv(doc1.aggregate_word_vectors(), doc1.aggregate_word_vectors()) True >>> numpy.array_equiv(doc1.aggregate_word_vectors(), doc2.aggregate_word_vectors()) False >>> numpy.array_equiv(doc1.aggregate_word_vectors(aggregation='mean'), ... doc2.aggregate_word_vectors(aggregation='sum')) False >>> numpy.array_equiv(doc1.aggregate_word_vectors(aggregation='mean'), ... doc2.aggregate_word_vectors(aggregation='var')) False >>> numpy.array_equiv(doc1.aggregate_word_vectors(aggregation='sum'), ... doc2.aggregate_word_vectors(aggregation='var')) False >>> doc = Doc('sentence with an out of vector word lsseofn') >>> len(doc.aggregate_word_vectors()) 96 >>> numpy.array_equiv(doc.aggregate_word_vectors(exclude_oov=False), ... 

    @functools.lru_cache()
    def aggregate_word_vectors(self, model_name=None, aggregation='mean', normalize=False,
                               exclude_oov=False):
        """
        Returns document embeddings based on the words in the document.

        >>> import numpy
        >>> from textpipe.doc import Doc
        >>> doc1 = Doc('a b')
        >>> doc2 = Doc('a a b')
        >>> numpy.array_equiv(doc1.aggregate_word_vectors(), doc1.aggregate_word_vectors())
        True
        >>> numpy.array_equiv(doc1.aggregate_word_vectors(), doc2.aggregate_word_vectors())
        False
        >>> numpy.array_equiv(doc1.aggregate_word_vectors(aggregation='mean'),
        ...                   doc2.aggregate_word_vectors(aggregation='sum'))
        False
        >>> numpy.array_equiv(doc1.aggregate_word_vectors(aggregation='mean'),
        ...                   doc2.aggregate_word_vectors(aggregation='var'))
        False
        >>> numpy.array_equiv(doc1.aggregate_word_vectors(aggregation='sum'),
        ...                   doc2.aggregate_word_vectors(aggregation='var'))
        False
        >>> doc = Doc('sentence with an out of vector word lsseofn')
        >>> len(doc.aggregate_word_vectors())
        96
        >>> numpy.array_equiv(doc.aggregate_word_vectors(exclude_oov=False),
        ...                   doc.aggregate_word_vectors(exclude_oov=True))
        False
        """
        lang = self.language if self.is_reliable_language else self.hint_language
        tokens = [token for token in self._load_spacy_doc(lang, model_name)
                  if not exclude_oov or not token.is_oov]
        vectors = [token.vector / token.vector_norm if normalize else token.vector
                   for token in tokens]

        if aggregation == 'mean':
            return numpy.mean(vectors, axis=0).tolist()
        if aggregation == 'sum':
            return numpy.sum(vectors, axis=0).tolist()
        if aggregation == 'var':
            return numpy.var(vectors, axis=0).tolist()

        raise NotImplementedError(f'Aggregation method {aggregation} is not implemented.')

    def _load_gensim_word2vec_model(self, model_uri=None, max_lru_cache_size=1024):
        """
        Loads a pre-trained Gensim word2vec keyed vector model from either local disk or Redis

        >>> from textpipe.doc import Doc
        >>> model = Doc('')._load_gensim_word2vec_model('tests/models/gensim_test_nl.kv')
        >>> type(model)
        <class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>
        """
        lang = self.language if self.is_reliable_language else self.hint_language
        if not self._gensim_vectors or lang not in self._gensim_vectors:
            if urlparse(model_uri).scheme == 'redis':
                vectors = RedisKeyedVectors(model_uri, lang, max_lru_cache_size)
                if not vectors.exists:
                    raise TextpipeMissingModelException(f'Redis does not contain a model '
                                                        f'for language {lang}. The model '
                                                        f'needs to be loaded before use '
                                                        f'(see load_keyed_vectors_into_redis).')
            elif model_uri:
                try:
                    vectors = KeyedVectors.load(model_uri, mmap='r')
                    self.nr_train_tokens = sum(token_vocab.count
                                               for token_vocab in vectors.vocab.values())
                except FileNotFoundError:
                    raise TextpipeMissingModelException(
                        f'Gensim keyed vector file {model_uri} is not available.')
            else:
                raise TextpipeMissingModelException(
                    'Either specify a model filename or a redis URI')
            self._gensim_vectors[lang] = vectors
        return self._gensim_vectors[lang]

    @functools.lru_cache()
    def generate_gensim_document_embedding(self, model_uri=None, lowercase=True,
                                           max_lru_cache_size=1024, idf_weighting='naive'):
        """
        Returns document embeddings generated with a Gensim word2vec model.
        The idf_weighting scheme can be 'naive' or 'log'.
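
        Per the implementation below, each word's vector is weighted by
        count / idf, where 'naive' takes the word's raw training-corpus count
        as its idf and 'log' uses numpy.log(nr_train_tokens / (count + 1)) + 1;
        the weighted vectors are then summed into a single document vector.
        For Redis-backed vectors the idf division is already applied at load
        time, so the vectors are only multiplied by the in-document counts.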

        >>> import numpy
        >>> from textpipe.doc import Doc
        >>> doc1 = Doc('textmining is verwant aan tekstanalyse')
        >>> doc2 = Doc('textmining is verwant aan textmining')
        >>> doc3 = Doc('tekstanalyse is verwant aan textmining')
        >>> test_model_file = 'tests/models/gensim_test_nl.kv'
        >>> numpy.allclose(doc1.generate_gensim_document_embedding(model_uri=test_model_file), \
            doc2.generate_gensim_document_embedding(model_uri=test_model_file))
        False
        >>> numpy.allclose(doc1.generate_gensim_document_embedding(model_uri=test_model_file), \
            doc3.generate_gensim_document_embedding(model_uri=test_model_file))
        True
        """
        if not model_uri:
            raise TextpipeMissingModelException('No Gensim keyed vector location specified.')
        model = self._load_gensim_word2vec_model(model_uri, max_lru_cache_size)

        if lowercase:
            prepared_word_counts = [(word.lower(), count)
                                    for word, count in self.word_counts.items()
                                    if word.lower() in model]
        else:
            prepared_word_counts = [(word, count)
                                    for word, count in self.word_counts.items()
                                    if word in model]
        if not prepared_word_counts:
            return []

        if isinstance(model, RedisKeyedVectors):
            # For redis, the word vectors are already divided by the idf when a word2vec model
            # was loaded (see RedisKeyedVectors.load_keyed_vectors_into_redis)
            if model.idf_weighting != idf_weighting:
                raise RedisIDFWeightingMismatchException(f'The specified document embedding idf '
                                                         f'weighting "{idf_weighting}" does not '
                                                         f'match weighting in RedisKeyedVector "'
                                                         f'{model.idf_weighting}"')
            vectors = [model[word] * count for word, count in prepared_word_counts]
        else:
            vectors = []
            for word, count in prepared_word_counts:
                if idf_weighting == 'naive':
                    idf = model.vocab[word].count
                elif idf_weighting == 'log':
                    idf = (numpy.log(self.nr_train_tokens / (model.vocab[word].count + 1)) + 1)
                else:
                    raise ValueError(f'idf_weighting "{idf_weighting}" not available; use '
                                     f'"naive" or "log"')
                vectors.append(model[word] * (count / idf))
        return list(sum(vectors))

    @functools.lru_cache()
    def generate_textrank_summary(self, ratio=0.2, word_count=None):
        """
        Returns a textrank summary of the document (extractive summary)
        generated with gensim. Returns an empty summary if the text could
        not be compressed. If both ratio and word_count are provided,
        ratio is ignored.
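
        A sketch with an explicit word budget instead of a ratio, where
        some_long_text stands in for any multi-sentence string (skipped
        because the exact output depends on the installed gensim version):

        >>> Doc(some_long_text).generate_textrank_summary(word_count=25)  # doctest: +SKIP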
        """
        try:
            return summarize(self._spacy_doc.text, ratio=ratio,
                             word_count=word_count, split=True)
        except ValueError:
            return []

    @property
    def summary(self):
        """
        Returns a textrank summary of the document (extractive summary)

        >>> from pprint import pprint
        >>> from textpipe.doc import Doc
        >>> text = '''Rice Pudding - Poem by Alan Alexander Milne
        ... What is the matter with Mary Jane?
        ... She's crying with all her might and main,
        ... And she won't eat her dinner - rice pudding again -
        ... What is the matter with Mary Jane?
        ... What is the matter with Mary Jane?
        ... I've promised her dolls and a daisy-chain,
        ... And a book about animals - all in vain -
        ... What is the matter with Mary Jane?
        ... What is the matter with Mary Jane?
        ... She's perfectly well, and she hasn't a pain;
        ... But, look at her, now she's beginning again! -
        ... What is the matter with Mary Jane?
        ... What is the matter with Mary Jane?
        ... I've promised her sweets and a ride in the train,
        ... And I've begged her to stop for a bit and explain -
        ... What is the matter with Mary Jane?
        ... What is the matter with Mary Jane?
        ... She's perfectly well and she hasn't a pain,
        ... And it's lovely rice pudding for dinner again!
        ... What is the matter with Mary Jane?'''
        >>> document = Doc(text)
        >>> pprint(document.summary)
        ["She's crying with all her might and main, And she won't eat her dinner - "
         'rice pudding again - What is the matter with Mary Jane?',
         "She's perfectly well and she hasn't a pain, And it's lovely rice pudding for "
         'dinner again!']
        >>> document = Doc('just 1 sentence.')
        >>> document.summary
        []
        """
        return self.generate_textrank_summary()

    def extract_lead(self, nsents=3):
        """
        Returns the lead sentences (text only) of the document, lead-3 by
        default. If the text has fewer sentences than requested, the full
        text is returned.

        >>> from pprint import pprint
        >>> from textpipe.doc import Doc
        >>> text = '''Rice Pudding - Poem by Alan Alexander Milne.
        ... What is the matter with Mary Jane?
        ... She's crying with all her might and main,
        ... And she won't eat her dinner - rice pudding again.
        ... What is the matter with Mary Jane? '''
        >>> document = Doc(text)
        >>> pprint(document.extract_lead())
        ['Rice Pudding - Poem by Alan Alexander Milne.',
         'What is the matter with Mary Jane?',
         "She's crying with all her might and main, And she won't eat her dinner - "
         'rice pudding again.']
        """
        return [s[0] for s in self.sents[:nsents]]

    @property
    def cats(self):
        """
        A dict of categories and their probability in the text.

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Sentence for testing text categorization')
        >>> doc.cats
        {}
        """
        return self.get_cats()

    @functools.lru_cache()
    def get_cats(self, model_name=None):
        """
        Extract a dict of categories and their probability in the text, with
        the possibility of using a custom model.

        >>> from textpipe.doc import Doc
        >>> doc = Doc('Sentence for testing text categorization')
        >>> doc.get_cats()
        {}
        """
        lang = self.language if self.is_reliable_language else self.hint_language
        return self._load_spacy_doc(lang, model_name).cats
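

if __name__ == '__main__':
    # Minimal usage sketch, not part of the library API: assumes the default
    # English spacy model (en_core_web_sm) is installed.
    DOC = Doc('Textpipe cleans this <b>sentence</b> about Google.')
    print(DOC.clean)     # cleaned text with the HTML stripped
    print(DOC.language)  # 2-letter language code detected by cld2
    print(DOC.ents)      # named entities, e.g. [('Google', 'ORG')]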