python source code of document

#!/usr/bin/env python3

from typing import List, Tuple

from contextlib import redirect_stdout
import os
import nltk

# Do not print log messages:
with redirect_stdout(open(os.devnull, "w")):
    nltk.download("punkt")
    nltk.download("wordnet")
    nltk.download("averaged_perceptron_tagger")


def cached(method):
    """
    Method decorator for the Document class, caching results if enabled
    """
    result = {}

    def wrapper(*args):
        if not args[0]._use_cache:
            return method(*args)
        params = tuple(args)
        if params not in result:
            result[params] = method(*args)
        return result[params]

    return wrapper


class Document:
    """
    Base class for Document, a datatype that stores text and enables
    basic textwise feature extraction, such as listing words or sentences.
    """

    def __init__(self, document: "Document", **kwargs) -> None:
        """
        Create a new Document.

        Arguments:
            document (str): The string, or a pathlike, to load
            use_cache (bool: True): Cache sentences and words. This is
                especially useful if you anticipate referencing these functions
                commonly. Disable this on ridiculously large documents to save
                on memory usage.

        """
        # Pass `use_cache=False` if you want to disable caching
        self._use_cache = True
        if "no_cache" in kwargs:
            self._use_cache = not kwargs["no_cache"]

        # Try to load the document from disk.
        if not os.path.exists(document):
            self._text = document
        else:
            # If you fail to load from disk, it's because it's a string!
            with open(document, "r") as f:
                self._text = f.read()

    def text(self) -> str:
        """
        Returns the text of the document.

        Returns:
            str
        """
        return self._text

    @cached
    def sentences(self) -> List[str]:
        """
        Compute a list of sentences.

        Uses nltk.sent_tokenize.

        Returns:
            List[str]

        """
        return [s.replace("\n", " ") for s in nltk.sent_tokenize(self._text)]

    @cached
    def words(self) -> List[str]:
        """
        Compute a list of words from this Document.

        Uses nltk.word_tokenize.

        Returns:
            List[str]

        """
        return nltk.word_tokenize(self._text)

    @cached
    def words_with_indices(self) -> List[Tuple[str, int, int]]:
        """
        Compute a list of words, with beginning and end indices

        Returns:
            List[Tuple[str, int, int]]
        """
        offset = 0
        token_indices = []
        for word in self.words():
            offset = self._text.find(word, offset)
            token_indices.append((word, offset, offset + len(word)))
            offset += len(word)
        return token_indices

    @cached
    def words_by_part_of_speech(self) -> dict:
        """
        Compute the parts of speech for each word in the document.

        Uses nltk.pos_tag.

        Returns:
            dict

        """
        words = self.words()
        tagged = nltk.pos_tag(words)
        categories = {}
        for _type in {t[1] for t in tagged}:
            categories[_type] = [t[0] for t in tagged if t[1] == _type]
        return categories

    @cached
    def stemmed_words(self) -> List:
        """
        Compute the stems of words.

        Uses nltk.PorterStemmer.

        Returns:
            List

        """
        words = self.words()
        porter = nltk.PorterStemmer()
        return [porter.stem(w) for w in words]