python source code of corpus

import nltk
from toolz.functoolz import partial
from nltk.stem.porter import PorterStemmer


class CorpusBaseProcessor(object):
    """
    Class that processes a corpus
    """
    def transform(self, docs):
        """
        Parameter:
        -----------
        docs: list of (string|list of tokens)
            input corpus
        
        Return:
        ----------
        list of (string|list of tokens):
            transformed corpus
        """
        raise NotImplemented


class CorpusWordLengthFilter(CorpusBaseProcessor):
    def __init__(self, minlen=2, maxlen=35):
        self._min = minlen
        self._max = maxlen

    def transform(self, docs):
        """
        Parameters:
        ----------
        docs: list of list of str
            the tokenized corpus
        """
        assert isinstance(docs[0], list)
        valid_length = (lambda word:
                        len(word) >= self._min and
                        len(word) <= self._max)
        filter_tokens = partial(filter, valid_length)
        return map(filter_tokens, docs)
    

porter_stemmer = PorterStemmer()


class CorpusStemmer(CorpusBaseProcessor):
    def __init__(self, stem_func=porter_stemmer.stem):
        """
        Parameter:
        --------------
        stem_func: function that accepts one token and stem it
        """
        self._stem_func = stem_func

    def transform(self, docs):
        """
        Parameter:
        -------------
        docs: list of list of str
            the documents

        Return:
        -------------
        list of list of str: the stemmed corpus
        """
        assert isinstance(docs[0], list)
        stem_tokens = partial(map, self._stem_func)
        return map(stem_tokens, docs)


class CorpusPOSTagger(CorpusBaseProcessor):
    def __init__(self, pos_tag_func=nltk.pos_tag):
        """
        Parameter:
        --------------
        pos_tag_func: pos_tag function that accepts list of tokens
            and POS tag them
        """
        self._pos_tag_func = pos_tag_func

    def transform(self, docs):
        """
        Parameter:
        -------------
        docs: list of list of str
            the documents

        Return:
        -------------
        list of list of str: the tagged corpus
        """
        assert isinstance(docs[0], list)
        return map(self._pos_tag_func, docs)