Python nltk.text Examples

The following code examples show how to use the nltk.text module. They are taken from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.

Example 1
Project: atap   Author: foxbook   File: vectorization.py    Apache License 2.0 5 votes vote down vote up
def tokenize(text):
    """
    Yield the stemmed, lowercased word tokens of ``text``, skipping punctuation.

    The text is lowercased, split with ``nltk.word_tokenize``, and every
    token that is not pure punctuation is reduced with the English Snowball
    stemmer.

    Args:
        text: the raw document string to tokenize.

    Yields:
        One stemmed token at a time, in document order.
    """
    stem = nltk.stem.SnowballStemmer('english')
    punctuation = set(string.punctuation)

    for token in nltk.word_tokenize(text.lower()):
        # Check character by character: a substring test against
        # string.punctuation lets multi-character punctuation tokens such as
        # "..." or the tokenizer's `` and '' quote marks slip through.
        if all(char in punctuation for char in token):
            continue
        yield stem.stem(token)


# The corpus object 
Example 2
Project: atap   Author: foxbook   File: vectorization.py    Apache License 2.0 5 votes vote down vote up
def sklearn_frequency_vectorize(corpus):
    """
    Frequency-encode ``corpus`` with Scikit-Learn.

    Fits a default ``CountVectorizer`` on ``corpus`` and returns the result
    of its ``fit_transform`` call.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    return CountVectorizer().fit_transform(corpus)
Example 3
Project: atap   Author: foxbook   File: vectorization.py    Apache License 2.0 5 votes vote down vote up
def sklearn_one_hot_vectorize(corpus):
    """
    One-hot encode ``corpus`` with Scikit-Learn.

    The documents are first frequency-encoded with a default
    ``CountVectorizer``; the counts are then passed through a default
    ``Binarizer``. The length of the first row is printed before and after
    binarization.

    Args:
        corpus: the documents handed to ``CountVectorizer.fit_transform``.

    Returns:
        The binarized array produced by ``Binarizer.fit_transform``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import Binarizer

    freq = CountVectorizer()
    # Densify once and reuse the array for both the length report and the
    # binarization instead of calling toarray() twice.
    counts = freq.fit_transform(corpus).toarray()

    print(len(counts[0]))

    onehot = Binarizer()
    vectors = onehot.fit_transform(counts)

    print(len(vectors[0]))

    # Hand the encoding back to the caller; it used to be computed and then
    # discarded, leaving the function with an implicit None result.
    return vectors
Example 4
Project: atap   Author: foxbook   File: vectorization.py    Apache License 2.0 5 votes vote down vote up
def nltk_tfidf_vectorize(corpus):
    """
    Lazily TF-IDF encode ``corpus`` with NLTK.

    Every document is tokenized with ``tokenize`` and the token lists are
    gathered into a ``TextCollection``. One dict per document is then
    yielded, mapping each of its terms to ``TextCollection.tf_idf`` of that
    term within the document.
    """
    from nltk.text import TextCollection

    tokenized = [list(tokenize(document)) for document in corpus]
    collection = TextCollection(tokenized)

    for tokens in tokenized:
        scores = {}
        for term in tokens:
            scores[term] = collection.tf_idf(term, tokens)
        yield scores
Example 5
Project: atap   Author: foxbook   File: vectorization.py    Apache License 2.0 5 votes vote down vote up
def sklearn_tfidf_vectorize(corpus):
    """
    TF-IDF encode ``corpus`` with Scikit-Learn.

    Fits a default ``TfidfVectorizer`` on ``corpus`` and returns the result
    of its ``fit_transform`` call.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    return TfidfVectorizer().fit_transform(corpus)