Python nltk.PorterStemmer() Examples

The following are 10 code examples for showing how to use nltk.PorterStemmer(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk , or try the search function .

Example 1
Project: metal   Author: HazyResearch   File: ngram_featurizer.py    License: Apache License 2.0 6 votes vote down vote up
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example 2
Project: nlp-akash   Author: akashp1712   File: TF_IDF_Summarization.py    License: MIT License 6 votes vote down vote up
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.

    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable 
Example 3
Project: nlp-akash   Author: akashp1712   File: TF_IDF_Summarization.py    License: MIT License 6 votes vote down vote up
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix 
Example 4
Project: corpus-to-graph-ml   Author: CatalystCode   File: data_preparation_tools.py    License: MIT License 5 votes vote down vote up
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    porter = nltk.PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)

    return " ".join(processed_tokens)

# Split to train and test sample sets: 
Example 5
Project: gender-bias   Author: gender-bias   File: document.py    License: MIT License 5 votes vote down vote up
def stemmed_words(self) -> List:
        """
        Compute the stems of words.

        Uses nltk.PorterStemmer.

        Returns:
            List

        """
        words = self.words()
        porter = nltk.PorterStemmer()
        return [porter.stem(w) for w in words] 
Example 6
Project: freesound-datasets   Author: MTG   File: utils.py    License: GNU Affero General Public License v3.0 5 votes vote down vote up
def stem(word):
    ps = PorterStemmer()
    return ps.stem(word) 
Example 7
Project: propara   Author: allenai   File: eval.py    License: Apache License 2.0 5 votes vote down vote up
def stem(cls, w: str):
        if not w or len(w.strip()) == 0:
            return ""
        w_lower = w.lower()
        # Remove leading articles from the phrase (e.g., the rays => rays).
        # FIXME: change this logic to accept a list of leading articles.
        if w_lower.startswith("a "):
            w_lower = w_lower[2:]
        elif w_lower.startswith("an "):
            w_lower = w_lower[3:]
        elif w_lower.startswith("the "):
            w_lower = w_lower[4:]
        elif w_lower.startswith("your "):
            w_lower = w_lower[5:]
        elif w_lower.startswith("his "):
            w_lower = w_lower[4:]
        elif w_lower.startswith("their "):
            w_lower = w_lower[6:]
        elif w_lower.startswith("my "):
            w_lower = w_lower[3:]
        elif w_lower.startswith("another "):
            w_lower = w_lower[8:]
        elif w_lower.startswith("other "):
            w_lower = w_lower[6:]
        elif w_lower.startswith("this "):
            w_lower = w_lower[5:]
        elif w_lower.startswith("that "):
            w_lower = w_lower[5:]
        # Porter stemmer: rays => ray
        return PorterStemmer().stem(w_lower).strip() 
Example 8
Project: Quora   Author: KevinLiao159   File: nlp.py    License: MIT License 5 votes vote down vote up
def stemming(tokens):
    """
    stem tokens
    """
    porter = nltk.PorterStemmer()
    return [porter.stem(t) for t in tokens] 
Example 9
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_statistical_features.py    License: MIT License 5 votes vote down vote up
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    if len(n) < 1:
        return {}
    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [t.lower() for t in tokens if not t.startswith('@') and not t.startswith('#')
                      and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = 'gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features


# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features) 
Example 10
Project: document-qa   Author: allenai   File: text_utils.py    License: Apache License 2.0 5 votes vote down vote up
def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        self.stemmer = stemmer
        if stemmer == "port":
            self._stemmer = PorterStemmer()
            self._stem = self._stemmer.stem
        elif stemmer == "wordnet":
            self._stemmer = WordNetLemmatizer()
            self._stem = self._stemmer.lemmatize
        else:
            raise ValueError(stemmer)
        # stemming is slow, so we cache words as we go
        self.normalize_cache = {}