Python nltk.PorterStemmer() Examples

The following are 10 code examples of nltk.PorterStemmer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk , or try the search function .
Example #1
Source File: ngram_featurizer.py    From metal with Apache License 2.0 6 votes vote down vote up
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example #2
Source File: TF_IDF_Summarization.py    From nlp-akash with MIT License 6 votes vote down vote up
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.

    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable 
Example #3
Source File: TF_IDF_Summarization.py    From nlp-akash with MIT License 6 votes vote down vote up
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix 
Example #4
Source File: data_preparation_tools.py    From corpus-to-graph-ml with MIT License 5 votes vote down vote up
def stem_text(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    porter = nltk.PorterStemmer()
    for t in tokens:
        t = porter.stem(t)
        processed_tokens.append(t)

    return " ".join(processed_tokens)

# Split to train and test sample sets: 
Example #5
Source File: document.py    From gender-bias with MIT License 5 votes vote down vote up
def stemmed_words(self) -> List:
        """
        Compute the stems of words.

        Uses nltk.PorterStemmer.

        Returns:
            List

        """
        words = self.words()
        porter = nltk.PorterStemmer()
        return [porter.stem(w) for w in words] 
Example #6
Source File: utils.py    From freesound-datasets with GNU Affero General Public License v3.0 5 votes vote down vote up
def stem(word):
    ps = PorterStemmer()
    return ps.stem(word) 
Example #7
Source File: eval.py    From propara with Apache License 2.0 5 votes vote down vote up
def stem(cls, w: str):
        if not w or len(w.strip()) == 0:
            return ""
        w_lower = w.lower()
        # Remove leading articles from the phrase (e.g., the rays => rays).
        # FIXME: change this logic to accept a list of leading articles.
        if w_lower.startswith("a "):
            w_lower = w_lower[2:]
        elif w_lower.startswith("an "):
            w_lower = w_lower[3:]
        elif w_lower.startswith("the "):
            w_lower = w_lower[4:]
        elif w_lower.startswith("your "):
            w_lower = w_lower[5:]
        elif w_lower.startswith("his "):
            w_lower = w_lower[4:]
        elif w_lower.startswith("their "):
            w_lower = w_lower[6:]
        elif w_lower.startswith("my "):
            w_lower = w_lower[3:]
        elif w_lower.startswith("another "):
            w_lower = w_lower[8:]
        elif w_lower.startswith("other "):
            w_lower = w_lower[6:]
        elif w_lower.startswith("this "):
            w_lower = w_lower[5:]
        elif w_lower.startswith("that "):
            w_lower = w_lower[5:]
        # Porter stemmer: rays => ray
        return PorterStemmer().stem(w_lower).strip() 
Example #8
Source File: nlp.py    From Quora with MIT License 5 votes vote down vote up
def stemming(tokens):
    """
    stem tokens
    """
    porter = nltk.PorterStemmer()
    return [porter.stem(t) for t in tokens] 
Example #9
Source File: extract_statistical_features.py    From Sarcasm-Detection with MIT License 5 votes vote down vote up
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    if len(n) < 1:
        return {}
    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [t.lower() for t in tokens if not t.startswith('@') and not t.startswith('#')
                      and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = 'gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features


# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features) 
Example #10
Source File: text_utils.py    From document-qa with Apache License 2.0 5 votes vote down vote up
def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        self.stemmer = stemmer
        if stemmer == "port":
            self._stemmer = PorterStemmer()
            self._stem = self._stemmer.stem
        elif stemmer == "wordnet":
            self._stemmer = WordNetLemmatizer()
            self._stem = self._stemmer.lemmatize
        else:
            raise ValueError(stemmer)
        # stemming is slow, so we cache words as we go
        self.normalize_cache = {}