Python nltk.PorterStemmer() Examples
The following are 10 code examples for showing how to use nltk.PorterStemmer(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
You may check out the related API usage on the sidebar.
You may also want to check out all available functions and classes of the module nltk, or try the search function.
Example 1
Project: metal Author: HazyResearch File: ngram_featurizer.py License: Apache License 2.0 | 6 votes |
def __init__(
    self,
    anonymize=True,
    trim_window=5,
    lowercase=True,
    drop_stopwords=True,
    stem=True,
    ngram_range=(1, 3),
    **vectorizer_kwargs,
):
    """Configure an n-gram featurizer.

    :param anonymize: stored flag; consumed elsewhere in the class.
    :param trim_window: stored flag; consumed elsewhere in the class.
    :param lowercase: stored flag; consumed elsewhere in the class.
    :param drop_stopwords: if True, download NLTK's stopword corpus and
        cache the English stop words as a set for fast membership tests.
    :param stem: if True, build a Porter stemmer for later use.
    :param ngram_range: (min_n, max_n) passed to ``CountVectorizer``.
    :param vectorizer_kwargs: forwarded verbatim to ``CountVectorizer``.
    """
    self.anonymize = anonymize
    self.lowercase = lowercase
    self.drop_stopwords = drop_stopwords
    if drop_stopwords:
        # Network/disk side effect: fetches the corpus if not present.
        nltk.download("stopwords")
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
    self.trim_window = trim_window
    self.stem = stem
    if stem:
        self.porter = nltk.PorterStemmer()
    # binary=True: presence/absence features rather than raw counts.
    self.vectorizer = CountVectorizer(
        ngram_range=ngram_range, binary=True, **vectorizer_kwargs
    )
Example 2
Project: nlp-akash Author: akashp1712 File: TF_IDF_Summarization.py License: MIT License | 6 votes |
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.

    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.

    :param text_string: raw text to tokenize and count.
    :rtype: dict  -- maps stemmed word -> occurrence count
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    freqTable = dict()
    for word in word_tokenize(text_string):
        # Fix: filter stop words on the surface form BEFORE stemming.
        # The original stemmed first, so stop words whose stem differs
        # from the listed form (e.g. "very" -> "veri") were never removed.
        # NOTE(review): tokens are not lowercased here, unlike
        # _create_frequency_matrix — consider aligning; verify with callers.
        if word in stopWords:
            continue
        word = ps.stem(word)
        freqTable[word] = freqTable.get(word, 0) + 1

    return freqTable
Example 3
Project: nlp-akash Author: akashp1712 File: TF_IDF_Summarization.py License: MIT License | 6 votes |
def _create_frequency_matrix(sentences):
    """Build a per-sentence frequency table of stemmed, non-stop words.

    :param sentences: iterable of sentence strings.
    :return: dict mapping the first 15 characters of each sentence to a
        {stem: count} table.  (Keys can collide for sentences sharing a
        15-char prefix — kept from the original design.)
    """
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        for word in word_tokenize(sent):
            word = word.lower()
            # Fix: filter stop words on the lowercased surface form BEFORE
            # stemming.  The original stemmed first, so stop words whose
            # stem differs from the listed form (e.g. "very" -> "veri")
            # were never removed.
            if word in stopWords:
                continue
            word = ps.stem(word)
            freq_table[word] = freq_table.get(word, 0) + 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix
Example 4
Project: corpus-to-graph-ml Author: CatalystCode File: data_preparation_tools.py License: MIT License | 5 votes |
def stem_text(sent, context=None):
    """Return *sent* with every token replaced by its Porter stem.

    :param sent: sentence string to tokenize and stem.
    :param context: unused; kept for interface compatibility.
    :return: single-space-joined string of stemmed tokens.
    """
    stemmer = nltk.PorterStemmer()
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(sent))


# Split to train and test sample sets:
Example 5
Project: gender-bias Author: gender-bias File: document.py License: MIT License | 5 votes |
def stemmed_words(self) -> List:
    """
    Compute the stems of words.

    Uses nltk.PorterStemmer on each word returned by ``self.words()``.

    Returns:
        List of stemmed words, in the original order.
    """
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(word) for word in self.words()]
Example 6
Project: freesound-datasets Author: MTG File: utils.py License: GNU Affero General Public License v3.0 | 5 votes |
def stem(word):
    """Return the Porter stem of *word*.

    Perf fix: the original built a fresh ``PorterStemmer`` on every call;
    the instance is stateless across calls here, so we lazily create one
    and cache it on the function itself.

    :param word: word to stem.
    :return: stemmed word (str).
    """
    stemmer = getattr(stem, "_stemmer", None)
    if stemmer is None:
        stemmer = stem._stemmer = PorterStemmer()
    return stemmer.stem(word)
Example 7
Project: propara Author: allenai File: eval.py License: Apache License 2.0 | 5 votes |
def stem(cls, w: str):
    """Normalize a phrase: strip one leading article, lowercase, Porter-stem.

    :param w: phrase to normalize (may be None or whitespace-only).
    :return: stemmed, stripped phrase; "" for empty/blank input.
    """
    if not w or len(w.strip()) == 0:
        return ""
    w_lower = w.lower()
    # Remove a leading article from the phrase (e.g., the rays => rays).
    # Resolves the original FIXME: the articles are now a data list rather
    # than an 11-branch if/elif chain.  Each prefix ends with a space, so
    # at most one can match and iteration order is immaterial.
    leading_articles = (
        "a ", "an ", "the ", "your ", "his ", "their ",
        "my ", "another ", "other ", "this ", "that ",
    )
    for article in leading_articles:
        if w_lower.startswith(article):
            w_lower = w_lower[len(article):]
            break
    # Porter stemmer: rays => ray
    return PorterStemmer().stem(w_lower).strip()
Example 8
Project: Quora Author: KevinLiao159 File: nlp.py License: MIT License | 5 votes |
def stemming(tokens):
    """
    Stem tokens.

    :param tokens: iterable of word tokens.
    :return: list with each token replaced by its Porter stem, order kept.
    """
    stemmer = nltk.PorterStemmer()
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed
Example 9
Project: Sarcasm-Detection Author: MirunaPislar File: extract_statistical_features.py License: MIT License | 5 votes |
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    """Build a bag-of-ngrams feature dict from *tokens*.

    :param tokens: list of string tokens.
    :param n: list of gram sizes to extract (e.g. [1, 2]); empty => {}.
    :param use_just_words: drop @mentions, #hashtags and punctuation,
        lowercasing the remainder (ignored when *for_semantics* is True).
    :param stem: lowercase and Porter-stem every token first
        (ignored when *for_semantics* is True).
    :param for_semantics: skip all token preprocessing.
    :return: dict mapping "gram w1 w2 ... " strings to their counts.
    """
    if len(n) < 1:
        return {}

    if not for_semantics:
        if stem:
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(tok.lower()) for tok in tokens]
        if use_just_words:
            tokens = [
                tok.lower()
                for tok in tokens
                if not tok.startswith('@')
                and not tok.startswith('#')
                and tok not in string.punctuation
            ]

    # Count each n-gram key in a single pass (same mapping the original
    # produced via set + list.count, without the quadratic rescan).
    counts = {}
    for size in n:
        for gram in ngrams(tokens, size):
            key = 'gram ' + ''.join(part + ' ' for part in gram)
            counts[key] = counts.get(key, 0) + 1
    return counts


# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features)
Example 10
Project: document-qa Author: allenai File: text_utils.py License: Apache License 2.0 | 5 votes |
def __init__(self, lower: bool = True, stemmer="port"):
    """Configure a word normalizer.

    :param lower: whether callers should lowercase (stored flag).
    :param stemmer: "port" for nltk's PorterStemmer, "wordnet" for
        WordNetLemmatizer.
    :raises ValueError: for any other *stemmer* value (note: ``lower``
        and ``stemmer`` attributes are assigned before validation, as in
        the original).
    """
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        engine = PorterStemmer()
        normalize = engine.stem
    elif stemmer == "wordnet":
        engine = WordNetLemmatizer()
        normalize = engine.lemmatize
    else:
        raise ValueError(stemmer)
    self._stemmer = engine
    self._stem = normalize
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}