Python nltk.stem.SnowballStemmer() Examples

The following are 6 code examples showing how to use nltk.stem.SnowballStemmer(). These examples are extracted from open-source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk.stem, or try the search function.

Example 1
Project: Fake_News_Detection   Author: nishitpatel01   File: DataPrep.py    License: MIT License 6 votes vote down vote up
def data_qualityCheck():
    """Print null-value counts and summary info for the train/test/valid sets.

    Relies on the module-level DataFrames ``train_news``, ``test_news`` and
    ``valid_news``; results are printed/echoed rather than returned.
    """
    print("Checking data qualitites...")

    # Training set is inspected first.
    train_news.isnull().sum()
    train_news.info()

    print("check finished.")

    # The test and validation sets receive the same two checks.
    for frame in (test_news, valid_news):
        frame.isnull().sum()
        frame.info()

#run the below function call to see the quality check results
#data_qualityCheck()



#eng_stemmer = SnowballStemmer('english')
#stopwords = set(nltk.corpus.stopwords.words('english'))

#Stemming 
Example 2
Project: Interactive-Semantic-Parsing   Author: LittleYUYU   File: ppdb.py    License: Apache License 2.0 6 votes vote down vote up
def collect_pairs_by_rel(filename, rel):
    """Collect phrase pairs from a PPDB dump that carry the given relation.

    Args:
        filename: path to a PPDB file whose fields are '|||'-separated;
            field 1 is the phrase, field 2 the paraphrase, and the last
            field the entailment relation label.
        rel: entailment relation label to keep.

    Returns:
        dict mapping each phrase to a set of paraphrases, populated
        symmetrically (both directions) via add_to_dict_of_set. Pairs whose
        two sides reduce to the same Snowball stem are skipped.
    """
    stemmer = SnowballStemmer("english")

    phrase2paraphrase = dict()

    # Stream the file line by line instead of materializing readlines(),
    # and split each record on '|||' once instead of three times.
    with open(filename, "r") as f:
        for line in f:
            fields = line.strip().split('|||')
            phrase = fields[1].strip()
            paraphrase = fields[2].strip()

            # Skip trivial pairs that share a stem (e.g. inflections).
            if stemmer.stem(phrase) == stemmer.stem(paraphrase):
                continue

            entailment = fields[-1].strip()
            if entailment == rel:
                add_to_dict_of_set(phrase, paraphrase, phrase2paraphrase)
                add_to_dict_of_set(paraphrase, phrase, phrase2paraphrase)

    print("Size: %d" % len(phrase2paraphrase))
    return phrase2paraphrase
Example 3
Project: PyTLDR   Author: jaijuneja   File: tokenizer.py    License: GNU General Public License v3.0 6 votes vote down vote up
def __init__(self, language='english', stopwords=None, stemming=True):
    """Configure the tokenizer's stemmer and stopword list.

    Args:
        language: Snowball stemmer language name; also selects the
            built-in stopword file (lower-cased) when no stopwords are given.
        stopwords: a list of stopword strings, a path to a .txt file,
            or None to load the bundled default list.
        stemming: when False, no stemmer is created (self._stemmer is None).

    Raises:
        IOError: if a stopwords path was supplied but cannot be read.
    """
    if stemming:
        self._stemmer = SnowballStemmer(language)
    else:
        self._stemmer = None

    if isinstance(stopwords, list):
        self._stopwords = stopwords
    elif isinstance(stopwords, str):
        # FIX: the original tested (str, unicode); `unicode` does not exist
        # on Python 3 and raised NameError whenever a path was passed.
        # stopwords argument is a path
        try:
            self._stopwords = self._load_stopwords(stopwords)
        except IOError:
            raise IOError('stopwords argument must be a path to a .txt file, a list of word strings '
                          'or None (which loads the default list)')
    else:
        # Load built-in stopwords shipped one directory above this module.
        stopwords_dir = 'stopwords/{0}.txt'.format(language.lower())
        application_root = os.path.dirname(__file__)
        stopwords_file = os.path.join(application_root, '..', stopwords_dir)
        self._stopwords = self._load_stopwords(stopwords_file)
Example 4
Project: summarize   Author: despawnerer   File: language.py    License: MIT License 5 votes vote down vote up
def __init__(self, language):
    """Bind the language name plus its NLTK stopword list and stemmer."""
    self.language = language
    self.stemmer = stem.SnowballStemmer(language)
    self.stopwords = corpus.stopwords.words(language)
Example 5
Project: Interactive-Semantic-Parsing   Author: LittleYUYU   File: ppdb.py    License: Apache License 2.0 5 votes vote down vote up
def clean_paraphrase(paraphrase_dict):
    """Drop paraphrases that share a Snowball stem with their source phrase.

    Args:
        paraphrase_dict: dict mapping a phrase to an iterable of paraphrases.

    Returns:
        New dict mapping each phrase to the set of paraphrases whose stem
        differs from the phrase's own stem; phrases left with no surviving
        paraphrases are omitted. Prints the sizes before and after cleaning.
    """
    stemmer = SnowballStemmer("english")
    paraphrase_dict_clean = dict()
    print("Size: %d" % len(paraphrase_dict))

    for phrase, paraphrases in paraphrase_dict.items():
        # Hoist the loop-invariant stem of the source phrase out of the
        # inner loop (it was recomputed once per paraphrase).
        phrase_stem = stemmer.stem(phrase)
        new_paraphrases = {p for p in paraphrases
                           if stemmer.stem(p) != phrase_stem}
        if new_paraphrases:
            paraphrase_dict_clean[phrase] = new_paraphrases

    print("Size: %d" % len(paraphrase_dict_clean))
    return paraphrase_dict_clean
Example 6
Project: pliers   Author: tyarkoni   File: test_text_filters.py    License: BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def test_word_stemming_filter():
    """Exercise WordStemmingFilter with every supported stemmer choice.

    Covers: default (Porter), named 'snowball', a pre-built StemmerI
    instance, WordNet lemmatization (case-insensitive and case-sensitive),
    rejection of unknown stemmer names, and a multi-word TextStim.
    """
    # Sample stim read from a text file; 'to' column supplies onsets.
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    # Snowball-english must agree with Porter on this sample (same target).
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    # Passing an already-constructed stemmer object instead of a name.
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    # Fetch the NLTK resources WordNet lemmatization needs, if missing.
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    # Lemmas are lower-cased by default ('These' -> 'these').
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    # Original casing survives for words the lemmatizer leaves alone.
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    # TextStim (single block of text) rather than ComplexTextStim.
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'