Python nltk.stem.SnowballStemmer() Examples

The following are 6 code examples showing how to use nltk.stem.SnowballStemmer(). These examples are extracted from open-source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk.stem, or try the search function.

Example 1
Project: Fake_News_Detection   Author: nishitpatel01   File: DataPrep.py    License: MIT License 6 votes vote down vote up
def data_qualityCheck():
    """Print null-value counts and summary info for the train/test/valid sets.

    Relies on the module-level DataFrames ``train_news``, ``test_news`` and
    ``valid_news``; results are printed/echoed rather than returned.
    """
    print("Checking data qualitites...")

    # Training set is inspected first.
    train_news.isnull().sum()
    train_news.info()

    print("check finished.")

    # The test and validation sets receive the same two checks.
    for frame in (test_news, valid_news):
        frame.isnull().sum()
        frame.info()

#run the below function call to see the quality check results
#data_qualityCheck()



#eng_stemmer = SnowballStemmer('english')
#stopwords = set(nltk.corpus.stopwords.words('english'))

#Stemming 
Example 2
Project: Interactive-Semantic-Parsing   Author: LittleYUYU   File: ppdb.py    License: Apache License 2.0 6 votes vote down vote up
def collect_pairs_by_rel(filename, rel):
    """Collect phrase pairs from a PPDB dump that carry the given relation.

    Args:
        filename: path to a PPDB file whose fields are '|||'-separated;
            field 1 is the phrase, field 2 the paraphrase, and the last
            field the entailment relation label.
        rel: entailment relation label to keep.

    Returns:
        dict mapping each phrase to a set of paraphrases, populated
        symmetrically (both directions) via add_to_dict_of_set. Pairs whose
        two sides reduce to the same Snowball stem are skipped.
    """
    stemmer = SnowballStemmer("english")

    phrase2paraphrase = dict()

    # Stream the file line by line instead of materializing readlines(),
    # and split each record on '|||' once instead of three times.
    with open(filename, "r") as f:
        for line in f:
            fields = line.strip().split('|||')
            phrase = fields[1].strip()
            paraphrase = fields[2].strip()

            # Skip trivial pairs that share a stem (e.g. inflections).
            if stemmer.stem(phrase) == stemmer.stem(paraphrase):
                continue

            entailment = fields[-1].strip()
            if entailment == rel:
                add_to_dict_of_set(phrase, paraphrase, phrase2paraphrase)
                add_to_dict_of_set(paraphrase, phrase, phrase2paraphrase)

    print("Size: %d" % len(phrase2paraphrase))
    return phrase2paraphrase
Example 3
Project: PyTLDR   Author: jaijuneja   File: tokenizer.py    License: GNU General Public License v3.0 6 votes vote down vote up
def __init__(self, language='english', stopwords=None, stemming=True):
    """Configure the tokenizer's stemmer and stopword list.

    Args:
        language: Snowball stemmer language name; also selects the
            built-in stopword file (lower-cased) when no stopwords are given.
        stopwords: a list of stopword strings, a path to a .txt file,
            or None to load the bundled default list.
        stemming: when False, no stemmer is created (self._stemmer is None).

    Raises:
        IOError: if a stopwords path was supplied but cannot be read.
    """
    if stemming:
        self._stemmer = SnowballStemmer(language)
    else:
        self._stemmer = None

    if isinstance(stopwords, list):
        self._stopwords = stopwords
    elif isinstance(stopwords, str):
        # FIX: the original tested (str, unicode); `unicode` does not exist
        # on Python 3 and raised NameError whenever a path was passed.
        # stopwords argument is a path
        try:
            self._stopwords = self._load_stopwords(stopwords)
        except IOError:
            raise IOError('stopwords argument must be a path to a .txt file, a list of word strings '
                          'or None (which loads the default list)')
    else:
        # Load built-in stopwords shipped one directory above this module.
        stopwords_dir = 'stopwords/{0}.txt'.format(language.lower())
        application_root = os.path.dirname(__file__)
        stopwords_file = os.path.join(application_root, '..', stopwords_dir)
        self._stopwords = self._load_stopwords(stopwords_file)
Example 4
Project: summarize   Author: despawnerer   File: language.py    License: MIT License 5 votes vote down vote up
def __init__(self, language):
    """Bind the language name plus its NLTK stopword list and stemmer."""
    self.language = language
    self.stemmer = stem.SnowballStemmer(language)
    self.stopwords = corpus.stopwords.words(language)
Example 5
Project: Interactive-Semantic-Parsing   Author: LittleYUYU   File: ppdb.py    License: Apache License 2.0 5 votes vote down vote up
def clean_paraphrase(paraphrase_dict):
    """Drop paraphrases that share a Snowball stem with their source phrase.

    Args:
        paraphrase_dict: dict mapping a phrase to an iterable of paraphrases.

    Returns:
        New dict mapping each phrase to the set of paraphrases whose stem
        differs from the phrase's own stem; phrases left with no surviving
        paraphrases are omitted. Prints the sizes before and after cleaning.
    """
    stemmer = SnowballStemmer("english")
    paraphrase_dict_clean = dict()
    print("Size: %d" % len(paraphrase_dict))

    for phrase, paraphrases in paraphrase_dict.items():
        # Hoist the loop-invariant stem of the source phrase out of the
        # inner loop (it was recomputed once per paraphrase).
        phrase_stem = stemmer.stem(phrase)
        new_paraphrases = {p for p in paraphrases
                           if stemmer.stem(p) != phrase_stem}
        if new_paraphrases:
            paraphrase_dict_clean[phrase] = new_paraphrases

    print("Size: %d" % len(paraphrase_dict_clean))
    return paraphrase_dict_clean
Example 6
Project: pliers   Author: tyarkoni   File: test_text_filters.py    License: BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def test_word_stemming_filter():
    """Exercise WordStemmingFilter with every supported stemmer choice.

    Covers: default (Porter), named 'snowball', a pre-built StemmerI
    instance, WordNet lemmatization (case-insensitive and case-sensitive),
    rejection of unknown stemmer names, and a multi-word TextStim.
    """
    # Sample stim read from a text file; 'to' column supplies onsets.
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    # Snowball-english must agree with Porter on this sample (same target).
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    # Passing an already-constructed stemmer object instead of a name.
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    # Fetch the NLTK resources WordNet lemmatization needs, if missing.
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    # Lemmas are lower-cased by default ('These' -> 'these').
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    # Original casing survives for words the lemmatizer leaves alone.
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    # TextStim (single block of text) rather than ComplexTextStim.
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'