Python sklearn.feature_extraction.text.ENGLISH_STOP_WORDS Examples

The following are 6 code examples of sklearn.feature_extraction.text.ENGLISH_STOP_WORDS. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_extraction.text, or try the search function.

Example #1

Source File: test_TermDocMat.py From scattertext with Apache License 2.0

4 votes

def test_get_stoplisted_unigram_corpus(self):
        """Check the vocabulary of the stoplisted unigram corpus.

        The expected vocabulary is every term of the full matrix that
        contains no space, contains no single quote, and is not a member
        of ENGLISH_STOP_WORDS.
        """
        full_tdm = make_a_test_term_doc_matrix()
        stoplisted_tdm = full_tdm.get_stoplisted_unigram_corpus()
        full_freq_df = full_tdm.get_term_freq_df()
        stoplisted_freq_df = stoplisted_tdm.get_term_freq_df()

        # A term is dropped as soon as any one of the exclusion rules matches.
        expected_terms = {
            term
            for term in full_freq_df.index
            if not (' ' in term
                    or "'" in term
                    or term in ENGLISH_STOP_WORDS)
        }
        self.assertEqual(expected_terms, set(stoplisted_freq_df.index))

Example #2

Source File: test_TermDocMat.py From scattertext with Apache License 2.0

4 votes

def test_allow_single_quotes_in_unigrams(self):
        """Check the stoplisted corpus after allowing single quotes.

        First verifies that ``allow_single_quotes_in_unigrams()`` returns an
        object of the same type as the matrix it was called on. Then the
        stoplisted unigram corpus is expected to hold every term that has
        no space and is not in ENGLISH_STOP_WORDS (terms containing a
        single quote are not filtered out here).
        """
        full_tdm = make_a_test_term_doc_matrix()
        returned = full_tdm.allow_single_quotes_in_unigrams()
        self.assertEqual(type(returned), type(full_tdm))

        stoplisted_tdm = full_tdm.get_stoplisted_unigram_corpus()
        full_freq_df = full_tdm.get_term_freq_df()
        stoplisted_freq_df = stoplisted_tdm.get_term_freq_df()

        expected_terms = {
            term
            for term in full_freq_df.index
            if not (' ' in term or term in ENGLISH_STOP_WORDS)
        }
        self.assertEqual(expected_terms, set(stoplisted_freq_df.index))

Example #3

Source File: test_TermDocMat.py From scattertext with Apache License 2.0

4 votes

def _assert_stoplisted_minus_joe(self, tdm, uni_tdm):
        """Assert that ``uni_tdm`` holds the stoplisted unigrams of ``tdm`` minus 'joe'.

        The expected vocabulary is every term of ``tdm`` that contains no
        space, is not 'joe' (compared case-insensitively), contains no
        single quote, and is not in ENGLISH_STOP_WORDS.
        """
        full_freq_df = tdm.get_term_freq_df()
        unigram_freq_df = uni_tdm.get_term_freq_df()

        def _should_keep(term):
            # Each guard mirrors one exclusion rule, checked in order.
            if ' ' in term:
                return False
            if term.lower() == 'joe':
                return False
            if "'" in term:
                return False
            return term not in ENGLISH_STOP_WORDS

        expected_terms = {term for term in full_freq_df.index
                          if _should_keep(term)}
        self.assertEqual(expected_terms, set(unigram_freq_df.index))

Example #4

Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License

4 votes

def test_countvectorizer_stop_words():
    """Exercise the ``stop_words`` parameter of CountVectorizer.

    Covers three cases: the 'english' shortcut, unrecognised strings
    (which must raise ValueError from ``get_stop_words``), and an explicit
    list of words.
    """
    vectorizer = CountVectorizer()

    # 'english' is expected to resolve to ENGLISH_STOP_WORDS.
    vectorizer.set_params(stop_words='english')
    assert_equal(vectorizer.get_stop_words(), ENGLISH_STOP_WORDS)

    # Unknown string values are only rejected when the stop words are
    # requested, so the callable is handed to assert_raises uncalled.
    for bad_value in ('_bad_str_stop_', '_bad_unicode_stop_'):
        vectorizer.set_params(stop_words=bad_value)
        assert_raises(ValueError, vectorizer.get_stop_words)

    # An explicit list is expected back as a set of the same words.
    custom_words = ['some', 'other', 'words']
    vectorizer.set_params(stop_words=custom_words)
    assert_equal(vectorizer.get_stop_words(), set(custom_words))

Example #5

Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0

4 votes

def _build_stop_words(self) -> Set[str]:
        """Return the stop words to use for this field.

        Asks ``self.field`` for its vectorizer stop words. If that value is
        falsy, ENGLISH_STOP_WORDS itself is returned; otherwise a new set
        combining ENGLISH_STOP_WORDS with the field's words is returned.
        """
        extra_words = self.field.get_vectorizer_stop_words()
        if not extra_words:
            # Nothing to add: hand back the shared constant as-is.
            return ENGLISH_STOP_WORDS
        # Copy into a fresh set so the shared constant is never modified.
        return set(ENGLISH_STOP_WORDS).union(extra_words)

Example #6

Source File: test_text.py From twitter-stock-recommendation with MIT License

4 votes

def test_countvectorizer_stop_words():
    """Check how CountVectorizer interprets its ``stop_words`` parameter.

    The 'english' string must yield ENGLISH_STOP_WORDS, any other string
    tried here must make ``get_stop_words`` raise ValueError, and a list
    of words must come back as the equivalent set.
    """
    count_vec = CountVectorizer()

    count_vec.set_params(stop_words='english')
    assert_equal(count_vec.get_stop_words(), ENGLISH_STOP_WORDS)

    # Both invalid names go through the same set-then-expect-failure steps.
    invalid_names = ('_bad_str_stop_', '_bad_unicode_stop_')
    for invalid_name in invalid_names:
        count_vec.set_params(stop_words=invalid_name)
        assert_raises(ValueError, count_vec.get_stop_words)

    explicit_words = ['some', 'other', 'words']
    count_vec.set_params(stop_words=explicit_words)
    assert_equal(count_vec.get_stop_words(), set(explicit_words))