Python sklearn.feature_extraction.text.ENGLISH_STOP_WORDS Examples

The following are 6 code examples that use sklearn.feature_extraction.text.ENGLISH_STOP_WORDS. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_extraction.text, or try the search function.
Example #1
Source File: test_TermDocMat.py    From scattertext with Apache License 2.0 4 votes vote down vote up
def test_get_stoplisted_unigram_corpus(self):
        """Check the term index of the stoplisted unigram corpus.

        The expected index is every term of the full test corpus that
        contains no space, contains no apostrophe, and is not a member of
        ENGLISH_STOP_WORDS.
        """
        full_corpus = make_a_test_term_doc_matrix()
        unigram_corpus = full_corpus.get_stoplisted_unigram_corpus()
        all_terms = full_corpus.get_term_freq_df().index
        kept_terms = unigram_corpus.get_term_freq_df().index

        # A term is dropped if it has a space, has an apostrophe, or is a
        # stop word; everything else is expected to survive.
        expected = {
            term
            for term in all_terms
            if not (' ' in term
                    or "'" in term
                    or term in ENGLISH_STOP_WORDS)
        }
        self.assertEqual(expected, set(kept_terms))
Example #2
Source File: test_TermDocMat.py    From scattertext with Apache License 2.0 4 votes vote down vote up
def test_allow_single_quotes_in_unigrams(self):
        """Check unigram stoplisting after allowing single quotes.

        First asserts that allow_single_quotes_in_unigrams() returns an
        object of the same type as the corpus it was called on.  Then asserts
        that the stoplisted unigram corpus contains every term that has no
        space and is not in ENGLISH_STOP_WORDS -- terms containing an
        apostrophe are not filtered out of the expectation here.
        """
        corpus = make_a_test_term_doc_matrix()
        returned = corpus.allow_single_quotes_in_unigrams()
        self.assertEqual(type(returned), type(corpus))

        # NOTE(review): the stoplisted corpus is taken from the original
        # object, not from the returned one -- presumably the call above
        # configures the corpus in place; confirm against the implementation.
        unigram_corpus = corpus.get_stoplisted_unigram_corpus()
        all_terms = corpus.get_term_freq_df().index
        kept_terms = unigram_corpus.get_term_freq_df().index

        expected = set()
        for term in all_terms:
            if ' ' in term:
                continue
            if term in ENGLISH_STOP_WORDS:
                continue
            expected.add(term)
        self.assertEqual(expected, set(kept_terms))
Example #3
Source File: test_TermDocMat.py    From scattertext with Apache License 2.0 4 votes vote down vote up
def _assert_stoplisted_minus_joe(self, tdm, uni_tdm):
        """Assert that ``uni_tdm`` holds the stoplisted unigrams of ``tdm``, minus 'joe'.

        The expected term set is every term in ``tdm``'s term-frequency index
        that has no space, is not 'joe' (compared case-insensitively),
        has no apostrophe, and is not in ENGLISH_STOP_WORDS.  It must equal
        the set of terms in ``uni_tdm``'s term-frequency index.
        """
        source_terms = tdm.get_term_freq_df().index
        unigram_terms = uni_tdm.get_term_freq_df().index

        def should_keep(term):
            # Checks run in the same order as the rejection reasons are
            # listed in the docstring; the first failing one short-circuits.
            return (' ' not in term
                    and term.lower() != 'joe'
                    and "'" not in term
                    and term not in ENGLISH_STOP_WORDS)

        expected = set(filter(should_keep, source_terms))
        self.assertEqual(expected, set(unigram_terms))
Example #4
Source File: test_text.py    From Mastering-Elasticsearch-7.0 with MIT License 4 votes vote down vote up
def test_countvectorizer_stop_words():
    """Exercise CountVectorizer.get_stop_words() for each kind of setting.

    Covers the 'english' name, two unrecognised string names, and an
    explicit list of words.
    """
    vectorizer = CountVectorizer()

    # The 'english' name is expected to yield ENGLISH_STOP_WORDS.
    vectorizer.set_params(stop_words='english')
    assert_equal(vectorizer.get_stop_words(), ENGLISH_STOP_WORDS)

    # Any other string is expected to make get_stop_words raise ValueError.
    for bad_name in ('_bad_str_stop_', '_bad_unicode_stop_'):
        vectorizer.set_params(stop_words=bad_name)
        assert_raises(ValueError, vectorizer.get_stop_words)

    # An explicit list is expected to come back as the equivalent set.
    custom_words = ['some', 'other', 'words']
    vectorizer.set_params(stop_words=custom_words)
    assert_equal(vectorizer.get_stop_words(), set(custom_words))
Example #5
Source File: field_types.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0 4 votes vote down vote up
def _build_stop_words(self) -> Set[str]:
        """Return the stop words to use for this field.

        When the field supplies additional stop words (a truthy value from
        ``self.field.get_vectorizer_stop_words()``), the result is a new set
        holding ENGLISH_STOP_WORDS plus those additions.  Otherwise the
        module-level ENGLISH_STOP_WORDS object itself is returned, not a copy.
        """
        extra_words = self.field.get_vectorizer_stop_words()
        if not extra_words:
            # Nothing to add: hand back the shared default collection as-is.
            return ENGLISH_STOP_WORDS

        # Copy first so the shared default is never modified.
        return set(ENGLISH_STOP_WORDS).union(extra_words)
Example #6
Source File: test_text.py    From twitter-stock-recommendation with MIT License 4 votes vote down vote up
def test_countvectorizer_stop_words():
    """Verify what get_stop_words() returns or raises for several settings.

    The settings tried are the 'english' name, two unrecognised string
    names, and an explicit word list.
    """
    count_vec = CountVectorizer()

    def configure(setting):
        # Apply a stop_words setting and return the bound lookup method so
        # the caller can either invoke it or hand it to assert_raises.
        count_vec.set_params(stop_words=setting)
        return count_vec.get_stop_words

    # 'english' should resolve to ENGLISH_STOP_WORDS.
    assert_equal(configure('english')(), ENGLISH_STOP_WORDS)

    # Other strings should be rejected with ValueError when looked up.
    assert_raises(ValueError, configure('_bad_str_stop_'))
    assert_raises(ValueError, configure('_bad_unicode_stop_'))

    # A list of words should be returned as a set of the same words.
    words = ['some', 'other', 'words']
    assert_equal(configure(words)(), set(words))