Python sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS Examples

The following are 9 code examples of sklearn.feature_extraction.stop_words.ENGLISH_STOP_WORDS. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_extraction.stop_words, or try the search function.
Example #1
Source File: nmf_context_extractor.py    From yelp with GNU Lesser General Public License v2.1 7 votes vote down vote up
def build_document_term_matrix(self):
        """Build a TF-IDF document-term matrix from ``self.target_bows``.

        Side effects: sets ``self.tfidf_vectorizer`` (the fitted
        TfidfVectorizer), ``self.document_term_matrix`` (sparse docs x terms
        matrix) and ``self.terms`` (list of terms, index-aligned with the
        matrix columns).
        """
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words=ENGLISH_STOP_WORDS, lowercase=True,
            strip_accents="unicode",
            use_idf=True, norm="l2", min_df=Constants.MIN_DICTIONARY_WORD_COUNT,
            max_df=Constants.MAX_DICTIONARY_WORD_COUNT, ngram_range=(1, 1))
        self.document_term_matrix = \
            self.tfidf_vectorizer.fit_transform(self.target_bows)

        # Invert the term -> column-index mapping so that
        # self.terms[column] == term.
        vocabulary = self.tfidf_vectorizer.vocabulary_
        self.terms = [""] * len(vocabulary)
        for term, index in vocabulary.items():
            self.terms[index] = term

        # print() call form: the original used a Python 2 print statement,
        # which is a syntax error under Python 3; the parenthesized form
        # behaves identically on both.
        print("Created document-term matrix of size %d x %d" % (
            self.document_term_matrix.shape[0],
            self.document_term_matrix.shape[1]
        ))
Example #2
Source File: lexicon_helper.py    From linguistic-style-transfer with Apache License 2.0 5 votes vote down vote up
def get_stopwords():
    """Return the union of the spaCy, NLTK, and scikit-learn English
    stop-word collections as a single set."""
    combined = set()
    combined.update(spacy_stopwords)
    combined.update(stopwords.words('english'))
    combined.update(stop_words.ENGLISH_STOP_WORDS)
    return combined
Example #3
Source File: STFIWF.py    From 2016CCF-sougou with Apache License 2.0 5 votes vote down vote up
def _check_stop_list(stop):
    """Resolve a stop-word specification to a concrete container.

    "english" -> the built-in frozenset; any other string -> ValueError;
    None is passed through; any other value is frozen into a frozenset.
    """
    if stop == "english":
        return ENGLISH_STOP_WORDS
    if isinstance(stop, six.string_types):
        # A string that does not name a built-in list is a caller error.
        raise ValueError("not a built-in stop list: %s" % stop)
    if stop is None:
        return None
    # Anything else is assumed to be a collection of words.
    return frozenset(stop)
Example #4
Source File: sklearn_intent_classifer.py    From ai-chatbot-framework with MIT License 5 votes vote down vote up
def __init__(self):
        """Initialise the intent classifier shell.

        Sets up an untrained model slot, loads the spaCy English pipeline,
        and builds the stop-word and punctuation filter collections.
        """
        self.model = None

        self.spacynlp = spacy.load('en')

        # Set union instead of list `+`: spaCy's lang.en.stop_words
        # STOP_WORDS is a set, for which `STOP_WORDS + [...]` raises
        # TypeError.  The union form works whether STOP_WORDS is a set
        # or a list, and yields the same final set either way.
        self.stopwords = (set(STOP_WORDS) |
                          {"n't", "'s", "'m", "ca"} |
                          set(ENGLISH_STOP_WORDS))

        # Individual punctuation characters (equivalent to the original
        # join/split round-trip) plus multi-character tokens the
        # tokenizer can emit.
        self.punctuations = list(string.punctuation) + \
                            ["-----", "---", "...", "'ve"]
Example #5
Source File: STFIWF.py    From 2016_CCFsougou2 with MIT License 5 votes vote down vote up
def _check_stop_list(stop):
    """Normalize the ``stop`` argument into an actual stop-word container.

    Accepts the name "english", None, or an arbitrary collection of words;
    raises ValueError for any other string.
    """
    if stop == "english":
        return ENGLISH_STOP_WORDS

    is_other_string = isinstance(stop, six.string_types)
    if is_other_string:
        raise ValueError("not a built-in stop list: %s" % stop)

    if stop is None:
        return None

    # Otherwise treat the value as a collection of stop words.
    return frozenset(stop)
Example #6
Source File: kaggle18.py    From modin with Apache License 2.0 5 votes vote down vote up
def wordCount(text):
    """Count tokens longer than 3 characters that are not sklearn English
    stop words; returns 0 for any input that cannot be processed."""
    try:
        lowered = text.lower()
        # Replace punctuation, digits, and CR/TAB/LF characters with spaces.
        cleaner = re.compile("[" + re.escape(string.punctuation) + "0-9\\r\\t\\n]")
        cleaned = cleaner.sub(" ", lowered)
        count = 0
        for token in cleaned.split(" "):
            if len(token) > 3 and token not in stop_words.ENGLISH_STOP_WORDS:
                count += 1
        return count
    except Exception:
        # Deliberate best-effort: any failure (non-string input, missing
        # stop-word module, ...) counts as zero words.
        return 0
Example #7
Source File: open_ended_coders.py    From mpeds with MIT License 5 votes vote down vote up
def _loadSpecialWords(self):
        '''Load stop words, number prefixes, news agencies, and protest
        subject words onto the instance.'''
        # Approximate-quantity prefixes that precede crowd-size numbers.
        self.S_PREFIX = [
            'around', 'up to', 'as many as', 'some',
            'many', 'nearly', 'more than', 'about',
        ]

        # Protest-subject vocabulary; common misspellings are included
        # on purpose so they match source text.
        self.P_SUBJ = {
            'protest': [
                'protesters', 'protestors', 'demonstrators', 'activists',
                'strikers', 'marchers', 'signatures',
                'counter-demonstrators', 'counter-demonstraters',
                'counter-protesters', 'counter-protestors',
                'counterprotesters', 'counterprotestors',
            ],
        }

        # Wire-service source names as they appear in article metadata.
        self.AGW = [
            'Agence France-Presse, English Service',
            'Associated Press Worldstream, English Service',
        ]

        # scikit-learn's built-in English stop words, materialized as a list.
        self.SWS = list(stop_words.ENGLISH_STOP_WORDS)
Example #8
Source File: STFIWF.py    From 2016CCF_BDCI_Sougou with MIT License 5 votes vote down vote up
def _check_stop_list(stop):
    """Turn a stop-word specification into a usable stop list.

    Returns the built-in English frozenset for "english", None unchanged,
    or a frozenset of the given collection; raises ValueError for any
    other string value.
    """
    if stop == "english":
        return ENGLISH_STOP_WORDS
    if stop is None:
        return None
    if isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    # Fall through: assume a collection of words.
    return frozenset(stop)
Example #9
Source File: normalize_text.py    From altair with Apache License 2.0 4 votes vote down vote up
def normalize_text(raw_text, remove_stop_words=True, only_letters=True, return_list=False, remove_one_char_words=True, **kwargs):
    '''
    Convert raw text into a cleaned, normalized string (or word list).
    Method modified from code available at:
    https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
    Args:
        raw_text: Original text to clean and normalize
        remove_stop_words: Boolean value to trigger removal of stop words
        only_letters: Boolean value to trigger removal of characters that are not letters
        return_list: Boolean value to trigger return value as a list of words
        remove_one_char_words: Boolean value to trigger removal of words that are only a single character
    Returns:
        clean_text: Either a string or a list of words that has been filtered based on function parameters.
    '''
    # Remove web links
    clean_text = link_re.sub('', raw_text)

    # Remove HTML.  Suppress UserWarnings from BeautifulSoup due to text
    # with tech info (e.g. code fragments, directory structure).
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        clean_text = BeautifulSoup(clean_text, "lxml").get_text()

    # Keep only letters, or letters and digits, depending on the flag.
    if only_letters:
        clean_text = letter_re.sub(" ", clean_text)
    else:
        clean_text = letter_number_re.sub(" ", clean_text)

    # Convert to lower case, split into individual words.
    clean_text = clean_text.lower().split()

    # If digits were kept, drop candidate words made up entirely of digits.
    if not only_letters:
        clean_text = [w for w in clean_text if not all(i.isdigit() for i in w)]

    # Remove stop words (idiomatic `w not in ...`; was the non-idiomatic
    # `not w in ...`).
    if remove_stop_words:
        clean_text = [w for w in clean_text if w not in python_stop_words]
        clean_text = [w for w in clean_text if w not in ENGLISH_STOP_WORDS]

    # Remove words that are only a single character in length.
    if remove_one_char_words:
        clean_text = [w for w in clean_text if len(w) > 1]

    # Return as string or list based on parameters.
    if return_list:
        return clean_text
    return " ".join(clean_text)