Python nltk.corpus.stopwords.words() Examples

The following are 30 code examples showing how to use nltk.corpus.stopwords.words(). They are extracted from open source projects; the project, author, file, and license are listed above each example.


You may also want to check out all available functions and classes of the module nltk.corpus.stopwords.
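For quick reference, here is a minimal usage sketch (assuming the stopwords corpus has already been fetched with nltk.download('stopwords')):

import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')  # one-time download of the stopword corpus

print(stopwords.fileids())                      # languages available, e.g. 'english', 'arabic', ...
english = set(stopwords.words('english'))       # size depends on the NLTK data version
tokens = "this is a simple test".split()
print([w for w in tokens if w not in english])  # ['simple', 'test']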

Example 1
Project: SEDST   Author: AuCson   File: unsup_net.py   License: MIT License
def pz_selective_sampling(self, pz_proba):
        """
        Selective sampling of pz(do max-sampling but prevent repeated words)
        """
        pz_proba = pz_proba.data
        z_proba, z_token = torch.topk(pz_proba, pz_proba.size(0), dim=2)
        z_token = z_token.transpose(0, 1)  # [B,Tz,top_Tz]
        all_sampled_z = []
        for b in range(z_token.size(0)):
            sampled_z = []
            for t in range(z_token.size(1)):
                for i in range(z_token.size(2)):
                    if z_token[b][t][i] not in sampled_z:
                        sampled_z.append(z_token[b][t][i])
                        break
            all_sampled_z.append(sampled_z)
        return all_sampled_z 
Example 2
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            #print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; ")) 
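The same stopword-filtered collocation search can be reproduced outside the Text class. A standalone sketch, assuming the brown and stopwords corpora are downloaded:

from nltk.corpus import brown, stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

ignored_words = set(stopwords.words('english'))
finder = BigramCollocationFinder.from_words(brown.words(categories='news'))
finder.apply_freq_filter(2)                                              # drop rare pairs
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.likelihood_ratio, 10))                # top 10 collocations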
Example 3
Project: razzy-spinner   Author: rafasashi   File: texttiling.py   License: GNU General Public License v3.0
def __init__(self,
                 w=20,
                 k=10,
                 similarity_method=BLOCK_COMPARISON,
                 stopwords=None,
                 smoothing_method=DEFAULT_SMOOTHING,
                 smoothing_width=2,
                 smoothing_rounds=1,
                 cutoff_policy=HC,
                 demo_mode=False):


        if stopwords is None:
            from nltk.corpus import stopwords
            stopwords = stopwords.words('english')
        self.__dict__.update(locals())
        del self.__dict__['self'] 
Example 4
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
Example 5
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd) 
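As the two from_words docstrings note, a window_size larger than the n-gram order also counts non-contiguous pairs. A quick check of that behaviour (ngram_fd is the finder's frequency distribution of observed pairs):

from nltk.collocations import BigramCollocationFinder

tokens = "the quick brown fox jumps over the lazy dog".split()
adjacent = BigramCollocationFinder.from_words(tokens, window_size=2)
skipping = BigramCollocationFinder.from_words(tokens, window_size=3)

print(('quick', 'fox') in adjacent.ngram_fd)  # False: the words are not adjacent
print(('quick', 'fox') in skipping.ngram_fd)  # True: they co-occur within a 3-token window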
Example 6
Project: razzy-spinner   Author: rafasashi   File: util.py   License: GNU General Public License v3.0
def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if `handle_negation == True` apply `mark_negation`
        method to `document` before checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    for word in unigrams:
        features['contains({0})'.format(word)] = word in set(document)
    return features 
Example 7
Project: razzy-spinner   Author: rafasashi   File: util.py   License: GNU General Public License v3.0
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#//////////////////////////////////////////////////////////// 
Example 8
Project: Document-Classifier-LSTM   Author: AlexGidiotis   File: data_prep.py   License: MIT License
def preprocess(text):
	min_length = 3
	text = re.sub(r'\d+','#',text)
	text = re.sub(r'\.',' eos ',text)
	# Tokenize
	words = map(lambda word: word.lower(), word_tokenize(text))
	tokens = words
	# Remove non characters
	p = re.compile('[a-zA-Z#]+')
	# Filter tokens (keep alphabetic tokens of at least min_length that are not in english_stopwords)
	filtered_tokens = list(filter(lambda token: p.match(token) and len(token)>=min_length and (token not in english_stopwords), tokens))
	# Encode to ascii
	filtered_tokens = [token.encode('ascii','ignore') for token in filtered_tokens]

	return filtered_tokens


# Modify this path 
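Note that preprocess relies on module-level names (word_tokenize, english_stopwords) and a Python 2-style ASCII encode that are not shown in the excerpt. A self-contained Python 3 sketch of the same idea, with english_stopwords assumed to be NLTK's English list and the punkt tokenizer data available:

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

english_stopwords = set(stopwords.words('english'))  # assumed definition

def preprocess(text, min_length=3):
    text = re.sub(r'\d+', '#', text)      # collapse digit runs
    text = re.sub(r'\.', ' eos ', text)   # mark sentence boundaries
    tokens = (word.lower() for word in word_tokenize(text))
    pattern = re.compile('[a-zA-Z#]+')
    return [t for t in tokens
            if pattern.match(t) and len(t) >= min_length and t not in english_stopwords]

# preprocess("The 3 cats sat on the mat.")  ->  ['cats', 'sat', 'mat', 'eos']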
Example 9
Project: lisc   Author: lisc-tools   File: utils.py   License: Apache License 2.0
def convert_string(text):
    """Convert a str of text into tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """

    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]

    return words_cleaned 
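Because stopwords.words('english') is re-read on every iteration of the comprehension, this can be slow on long texts. A common speed-up (a sketch, not part of the lisc source) is to build the stopword set once:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

STOPWORDS = set(stopwords.words('english'))  # built once, reused for every call

def convert_string(text):
    """Lower-case, tokenize, and drop stopwords and punctuation."""
    return [word.lower() for word in word_tokenize(text)
            if word.isalnum() and word.lower() not in STOPWORDS]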
Example 10
Project: Projects   Author: iamshang1   File: preprocessing.py   License: MIT License
def _tokenize(self, row):
        '''
        clean texts by removing special and non-chars
        stems each word and removes stop words
        return list of tokenized words for each row
        '''
        chars = re.sub(r'-|&quot;|&amp;',' ',row) #replace dashes, quotes, and ampersands
        chars = self.regex.sub('',chars) # remove nonchars
        wordlist = str(chars).split()
        if self.stemming:
            wordlist = [self.stemmer.stem(word.lower()) for word in wordlist if word.lower() not in self.stop] # stem and remove stopwords
        else:
            wordlist = [word.lower() for word in wordlist if word.lower() not in self.stop]
        
        #create bigrams if enabled
        if self.bigrams:
            bigrams = []
            for i in range(len(wordlist)-1):
                bigrams.append(wordlist[i]+" "+wordlist[i+1])
            wordlist = wordlist + bigrams
            
        return wordlist 
Example 11
Project: comparable-text-miner   Author: motazsaad   File: textpro.py   License: Apache License 2.0
def lightStemAr(word_list):
	result = []
	arstemmer = ISRIStemmer()
	for word in word_list:
		word = arstemmer.norm(word, num=1)  # remove diacritics representing Arabic short vowels
		if not word in arstemmer.stop_words:   # exclude stop words from being processed
			word = arstemmer.pre32(word)        # remove length three and length two prefixes in this order
			word = arstemmer.suf32(word)        # remove length three and length two suffixes in this order
			word = arstemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
			word = arstemmer.norm(word, num=2)       # normalize initial hamza to bare alif
		result.append(word)
	return ' '.join(result)

###################################################################################

# combine rooting and light stemming: if the light stemming algorithm manages to reduce the word form, the light stem is returned; otherwise, the root is returned
Example 12
Project: SOQAL   Author: husseinmozannar   File: slidingwindow_distance.py   License: MIT License
def distance_based_helper(self, P, Q, A):
        res = []
        U = set(stopwords.words('arabic')) & set(P)
        SQ = list(set(P) & set(Q) - U)
        for i in range(0, len(A)):
            SA = list(((set(A[i]) & set(P)) - set(Q)) - U)
            d = len(P) + 1
            if(len(SQ) == 0 or len(SA) == 0):
                d = 1
            else:
                for q in SQ:
                    for a in SA:
                        d = min(d, self.dist(P, q, a))
            d *= 1 / (len(P) - 1)
            res.append(d)
        return res 
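stopwords.words() accepts any language shipped with the corpus, and stopwords.fileids() lists them; Arabic is included in recent NLTK data releases. A quick check, assuming the corpus is downloaded:

from nltk.corpus import stopwords

print('arabic' in stopwords.fileids())   # True in recent NLTK releases
arabic_stops = set(stopwords.words('arabic'))
print(len(arabic_stops))                 # exact size depends on the NLTK data version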
Example 13
Project: kaggle-HomeDepot   Author: ChenglongChen   File: text_processing.py   License: MIT License
def get_last_words_from_parsed_title(s):
    words=s.split()
    if len(words)==0:
        last_word=""
        word_before_last=""
        word2_before_last=""
    else:
        last_word=words[len(words)-1]
        word_before_last=""
        word2_before_last=""
        if len(words)>1:
            word_before_last=words[len(words)-2]
            if word_before_last=="and":
                word_before_last=""
            if len(words)>2 and word_before_last!="and":
                word2_before_last=words[len(words)-3]
                if word2_before_last=="and":
                    word2_before_last=""
    return last_word, word_before_last, word2_before_last 
Example 14
Project: steppy-toolkit   Author: minerva-ml   File: text.py   License: MIT License
def _use_stopwords(self, x):
        words = tokenizer.tokenize(x)
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
Example 15
Project: steppy-toolkit   Author: minerva-ml   File: text.py   License: MIT License
def _apostrophes(self, x):
        words = tokenizer.tokenize(x)
        words = [APOSTROPHES_WORDS[word] if word in APOSTROPHES_WORDS else word for word in words]
        words = [lem.lemmatize(word, "v") for word in words]
        words = [w for w in words if not w in eng_stopwords]
        x = " ".join(words)
        return x 
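Examples 14 and 15 reference module-level objects (tokenizer, eng_stopwords, lem, APOSTROPHES_WORDS) that the excerpts omit. A minimal sketch of plausible definitions; the tokenizer choice and the apostrophe map shown here are assumptions, not the steppy-toolkit originals:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()                     # assumed tokenizer choice
eng_stopwords = set(stopwords.words('english'))
lem = WordNetLemmatizer()
APOSTROPHES_WORDS = {"'s": "is", "'re": "are"}   # illustrative subset only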
Example 16
Project: SEDST   Author: AuCson   File: metric.py   License: MIT License
def _lemmatize(self, sent):
        words = [wn.lemmatize(_) for _ in sent.split()]
        return ' '.join(words) 
Example 17
Project: product-classifier   Author: georgetown-analytics   File: features.py   License: MIT License
def __init__(self, stoplist=None, punct=None, lemmatizer=None):
        # Load stopwords, punctuation, and lemmatizer
        # This takes a bit of work, so we only want to do it once!
        self.stopwords   = stoplist or stopwords.words('english')
        self.punctuation = punct or string.punctuation
        self.lemmatizer  = lemmatizer or WordNetLemmatizer() 
Example 18
Project: product-classifier   Author: georgetown-analytics   File: features.py   License: MIT License
def tokenize(self, text):
        """
        Returns a list of individual tokens from the text using NLTK's
        built-in tokenize utility (far better than splitting on whitespace). It also
        removes any stopwords and punctuation from the text and ensures
        that every token is normalized.

        For now, token = word as in bag of words (the feature we're using).
        """
        for token in wordpunct_tokenize(text):
            token = self.normalize(token)
            if token in self.punctuation: continue
            if token in self.stopwords: continue
            yield token 
Example 19
Project: product-classifier   Author: georgetown-analytics   File: features.py   License: MIT License
def normalize(self, word):
        """
        Ensures words are in the same class (lemma) as well as lowercase
        """
        word = word.lower()
        return self.lemmatizer.lemmatize(word) 
Example 20
Project: product-classifier   Author: georgetown-analytics   File: features.py   License: MIT License
def featurize(self, name, description=None, keywords=None):
        """
        Returns a dictionary of features to use with the Maximum Entropy
        classifier. In this case we're using a "bag of words" approach.
        """

        # Get the bag of words from the name
        tokens = set(self.tokenize(name))

        # Add the bag of words from the description (union)
        if description is not None:
            tokens = tokens | set(self.tokenize(description))

        # Get the bag of keywords
        keywords = set(self.tokenize(keywords)) if keywords else set([])

        # Create the features
        features = {}
        for token in tokens:
            features[token] = True
        for keyword in keywords:
            features["KEYWORD(%s)" % keyword] = True

        return features

##########################################################################
## Development testing
########################################################################## 
Example 21
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores 
Example 22
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:",
                             " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(c for w in words
                          for c in self._word_to_contexts[w]
                          if c in common)
            return fd 
Example 23
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def __init__(self, tokens, key=lambda x:x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset
           indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index) 
Example 24
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def print_concordance(self, word, width=75, lines=25):
        """
        Print a concordance for ``word`` with the specified context window.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=75)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4 # approx number of words of context

        offsets = self.offsets(word)
        if offsets:
            lines = min(lines, len(offsets))
            print("Displaying %s of %s matches:" % (lines, len(offsets)))
            for i in offsets:
                if lines <= 0:
                    break
                left = (' ' * half_width +
                        ' '.join(self._tokens[i-context:i]))
                right = ' '.join(self._tokens[i+1:i+context])
                left = left[-half_width:]
                right = right[:half_width]
                print(left, self._tokens[i], right)
                lines -= 1
        else:
            print("No matches") 
Example 25
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of common contexts to print (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            #print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    key=lambda s:s.lower())

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1+"_"+w2 for w1,w2 in ranked_contexts))

        except ValueError as e:
            print(e) 
Example 26
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot
        dispersion_plot(self, words) 
Example 27
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def __init__(self, source):
        if hasattr(source, 'words'): # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {} 
Example 28
Project: razzy-spinner   Author: rafasashi   File: text.py   License: GNU General Public License v3.0
def demo():
    from nltk.corpus import brown
    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    #print("Automatically generated text:")
    #text.generate()
    #print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news']) 
Example 29
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def __init__(self, word_fd, bigram_fd, window_size=2):
        """Construct a BigramCollocationFinder, given FreqDists for
        appearances of words and (possibly non-contiguous) bigrams.
        """
        AbstractCollocationFinder.__init__(self, word_fd, bigram_fd)
        self.window_size = window_size 
Example 30
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
        """Construct a TrigramCollocationFinder, given FreqDists for
        appearances of words, bigrams, two words with any word between them,
        and trigrams.
        """
        AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
        self.wildcard_fd = wildcard_fd
        self.bigram_fd = bigram_fd