Python nltk.corpus.brown.words() Examples
The following are 30 code examples of nltk.corpus.brown.words(), extracted from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module nltk.corpus.brown, or try the search function.
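For context, brown.words() returns a lazy, list-like sequence of tokens from the Brown corpus. A minimal sketch, assuming the corpus data has been fetched with nltk.download('brown'):

from nltk.corpus import brown

words = brown.words()
print(len(words))   # 1161192 tokens in the full corpus
print(words[:5])    # ['The', 'Fulton', 'County', 'Grand', 'Jury']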

Example #1
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 6 votes |
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and
            self._window_size == window_size):
        self._num = num
        self._window_size = window_size

        #print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
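A minimal usage sketch, reaching this method through the public nltk.text.Text API (assumes the 'brown' and 'stopwords' NLTK data packages are installed):

from nltk.corpus import brown
from nltk.text import Text

# Build a Text over the Brown news category and print its top collocations.
text = Text(brown.words(categories='news'))
text.collocations(num=10)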
Example #2
Source Project: Python-Machine-Learning-Cookbook-Second-Edition Author: PacktPublishing File: chunking.py License: MIT License | 6 votes |
def splitter(data, num_words):
    words = data.split(' ')
    output = []

    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0

    output.append(' '.join(cur_words))

    return output
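A quick sketch of calling the splitter helper defined above on Brown corpus text:

from nltk.corpus import brown

# Rebuild a plain string from the first 40 Brown tokens, then cut it
# into chunks of 10 space-separated words each.
data = ' '.join(brown.words()[:40])
for chunk in splitter(data, 10):
    print(chunk)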
Example #3
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 6 votes |
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and
            self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print "Building collocations list"
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print tokenwrap(colloc_strings, separator="; ")
Example #4
Source Project: Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition Author: PacktPublishing File: chunking.py License: MIT License | 6 votes |
def splitter(content, num_of_words):
    words = content.split(' ')
    result = []

    current_count = 0
    current_words = []
    for word in words:
        current_words.append(word)
        current_count += 1
        if current_count == num_of_words:
            result.append(' '.join(current_words))
            current_words = []
            current_count = 0

    result.append(' '.join(current_words))

    return result
Example #5
Source Project: PyRATA Author: nicolashernandez File: phrase-extraction.py License: Apache License 2.0 | 6 votes |
def brown_data():
    """return the text_length first tokens of the brown corpus tagged in pyrata format"""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
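A standalone sketch of the same idea: text_length is a module-level setting in the original script, so a value of 20 is assumed here. Requires the 'brown' and 'averaged_perceptron_tagger' NLTK data packages.

import nltk
from nltk.corpus import brown

text_length = 20  # assumed stand-in for the original module-level constant
tokens = brown.words()[:text_length]
pos_tags = nltk.pos_tag(tokens)
data = [{'raw': w, 'pos': p} for (w, p) in pos_tags]
print(data[0])  # e.g. {'raw': 'The', 'pos': 'DT'}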
Example #6
Source Project: PyRATA Author: nicolashernandez File: FullNP.py License: Apache License 2.0 | 6 votes |
def brown_data():
    """return the text_length first tokens of the brown corpus tagged in pyrata format"""
    tokens = brown.words()
    tokens = tokens[:text_length]
    pos_tags = nltk.pos_tag(tokens)
    return [{'raw': w, 'pos': p} for (w, p) in pos_tags]

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Example #7
Source Project: V1EngineeringInc-Docs Author: V1EngineeringInc File: text.py License: Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]

    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)

    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(
            c for w in words for c in self._word_to_contexts[w] if c in common
        )
        return fd
Example #8
Source Project: V1EngineeringInc-Docs Author: V1EngineeringInc File: text.py License: Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
    was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
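A brief sketch of building and querying such an index over Brown tokens (ConcordanceIndex is importable from nltk.text):

from nltk.corpus import brown
from nltk.text import ConcordanceIndex

# Case-insensitive index over the Brown news tokens.
index = ConcordanceIndex(brown.words(categories='news'), key=lambda s: s.lower())
print(index.offsets('news')[:5])  # first few token positions where 'news' occurs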
Example #9
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def word_similarity_dict(self, word):
    """
    Return a dictionary mapping from words to 'similarity scores,'
    indicating how often these two words occur in the same
    context.
    """
    word = self._key(word)
    word_contexts = set(self._word_to_contexts[word])

    scores = {}
    for w, w_contexts in self._word_to_contexts.items():
        scores[w] = f_measure(word_contexts, set(w_contexts))

    return scores
Example #10
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]

    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)

    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(c for w in words
                      for c in self._word_to_contexts[w]
                      if c in common)
        return fd
Example #11
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
    was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
Example #12
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def print_concordance(self, word, width=75, lines=25):
    """
    Print a concordance for ``word`` with the specified context window.

    :param word: The target word
    :type word: str
    :param width: The width of each line, in characters (default=75)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4  # approx number of words of context

    offsets = self.offsets(word)
    if offsets:
        lines = min(lines, len(offsets))
        print("Displaying %s of %s matches:" % (lines, len(offsets)))
        for i in offsets:
            if lines <= 0:
                break
            left = ' ' * half_width + ' '.join(self._tokens[i - context:i])
            right = ' '.join(self._tokens[i + 1:i + context])
            left = left[-half_width:]
            right = right[:half_width]
            print(left, self._tokens[i], right)
            lines -= 1
    else:
        print("No matches")
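In practice this method is reached through the public Text API, e.g.:

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
text.concordance('news', width=75, lines=5)  # prints up to 5 matches with context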
Example #13
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param words: The words used to seed the similarity search
    :type words: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        #print('Building word-context index...')
        self._word_context_index = ContextIndex(self.tokens, key=lambda s: s.lower())

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print("No common contexts were found")
        else:
            ranked_contexts = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))
    except ValueError as e:
        print(e)
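A usage sketch via the public API; the output depends on the corpus, and the method may instead report that no common contexts were found:

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
text.common_contexts(['monday', 'friday'], num=10)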
Example #14
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def dispersion_plot(self, words):
    """
    Produce a plot showing the distribution of the words through the text.
    Requires pylab to be installed.

    :param words: The words to be plotted
    :type words: list(str)
    :seealso: nltk.draw.dispersion_plot()
    """
    from nltk.draw import dispersion_plot
    dispersion_plot(self, words)
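A usage sketch (requires matplotlib to be installed):

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
# Opens a plot window marking where each word occurs through the text.
text.dispersion_plot(['news', 'report', 'said', 'announced'])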
Example #15
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def __init__(self, source):
    if hasattr(source, 'words'):  # bridge to the text corpus reader
        source = [source.words(f) for f in source.fileids()]

    self._texts = source
    Text.__init__(self, LazyConcatenation(source))
    self._idf_cache = {}
Example #16
Source Project: razzy-spinner Author: rafasashi File: text.py License: GNU General Public License v3.0 | 5 votes |
def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    #print("Automatically generated text:")
    #text.generate()
    #print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])
Example #17
Source Project: combine-FEVER-NSMN Author: easonnie File: nltk_utils.py License: MIT License | 5 votes |
def get_nltk_freq_words():
    """Use Brown corpus frequent words

    More corpora: https://www.nltk.org/book/ch02.html
    """
    freq_dict = nltk.FreqDist(brown.words())
    for fileid in gutenberg.fileids():
        freq_dict.update(nltk.FreqDist(gutenberg.words(fileid)))
    freq_words = [k for k, v in freq_dict.items() if v > 10]
    return freq_words, freq_dict
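A trimmed-down sketch of the same frequency-filtering idea, using the Brown corpus alone:

import nltk
from nltk.corpus import brown

freq_dict = nltk.FreqDist(w.lower() for w in brown.words())
freq_words = [w for w, count in freq_dict.items() if count > 10]
print(len(freq_words))  # number of word types occurring more than 10 times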
Example #18
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def word_similarity_dict(self, word):
    """
    Return a dictionary mapping from words to 'similarity scores,'
    indicating how often these two words occur in the same
    context.
    """
    word = self._key(word)
    word_contexts = set(self._word_to_contexts[word])

    scores = {}
    for w, w_contexts in self._word_to_contexts.items():
        scores[w] = f_measure(word_contexts, set(w_contexts))

    return scores
Example #19
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    words = [self._key(w) for w in words]
    contexts = [set(self._word_to_contexts[w]) for w in words]

    empty = [words[i] for i in range(len(words)) if not contexts[i]]
    common = reduce(set.intersection, contexts)

    if empty and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(words))
    elif not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    else:
        fd = FreqDist(c for w in words
                      for c in self._word_to_contexts[w]
                      if c in common)
        return fd
Example #20
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    self._tokens = tokens
    """The document (list of tokens) that this concordance index
    was created from."""

    self._key = key
    """Function mapping each token to an index key (or None)."""

    self._offsets = defaultdict(list)
    """Dictionary mapping words (or keys) to lists of offset indices."""

    # Initialize the index (self._offsets)
    for index, word in enumerate(tokens):
        word = self._key(word)
        self._offsets[word].append(index)
Example #21
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def print_concordance(self, word, width=75, lines=25):
    """
    Print a concordance for ``word`` with the specified context window.

    :param word: The target word
    :type word: str
    :param width: The width of each line, in characters (default=75)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    half_width = (width - len(word) - 2) / 2
    context = width / 4  # approx number of words of context

    offsets = self.offsets(word)
    if offsets:
        lines = min(lines, len(offsets))
        print "Displaying %s of %s matches:" % (lines, len(offsets))
        for i in offsets:
            if lines <= 0:
                break
            left = ' ' * half_width + ' '.join(self._tokens[i-context:i])
            right = ' '.join(self._tokens[i+1:i+context])
            left = left[-half_width:]
            right = right[:half_width]
            print left, self._tokens[i], right
            lines -= 1
    else:
        print "No matches"
Example #22
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param words: The words used to seed the similarity search
    :type words: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = ContextIndex(self.tokens, key=lambda s: s.lower())

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print "No common contexts were found"
        else:
            ranked_contexts = fd.keys()[:num]
            print tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts)
    except ValueError, e:
        print e
Example #23
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def dispersion_plot(self, words):
    """
    Produce a plot showing the distribution of the words through the text.
    Requires pylab to be installed.

    :param words: The words to be plotted
    :type words: list(str)
    :seealso: nltk.draw.dispersion_plot()
    """
    from nltk.draw import dispersion_plot
    dispersion_plot(self, words)
Example #24
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def __init__(self, source, name=None):
    if hasattr(source, 'words'):  # bridge to the text corpus reader
        source = [source.words(f) for f in source.fileids()]

    self._texts = source
    Text.__init__(self, LazyConcatenation(source))
    self._idf_cache = {}
Example #25
Source Project: luscan-devel Author: blackye File: text.py License: GNU General Public License v2.0 | 5 votes |
def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print text
    print
    print "Concordance:"
    text.concordance('news')
    print
    print "Distributionally similar words:"
    text.similar('news')
    print
    print "Collocations:"
    text.collocations()
    print
    print "Automatically generated text:"
    text.generate()
    print
    print "Dispersion plot:"
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print
    print "Vocabulary plot:"
    text.plot(50)
    print
    print "Indexing:"
    print "text[3]:", text[3]
    print "text[3:5]:", text[3:5]
    print "text.vocab()['news']:", text.vocab()['news']
Example #26
Source Project: normalise Author: EFord36 File: spellcheck.py License: GNU General Public License v3.0 | 5 votes |
def words(text):
    return re.findall('[a-z]+', text.lower())
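For example, applied to the opening of the Brown corpus (assuming re is imported, as in the original module):

# Using the words() helper defined above:
print(words('The Fulton County Grand Jury said Friday'))
# ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday']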
Example #27
Source Project: normalise Author: EFord36 File: spellcheck.py License: GNU General Public License v3.0 | 5 votes |
def known(words):
    return set(w for w in words if w in NWORDS)
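NWORDS is the spellchecker's word table, defined elsewhere in the original module. A self-contained sketch with an assumed stand-in:

from nltk.corpus import brown

# Assumed stand-in for NWORDS: the set of lowercased Brown corpus tokens.
NWORDS = set(w.lower() for w in brown.words())

def known(words):
    return set(w for w in words if w in NWORDS)

print(known(['jury', 'xyzzy']))  # {'jury'} -- only in-vocabulary words survive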
Example #28
Source Project: PyRATA Author: nicolashernandez File: do_benchmark.py License: Apache License 2.0 | 5 votes |
def measure_pattern_time_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)

from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.

from pattern.search import search

def measure_pattern_search():
    global pattern_search_result  # Make measure_me able to modify the value
    pattern_search_result = search("%s", text_tree)
    #print ("clip.pattern len(result)="+str(len(pattern_search_result)))

from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
#print ('pattern_search_time')

def pattern_search_timeit():
    runtimes = [pattern_search_time.timeit(number=1) for i in range(0, %s)]
    average = sum(runtimes) / len(runtimes)
    #return ''.join(['timit: #runs=', str(%s), ' ; average=', str(average), ' ; min=', str(min(runtimes))])
    return [runtimes, average, min(runtimes), len(pattern_search_result)]

channel.send(pattern_search_timeit())
""" % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
Example #29
Source Project: PyRATA Author: nicolashernandez File: do_benchmark.py License: Apache License 2.0 | 5 votes |
def write_pattern_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)

from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,      # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.

def backslash(string):
    for ch in [' ', '?', '+', '*', '.', '[', ']', '~', '{', '}', '|', '"', "'", ',', ':', '<', '>']:
        if ch in string:
            string = string.replace(ch, '_')
    return string

from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)

filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
    print>>thefile, item

channel.send([filename, size, len(pattern_search_result)])
""" % (size, pattern))
    channel.send([])
    return channel.receive()
Example #30
Source Project: adversarial-squad Author: robinjia File: print_english_words.py License: MIT License | 5 votes |
def main():
    freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
    vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
    for w in vocab:
        print w
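The same idea in Python 3 syntax, with assumed stand-ins for the script's PUNCTUATION and OPTS globals:

import string
from nltk import FreqDist
from nltk.corpus import brown

PUNCTUATION = set(string.punctuation)  # assumed; defined elsewhere in the original
size = 1000                            # assumed stand-in for OPTS.size

freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
vocab = [x[0] for x in freq_dist.most_common()[:size]]
print(vocab[:10])  # the ten most frequent non-punctuation word types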