Python nltk.corpus.brown.words() Examples

The following are 30 code examples showing how to use nltk.corpus.brown.words(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk.corpus.brown, or try the search function.

Example 1
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 6 votes vote down vote up
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    cache_is_valid = ('_collocations' in self.__dict__
                      and self._num == num
                      and self._window_size == window_size)
    if not cache_is_valid:
        self._num = num
        self._window_size = window_size

        # Rebuild the collocation list: drop stopwords and very short tokens,
        # require at least two occurrences, then rank by likelihood ratio.
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    pair_strings = ["%s %s" % pair for pair in self._collocations]
    print(tokenwrap(pair_strings, separator="; "))
Example 2
Project: Python-Machine-Learning-Cookbook-Second-Edition   Author: PacktPublishing   File: chunking.py    License: MIT License 6 votes vote down vote up
def splitter(data, num_words):
    """Split *data* into chunks of at most *num_words* space-separated words.

    :param data: input text, treated as words separated by single spaces
    :param num_words: maximum number of words per output chunk
    :return: list of chunk strings; the last chunk may be shorter

    Fix: the original always appended the leftover buffer at the end, which
    produced a spurious empty string when the word count was an exact
    multiple of *num_words*.
    """
    words = data.split(' ')
    output = []

    for start in range(0, len(words), num_words):
        output.append(' '.join(words[start:start + num_words]))

    return output
Example 3
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 6 votes vote down vote up
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    Converted from Python 2: bare ``print`` statements became ``print()``
    calls (the originals are a SyntaxError under Python 3); behavior is
    otherwise unchanged.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size

        print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
Example 4
Project: Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition   Author: PacktPublishing   File: chunking.py    License: MIT License 6 votes vote down vote up
def splitter(content, num_of_words):
    """Split *content* into chunks of at most *num_of_words* words.

    :param content: input text, treated as words separated by single spaces
    :param num_of_words: maximum number of words per output chunk
    :return: list of chunk strings; the last chunk may be shorter

    Fixes: the original mixed tabs and spaces (a TabError under Python 3)
    and always appended the leftover buffer, producing a spurious empty
    chunk when the word count was an exact multiple of *num_of_words*.
    """
    words = content.split(' ')
    result = []

    for start in range(0, len(words), num_of_words):
        result.append(' '.join(words[start:start + num_of_words]))

    return result
Example 5
Project: PyRATA   Author: nicolashernandez   File: phrase-extraction.py    License: Apache License 2.0 6 votes vote down vote up
def brown_data():
    """Return the ``text_length`` first tokens of the Brown corpus, POS-tagged,
    in pyrata format: a list of ``{'raw': word, 'pos': tag}`` dicts."""
    sample = brown.words()[:text_length]
    tagged = nltk.pos_tag(sample)
    return [{'raw': token, 'pos': tag} for token, tag in tagged]


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST 
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" 
Example 6
Project: PyRATA   Author: nicolashernandez   File: FullNP.py    License: Apache License 2.0 6 votes vote down vote up
def brown_data():
    """Build pyrata-format input from the Brown corpus.

    Takes the first ``text_length`` Brown tokens, tags them with
    ``nltk.pos_tag``, and returns ``[{'raw': w, 'pos': p}, ...]``.
    """
    prefix = brown.words()[:text_length]
    return [{'raw': w, 'pos': p} for w, p in nltk.pos_tag(prefix)]


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST 
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" 
Example 7
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: text.py    License: Creative Commons Attribution Share Alike 4.0 International 6 votes vote down vote up
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    keys = [self._key(w) for w in words]
    context_sets = [set(self._word_to_contexts[k]) for k in keys]
    missing = [k for k, ctx in zip(keys, context_sets) if not ctx]
    shared = reduce(set.intersection, context_sets)
    if missing and fail_on_unknown:
        raise ValueError("The following word(s) were not found:", " ".join(keys))
    if not shared:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    return FreqDist(
        c for k in keys for c in self._word_to_contexts[k] if c in shared
    )
Example 8
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: text.py    License: Creative Commons Attribution Share Alike 4.0 International 6 votes vote down vote up
def __init__(self, tokens, key=lambda x: x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    # The source document; kept so callers can recover context windows.
    self._tokens = tokens
    # Normalization function applied to every token before indexing.
    self._key = key
    # Maps each normalized token to the list of positions where it occurs.
    self._offsets = defaultdict(list)
    for position, token in enumerate(tokens):
        self._offsets[self._key(token)].append(position)
Example 9
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def word_similarity_dict(self, word):
    """
    Return a dictionary mapping from words to 'similarity scores,'
    indicating how often these two words occur in the same
    context.
    """
    target_contexts = set(self._word_to_contexts[self._key(word)])
    return {
        other: f_measure(target_contexts, set(contexts))
        for other, contexts in self._word_to_contexts.items()
    }
Example 10
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    normalized = [self._key(w) for w in words]
    per_word = [set(self._word_to_contexts[w]) for w in normalized]
    unknown = [w for w, ctx in zip(normalized, per_word) if not ctx]
    common = reduce(set.intersection, per_word)
    if unknown and fail_on_unknown:
        raise ValueError("The following word(s) were not found:",
                         " ".join(normalized))
    if not common:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    shared_contexts = (c for w in normalized
                       for c in self._word_to_contexts[w]
                       if c in common)
    return FreqDist(shared_contexts)
Example 11
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, tokens, key=lambda x:x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    # Source document, retained for context lookup.
    self._tokens = tokens
    # Per-token normalization function.
    self._key = key
    # Normalized token -> list of offsets where it appears.
    self._offsets = defaultdict(list)

    # Populate the index in a single pass over the document.
    for idx, tok in enumerate(tokens):
        self._offsets[self._key(tok)].append(idx)
Example 12
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def print_concordance(self, word, width=75, lines=25):
    """
    Print a concordance for ``word`` with the specified context window.

    :param word: The target word
    :type word: str
    :param width: The width of each line, in characters (default=80)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4  # approx number of words of context

    offsets = self.offsets(word)
    if not offsets:
        print("No matches")
        return

    remaining = min(lines, len(offsets))
    print("Displaying %s of %s matches:" % (remaining, len(offsets)))
    for i in offsets:
        if remaining <= 0:
            break
        left_text = ' ' * half_width + ' '.join(self._tokens[i - context:i])
        right_text = ' '.join(self._tokens[i + 1:i + context])
        print(left_text[-half_width:], self._tokens[i], right_text[:half_width])
        remaining -= 1
Example 13
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        # Lazily build the word-context index on first use.
        self._word_context_index = ContextIndex(
            self.tokens, key=lambda s: s.lower()
        )

    try:
        fd = self._word_context_index.common_contexts(words, True)
    except ValueError as e:
        print(e)
        return

    if not fd:
        print("No common contexts were found")
    else:
        top = [context for context, _ in fd.most_common(num)]
        print(tokenwrap(first + "_" + second for first, second in top))
Example 14
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def dispersion_plot(self, words):
    """
    Produce a plot showing the distribution of the words through the text.
    Requires pylab to be installed.

    :param words: The words to be plotted
    :type words: list(str)
    :seealso: nltk.draw.dispersion_plot()
    """
    # Import locally so plotting support is only required when used.
    from nltk.draw import dispersion_plot as draw_dispersion_plot
    draw_dispersion_plot(self, words)
Example 15
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, source):
    """Create a text collection from *source*.

    :param source: either a list of texts, or a corpus reader exposing
        ``words`` (each file id then becomes one text in the collection).
    """
    if hasattr(source, 'words'):  # bridge to the text corpus reader
        source = [source.words(fileid) for fileid in source.fileids()]

    self._texts = source
    Text.__init__(self, LazyConcatenation(source))
    # Cache of inverse-document-frequency scores, keyed by term.
    self._idf_cache = {}
Example 16
Project: razzy-spinner   Author: rafasashi   File: text.py    License: GNU General Public License v3.0 5 votes vote down vote up
def demo():
    """Demonstrate the Text class on the Brown news corpus."""
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print(text)

    def heading(title):
        # Blank line then a section title, matching the original output.
        print()
        print(title)

    heading("Concordance:")
    text.concordance('news')
    heading("Distributionally similar words:")
    text.similar('news')
    heading("Collocations:")
    text.collocations()
    # Text generation intentionally left disabled, as in the original.
    heading("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    heading("Vocabulary plot:")
    text.plot(50)
    heading("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])
Example 17
Project: combine-FEVER-NSMN   Author: easonnie   File: nltk_utils.py    License: MIT License 5 votes vote down vote up
def get_nltk_freq_words():
    """Use Brown corpus frequent words
    More corpora: https://www.nltk.org/book/ch02.html
    """
    # Start from Brown counts, then fold in every Gutenberg file.
    counts = nltk.FreqDist(brown.words())
    for fileid in gutenberg.fileids():
        counts.update(nltk.FreqDist(gutenberg.words(fileid)))

    # Keep only words seen more than 10 times across both corpora.
    frequent = [word for word, count in counts.items() if count > 10]
    return frequent, counts
Example 18
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def word_similarity_dict(self, word):
    """
    Return a dictionary mapping from words to 'similarity scores,'
    indicating how often these two words occur in the same
    context.
    """
    key = self._key(word)
    base_contexts = set(self._word_to_contexts[key])
    return {w: f_measure(base_contexts, set(ctx))
            for w, ctx in self._word_to_contexts.items()}
Example 19
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def common_contexts(self, words, fail_on_unknown=False):
    """
    Find contexts where the specified words can all appear; and
    return a frequency distribution mapping each context to the
    number of times that context was used.

    :param words: The words used to seed the similarity search
    :type words: str
    :param fail_on_unknown: If true, then raise a value error if
        any of the given words do not occur at all in the index.
    """
    keys = [self._key(w) for w in words]
    all_contexts = [set(self._word_to_contexts[k]) for k in keys]
    not_found = [k for k, ctx in zip(keys, all_contexts) if not ctx]
    intersection = reduce(set.intersection, all_contexts)
    if not_found and fail_on_unknown:
        raise ValueError("The following word(s) were not found:",
                         " ".join(keys))
    if not intersection:
        # nothing in common -- just return an empty freqdist.
        return FreqDist()
    return FreqDist(c for k in keys
                    for c in self._word_to_contexts[k]
                    if c in intersection)
Example 20
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, tokens, key=lambda x:x):
    """
    Construct a new concordance index.

    :param tokens: The document (list of tokens) that this
        concordance index was created from.  This list can be used
        to access the context of a given word occurrence.
    :param key: A function that maps each token to a normalized
        version that will be used as a key in the index.  E.g., if
        you use ``key=lambda s:s.lower()``, then the index will be
        case-insensitive.
    """
    # Document kept for later context extraction.
    self._tokens = tokens
    # Token-normalization function used for index keys.
    self._key = key
    # Normalized word -> list of offset indices.
    self._offsets = defaultdict(list)

    # Build the offset index in one pass.
    for offset, raw in enumerate(tokens):
        self._offsets[self._key(raw)].append(offset)
Example 21
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def print_concordance(self, word, width=75, lines=25):
    """
    Print a concordance for ``word`` with the specified context window.

    Converted from Python 2: ``print`` statements became ``print()``
    calls, and ``/`` became ``//`` so the width calculations stay
    integers (true division would produce floats and break the slice
    bounds on Python 3).

    :param word: The target word
    :type word: str
    :param width: The width of each line, in characters (default=80)
    :type width: int
    :param lines: The number of lines to display (default=25)
    :type lines: int
    """
    half_width = (width - len(word) - 2) // 2
    context = width // 4  # approx number of words of context

    offsets = self.offsets(word)
    if offsets:
        lines = min(lines, len(offsets))
        print("Displaying %s of %s matches:" % (lines, len(offsets)))
        for i in offsets:
            if lines <= 0:
                break
            left = (' ' * half_width +
                    ' '.join(self._tokens[i - context:i]))
            right = ' '.join(self._tokens[i + 1:i + context])
            left = left[-half_width:]
            right = right[:half_width]
            print(left, self._tokens[i], right)
            lines -= 1
    else:
        print("No matches")
Example 22
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    Converted from Python 2: ``print`` statements became ``print()``,
    ``except ValueError, e`` became ``except ValueError as e``, and the
    unsliceable-on-Python-3 ``fd.keys()[:num]`` was replaced by
    ``fd.most_common(num)``, which preserves the most-frequent-first
    ordering the old FreqDist ``keys()`` provided.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        print('Building word-context index...')
        self._word_context_index = ContextIndex(self.tokens,
                                                key=lambda s: s.lower())

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print("No common contexts were found")
        else:
            ranked_contexts = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

    except ValueError as e:
        print(e)
Example 23
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def dispersion_plot(self, words):
    """
    Produce a plot showing the distribution of the words through the text.
    Requires pylab to be installed.

    :param words: The words to be plotted
    :type words: list(str)
    :seealso: nltk.draw.dispersion_plot()
    """
    # Deferred import: plotting dependencies are only needed here.
    from nltk.draw import dispersion_plot as _dispersion_plot
    _dispersion_plot(self, words)
Example 24
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, source, name=None):
    """Create a text collection from *source*.

    :param source: a list of texts, or a corpus reader exposing ``words``
        (each file id then becomes one text in the collection).
    :param name: accepted for interface compatibility; not used here.
    """
    if hasattr(source, 'words'):  # bridge to the text corpus reader
        source = [source.words(fileid) for fileid in source.fileids()]

    self._texts = source
    Text.__init__(self, LazyConcatenation(source))
    # Cache of inverse-document-frequency scores, keyed by term.
    self._idf_cache = {}
Example 25
Project: luscan-devel   Author: blackye   File: text.py    License: GNU General Public License v2.0 5 votes vote down vote up
def demo():
    """Demonstrate the Text class on the Brown news corpus.

    Converted from Python 2: bare ``print`` statements became ``print()``
    calls; the sequence of demonstrated operations is unchanged.
    """
    from nltk.corpus import brown
    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    print("Automatically generated text:")
    # NOTE(review): Text.generate() was removed/changed in newer NLTK
    # releases — confirm against the installed version.
    text.generate()
    print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])
Example 26
Project: normalise   Author: EFord36   File: spellcheck.py    License: GNU General Public License v3.0 5 votes vote down vote up
def words(text):
    """Return every run of alphabetic characters in *text*, lowercased."""
    lowered = text.lower()
    return re.findall(r'[a-z]+', lowered)
Example 27
Project: normalise   Author: EFord36   File: spellcheck.py    License: GNU General Public License v3.0 5 votes vote down vote up
def known(words):
    """Return the subset of *words* present in the NWORDS dictionary."""
    return {w for w in words if w in NWORDS}
Example 28
Project: PyRATA   Author: nicolashernandez   File: do_benchmark.py    License: Apache License 2.0 5 votes vote down vote up
def measure_pattern_time_v2(iteration_number, size, pattern):
  """Benchmark a ``pattern.en`` search over a Brown-corpus prefix.

  Runs the timing loop inside a separate Python 2.7 interpreter via
  execnet (the ``pattern`` library is Python 2 only).  The remote script
  below is a template filled by %-formatting with, in order:
  ``size`` (corpus prefix length), ``pattern`` (the search pattern),
  and ``iteration_number`` twice (loop bound and a commented-out report
  line).  Returns whatever the remote script sends back, i.e.
  ``[runtimes, average, min(runtimes), result_count]``.
  """
  # Spawn the Python 2.7 worker process.
  gw      = execnet.makegateway("popen//python=python2.7")
  channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
from pattern.search import search
def measure_pattern_search():
  global pattern_search_result    #Make measure_me able to modify the value
  pattern_search_result = search("%s", text_tree)
  #print ("clip.pattern len(result)="+str(len(pattern_search_result)))
from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
#print ('pattern_search_time')
def pattern_search_timeit():
  runtimes = [pattern_search_time.timeit(number=1) for i in range (0, %s)]
  average = sum(runtimes)/len(runtimes)
#  return ''.join(['timit: #runs=', str(%s), ' ; average=', str(average),' ; min=', str(min(runtimes))])
  return [runtimes, average, min(runtimes), len(pattern_search_result)]
channel.send(pattern_search_timeit())
  """ % (size, pattern, iteration_number, iteration_number))
  # NOTE(review): the remote script never receives; this extra send looks
  # like a handshake/no-op — confirm it is actually needed.
  channel.send([])
  return channel.receive()
Example 29
Project: PyRATA   Author: nicolashernandez   File: do_benchmark.py    License: Apache License 2.0 5 votes vote down vote up
def write_pattern_v2(iteration_number, size, pattern):
  """Run a ``pattern.en`` search remotely and dump the matches to /tmp.

  The search runs inside a Python 2.7 subprocess via execnet (the
  ``pattern`` library is Python 2 only).  Returns the remote script's
  reply: ``[filename, size, match_count]``.  ``iteration_number`` is
  unused but kept for interface compatibility.

  Fixes two defects in the original:
  * the template contains only two ``%s`` placeholders (size, pattern)
    but was formatted with four arguments, which raises ``TypeError:
    not all arguments converted`` before anything runs;
  * the remote script called an undefined ``measure_pattern_search()``,
    which would raise NameError remotely; the stray call is removed.

  NOTE(review): ``analyzer_name`` is still undefined inside the remote
  script — presumably injected by an earlier version; confirm before use.
  """
  gw      = execnet.makegateway("popen//python=python2.7")
  channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
def backslash(string):
  for ch in [' ','?', '+', '*', '.', '[', ']', '~' , '{', '}', '|', '"', "'", ',', ':', '<', '>']:
    if ch in string:
      string=string.replace(ch,'_')
  return string  
from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
  print>>thefile, item
channel.send([filename, size, len(pattern_search_result)])
  """ % (size, pattern))
  channel.send([])
  return channel.receive()
Example 30
Project: adversarial-squad   Author: robinjia   File: print_english_words.py    License: MIT License 5 votes vote down vote up
def main():
  """Print the ``OPTS.size`` most frequent lowercased Brown-corpus words,
  skipping tokens listed in ``PUNCTUATION``.

  Converted from Python 2: the bare ``print w`` statement (a SyntaxError
  under Python 3) became a ``print(w)`` call.
  """
  freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
  vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
  for w in vocab:
    print(w)