Python nltk.corpus.brown.words() Examples

The following are 30 code examples of nltk.corpus.brown.words(). You can go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module nltk.corpus.brown.
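For orientation, here is a minimal usage sketch of the function itself. It assumes NLTK is installed and that the Brown corpus has been fetched with nltk.download('brown'); the slice sizes are arbitrary.

from nltk.corpus import brown

words = brown.words()                         # lazy view over all Brown tokens (about 1.16 million)
print(len(words))
print(words[:10])                             # first ten tokens of the corpus
print(brown.categories()[:5])                 # Brown is organised into genre categories
news_words = brown.words(categories='news')   # tokens restricted to one category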
Example #1
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 (7 votes)
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            #print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; ")) 
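A short usage sketch for the method above, assuming the brown and stopwords corpora have been downloaded; the category and num are arbitrary choices:

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
text.collocations(num=10)   # prints the top 10 likelihood-ratio collocations joined by "; "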
Example #2
Source File: chunking.py    From Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition with MIT License (6 votes)
def splitter(content, num_of_words):
    words = content.split(' ')
    result = []

    current_count = 0
    current_words = []

    for word in words:
        current_words.append(word)
        current_count += 1

        if current_count == num_of_words:
            result.append(' '.join(current_words))
            current_words = []
            current_count = 0

    result.append(' '.join(current_words))
    return result 
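A possible way to exercise this splitter on Brown text (a sketch; the token slice and chunk size are illustrative, not taken from the cookbook):

from nltk.corpus import brown

if __name__ == '__main__':
    data = ' '.join(brown.words()[:5000])
    chunks = splitter(data, 40)          # 40-word chunks
    print('Number of text chunks =', len(chunks))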
Example #3
Source File: phrase-extraction.py    From PyRATA with Apache License 2.0 (6 votes)
def brown_data():
  """return the text_length first tokens of the brown corpus tagged in pyrata format"""
  tokens = brown.words()
  tokens = tokens[:text_length]

  pos_tags = nltk.pos_tag(tokens)

  return [{'raw':w, 'pos':p} for (w, p) in pos_tags]


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST 
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" 
Example #4
Source File: FullNP.py    From PyRATA with Apache License 2.0 (6 votes)
def brown_data():
  """return the text_length first tokens of the brown corpus tagged in pyrata format"""
  tokens = brown.words()
  tokens = tokens[:text_length]

  pos_tags = nltk.pos_tag(tokens)

  return [{'raw':w, 'pos':p} for (w, p) in pos_tags]


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST 
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


# """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" 
Example #5
Source File: text.py    From luscan-devel with GNU General Public License v2.0 (6 votes)
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print tokenwrap(colloc_strings, separator="; ") 
Example #6
Source File: chunking.py    From Python-Machine-Learning-Cookbook-Second-Edition with MIT License (6 votes)
def splitter(data, num_words):
    words = data.split(' ')
    output = []

    cur_count = 0
    cur_words = []
    for word in words:
        cur_words.append(word)
        cur_count += 1
        if cur_count == num_words:
            output.append(' '.join(cur_words))
            cur_words = []
            cur_count = 0

    output.append(' '.join(cur_words) )

    return output 
Example #7
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (6 votes)
def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd 
Example #8
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (6 votes)
def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""
        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index) 
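A usage sketch for this constructor, assuming the brown corpus is downloaded; the query word is arbitrary:

from nltk.corpus import brown
from nltk.text import ConcordanceIndex

ci = ConcordanceIndex(brown.words(), key=lambda s: s.lower())
offsets = ci.offsets('news')          # token positions of "news", "News", ...
print(len(offsets), offsets[:5])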
Example #9
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        return [w1 + " " + w2 for w1, w2 in self._collocations] 
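Unlike collocations(), which prints, this method returns its result. A brief sketch, assuming the brown and stopwords corpora are downloaded; note that newer NLTK releases return (w1, w2) tuples rather than the joined strings produced above:

from nltk.corpus import brown
from nltk.text import Text

pairs = Text(brown.words(categories='news')).collocation_list(num=10)
print(pairs)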
Example #10
Source File: do_benchmark.py    From PyRATA with Apache License 2.0 (5 votes)
def measure_pattern_time_v2(iteration_number, size, pattern):
  gw      = execnet.makegateway("popen//python=python2.7")
  channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
from pattern.search import search
def measure_pattern_search():
  global pattern_search_result    #Make measure_me able to modify the value
  pattern_search_result = search("%s", text_tree)
  #print ("clip.pattern len(result)="+str(len(pattern_search_result)))
from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
#print ('pattern_search_time')
def pattern_search_timeit():
  runtimes = [pattern_search_time.timeit(number=1) for i in range (0, %s)]
  average = sum(runtimes)/len(runtimes)
#  return ''.join(['timit: #runs=', str(%s), ' ; average=', str(average),' ; min=', str(min(runtimes))])
  return [runtimes, average, min(runtimes), len(pattern_search_result)]
channel.send(pattern_search_timeit())
  """ % (size, pattern, iteration_number, iteration_number))
  channel.send([])
  return channel.receive() 
Example #11
Source File: do_benchmark.py    From PyRATA with Apache License 2.0 (5 votes)
def write_pattern_v2(iteration_number, size, pattern):
  gw      = execnet.makegateway("popen//python=python2.7")
  channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
def backslash(string):
  for ch in [' ','?', '+', '*', '.', '[', ']', '~' , '{', '}', '|', '"', "'", ',', ':', '<', '>']:
    if ch in string:
      string=string.replace(ch,'_')
  return string  
from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
analyzer_name = 'pattern'  # assumed label for the output filename; not defined in the original snippet
filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
  print>>thefile, item
channel.send([filename, size, len(pattern_search_result)])
  """ % (size, pattern, iteration_number, iteration_number))
  channel.send([])
  return channel.receive() 
Example #12
Source File: print_english_words.py    From adversarial-squad with MIT License (5 votes)
def main():
  freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
  vocab = [x[0] for x in freq_dist.most_common()[:OPTS.size]]
  for w in vocab:
    print w 
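The script above is Python 2 and depends on module-level PUNCTUATION and OPTS. A Python 3 sketch of the same idea, with stand-in values for those names and an arbitrary vocabulary size:

import string
from nltk import FreqDist
from nltk.corpus import brown

PUNCTUATION = set(string.punctuation)        # stand-in for the script's constant
freq_dist = FreqDist(w.lower() for w in brown.words() if w not in PUNCTUATION)
for w, _ in freq_dist.most_common(1000):     # stand-in for OPTS.size
    print(w)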
Example #13
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores 
Example #14
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def find_concordance(self, word, width=80):
        """
        Find all concordance lines given the query word.
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4  # approx number of words of context

        # Find the instances of the word to create the ConcordanceLine
        concordance_list = []
        offsets = self.offsets(word)
        if offsets:
            for i in offsets:
                query_word = self._tokens[i]
                # Find the context of query word.
                left_context = self._tokens[max(0, i - context) : i]
                right_context = self._tokens[i + 1 : i + context]
                # Create the pretty lines with the query_word in the middle.
                left_print = " ".join(left_context)[-half_width:]
                right_print = " ".join(right_context)[:half_width]
                # The WYSIWYG line of the concordance.
                line_print = " ".join([left_print, query_word, right_print])
                # Create the ConcordanceLine
                concordance_line = ConcordanceLine(
                    left_context,
                    query_word,
                    right_context,
                    i,
                    left_print,
                    right_print,
                    line_print,
                )
                concordance_list.append(concordance_line)
        return concordance_list 
Example #15
Source File: spellcheck.py    From normalise with GNU General Public License v3.0 (5 votes)
def words(text):
    return re.findall('[a-z]+', text.lower()) 
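This helper simply lower-cases the text and extracts alphabetic runs. For example, assuming the function above (and its re import) is in scope, and using the opening words of the Brown corpus:

print(words('The Fulton County Grand Jury said an investigation'))
# ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'an', 'investigation']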
Example #16
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if "_word_context_index" not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e) 
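A usage sketch, assuming the brown corpus is downloaded; the word pair is arbitrary:

from nltk.corpus import brown
from nltk.text import Text

text = Text(brown.words(categories='news'))
text.common_contexts(['monday', 'friday'])   # prints shared contexts, or "No common contexts were found"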
Example #17
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot

        dispersion_plot(self, words) 
Example #18
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def __init__(self, source):
        if hasattr(source, "words"):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {} 
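A usage sketch of the corpus-reader branch above, assuming the brown corpus is downloaded; the term is arbitrary, and the idf pass over all 500 Brown files is slow:

from nltk.corpus import brown
from nltk.text import TextCollection

collection = TextCollection(brown)               # builds one token list per Brown fileid
doc = brown.words(brown.fileids()[0])
print(collection.tf_idf('government', doc))      # tf-idf of the term in the first file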
Example #19
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories="news"))
    print(text)
    print()
    print("Concordance:")
    text.concordance("news")
    print()
    print("Distributionally similar words:")
    text.similar("news")
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(["news", "report", "said", "announced"])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()["news"]) 
Example #20
Source File: testingNLP.py    From python-urbanPlanning with MIT License (5 votes)
def splitter(data,num_words):
    words=data.split(' ')
    output=[]
    cur_count=0
    cur_words=[]
    for word in words:
        cur_words.append(word)
        cur_count+=1
        if cur_count==num_words:
            output.append(' '.join(cur_words))
            cur_words=[]
            cur_count=0
    output.append(' '.join(cur_words))
    return output 
Example #22
Source File: 3_corpus.py    From ml_code with Apache License 2.0 (5 votes)
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual) 
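A usage sketch, assuming nltk is imported and both the brown and words corpora are downloaded; the category is arbitrary:

import nltk

unusual = unusual_words(nltk.corpus.brown.words(categories='news'))
print(len(unusual))
print(unusual[:10])    # largely proper names, inflected forms and rare words absent from the wordlist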
Example #23
Source File: 3_corpus.py    From ml_code with Apache License 2.0 (5 votes)
def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text) 
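For instance, applied to the full Brown corpus (assumes the brown and stopwords corpora are downloaded):

import nltk

print(content_fraction(nltk.corpus.brown.words()))   # fraction of tokens that are not English stopwords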
Example #24
Source File: text.py    From luscan-devel with GNU General Public License v2.0 (5 votes)
def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:",
                             " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(c for w in words
                          for c in self._word_to_contexts[w]
                          if c in common)
            return fd 
Example #25
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:",
                             " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(c for w in words
                          for c in self._word_to_contexts[w]
                          if c in common)
            return fd 
Example #26
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def __init__(self, tokens, key=lambda x:x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset
           indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index) 
Example #27
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def print_concordance(self, word, width=75, lines=25):
        """
        Print a concordance for ``word`` with the specified context window.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=75)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4 # approx number of words of context

        offsets = self.offsets(word)
        if offsets:
            lines = min(lines, len(offsets))
            print("Displaying %s of %s matches:" % (lines, len(offsets)))
            for i in offsets:
                if lines <= 0:
                    break
                left = (' ' * half_width +
                        ' '.join(self._tokens[i-context:i]))
                right = ' '.join(self._tokens[i+1:i+context])
                left = left[-half_width:]
                right = right[:half_width]
                print(left, self._tokens[i], right)
                lines -= 1
        else:
            print("No matches") 
Example #28
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            #print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    key=lambda s:s.lower())

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1+"_"+w2 for w1,w2 in ranked_contexts))

        except ValueError as e:
            print(e) 
Example #29
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot
        dispersion_plot(self, words) 
Example #30
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def __init__(self, source):
        if hasattr(source, 'words'): # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}