Python nltk.corpus.brown.sents() Examples

The following are 6 code examples of nltk.corpus.brown.sents(), each taken from an open-source project. The source file and license are listed above each example. You may also want to check out the other available functions and classes of the module nltk.corpus.brown.
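For context, brown.sents() returns a corpus view of the Brown corpus as a sequence of sentences, where each sentence is a list of word tokens. A minimal sketch, assuming the corpus data has been downloaded with nltk.download('brown'):

from nltk.corpus import brown

sents = brown.sents()
print(len(sents))  # number of sentences in the Brown corpus
print(sents[0])    # the first sentence as a list of tokens: ['The', 'Fulton', 'County', ...]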
Example #1
Source File: tnt.py    From razzy-spinner with GNU General Public License v3.0
def demo():
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j+100]
        for i in range(len(s)):
            print(s[i],'--', t[i])
        print() 
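The demo above tags held-out Brown sentences with tagdata(), which takes a list of token lists. As a hedged follow-up sketch (not part of the original file), a trained TnT tagger can also tag a single token list with tag(); words unseen during training fall back to the 'Unk' tag unless an unknown-word tagger is supplied to the constructor:

from nltk.tag import tnt
from nltk.corpus import brown

tagger = tnt.TnT()
tagger.train(brown.tagged_sents()[:1000])  # train on a small slice for speed
print(tagger.tag(['The', 'jury', 'said', 'it', 'did', 'its', 'work', '.']))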
Example #2
Source File: short_sentence_similarity.py    From Semantic-Texual-Similarity-Toolkits with MIT License
import math
from nltk.corpus import brown

# module-level state this function relies on: Brown word frequencies and total token count
brown_freqs = dict()
N = 0

def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                brown_freqs[word] = brown_freqs.get(word, 0) + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = brown_freqs.get(lookup_word, 0)
    return 1.0 - (math.log(n + 1) / math.log(N + 1)) 
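A hedged usage sketch, assuming the module-level brown_freqs and N used above: the returned information content is low for frequent words and equals 1.0 for words absent from the Brown corpus, since log(0 + 1) == 0.

print(info_content('the'))         # very frequent word -> low information content
print(info_content('qwertyzzz'))   # not in the corpus -> 1.0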
Example #3
Source File: tnt.py    From luscan-devel with GNU General Public License v2.0
def demo():
    from nltk.tag import tnt
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = tnt.TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j+100]
        for i in range(len(s)):
            print(s[i], '--', t[i])
        print()
Example #4
Source File: tnt.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def demo():
    from nltk.corpus import brown

    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j + 100]
        for i in range(len(s)):
            print(s[i], '--', t[i])
        print() 
Example #5
Source File: data_load.py    From neural_tokenizer with MIT License
import re
import numpy as np
# hp (hyperparameters providing minlen and maxlen) and load_vocab() are defined elsewhere in the same project.

def load_data(mode="train"):
    word2idx, idx2word = load_vocab()

    from nltk.corpus import brown
    sents = [" ".join(words) for words in brown.sents()]

    xs, ys = [], []
    for sent in sents:
        sent = re.sub(r"[^ A-Za-z']", "", sent)
        if hp.minlen <= len(sent) <= hp.maxlen:
            x, y = [], []
            for word in sent.split():
                for char in word:
                    x.append(word2idx[char])
                    y.append(0) # 0: no space
                y[-1] = 1 # space for end of a word
            y[-1] = 0 # no space for end of sentence

            # zero-pad both sequences to hp.maxlen (x and y have the same length here)
            xs.append(x + [0] * (hp.maxlen-len(x)))
            ys.append(y + [0] * (hp.maxlen-len(x)))

    # Convert to ndarrays
    X = np.array(xs, np.int32)
    Y = np.array(ys, np.int32)

    # mode
    if mode=="train":
        X, Y = X[: int(len(X) * .8)], Y[: int(len(Y) * .8)]
        # X, Y = X[: 128], Y[: 128]
    elif mode=="val":
        X, Y = X[int(len(X) * .8): -int(len(X) * .1)], Y[int(len(X) * .8): -int(len(X) * .1)]
    else:
        X, Y = X[-int(len(X) * .1):], Y[-int(len(X) * .1):]

    return X, Y 
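A hedged usage sketch, assuming the project's load_vocab() and hp (with minlen and maxlen) are available: each call returns two int32 arrays of shape (num_sentences, hp.maxlen), holding character indices and space/no-space labels, split roughly 80/10/10 across the train, val, and test modes.

X_train, Y_train = load_data(mode="train")
X_val, Y_val = load_data(mode="val")
print(X_train.shape, Y_train.shape)  # both (num_train_sentences, hp.maxlen)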
Example #6
Source File: do_benchmark.py    From PyRATA with Apache License 2.0
def test_clause():

  """                            
  """

  print ('Measuring time performance on # {} sentences over # {} iterations for recognizing Clause'.format(size, iteration_number))

  from nltk.corpus import brown
  brown_sents = brown.sents()[:size]
  import nltk
  global brown_pos_tag_sents
  brown_pos_tag_sents = [nltk.pos_tag(sentence) for sentence in brown_sents] 
  #print (brown_pos_tag_sents[0])


  # ----------------------------------------------------
  # nltk_parser 
  # ----------------------------------------------------
  analyzer_name='nltk_parser'
  

  times, averagetime, mintime = measure_time(nltk_parse_clause_in_the_whole_text, iteration_number)
  grammar = "clause"
  print ('{}\t{}\t{}\t{}'.format(analyzer_name, grammar, averagetime, mintime))


  # ----------------------------------------------------
  # pyrata 
  # ----------------------------------------------------
  analyzer_name='pyrata'

  global sentences_dict_list_list
  sentences_dict_list_list = []

  for s in brown_pos_tag_sents:
    sentences_dict_list_list.append([{'raw':w, 'pos':p} for (w, p) in s])
  # data -> sentences_dict_list_list
  #data = data[0]
  # flatten a list of list i.e. sentences of words becomes a text of words 
  # data = [val for sublist in data for val in sublist]
  #print (data[:10])
  #print ('len(data):', len(data))

  times, averagetime, mintime = measure_time(pyrata_recognize_clause_in_the_whole_text, iteration_number)
  grammar = "clause"
  print ('{}\t{}\t{}\t{}'.format(analyzer_name, grammar, averagetime, mintime))
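measure_time() is defined elsewhere in do_benchmark.py. As an illustrative sketch only (the real helper may differ), a function with the same signature and return values (times, average time, minimum time) could be written as:

import timeit

def measure_time(func, iteration_number):
    # call func iteration_number times, timing each run separately
    times = [timeit.timeit(func, number=1) for _ in range(iteration_number)]
    return times, sum(times) / len(times), min(times)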