Python nltk.corpus.brown.tagged_sents() Examples
The following are 16 code examples of nltk.corpus.brown.tagged_sents(), collected from open-source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the nltk.corpus.brown module.
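Before the examples, here is a minimal usage sketch of what brown.tagged_sents() returns. The category name and the printed output are illustrative only and depend on the corpus data installed on your machine.

# Minimal sketch (not taken from the examples below).
import nltk
from nltk.corpus import brown

# nltk.download('brown')             # uncomment on first use
# nltk.download('universal_tagset')  # needed for tagset='universal'

sents = brown.tagged_sents(categories='news')   # native Brown tagset
univ = brown.tagged_sents(tagset='universal')   # mapped to the universal tagset

print(sents[0][:3])  # e.g. [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')]
print(univ[0][:3])   # e.g. [('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN')]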
Example #1
Source File: tnt.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def demo():
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j+100]
        for i in range(len(s)):
            print(s[i], '--', t[i])
        print()
Example #2
Source File: hmm.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def load_pos(num_sents):
    from nltk.corpus import brown

    sentences = brown.tagged_sents(categories='news')[:num_sents]

    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
    symbols = set()

    cleaned_sentences = []
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)    # log this word
            # Clean up the tag.
            tag = tag_re.match(tag).group()
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store cleaned-up tagged token
        cleaned_sentences += [sentence]

    return cleaned_sentences, list(tag_set), list(symbols)
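In hmm.py this helper prepares data for NLTK's supervised HMM trainer. A minimal sketch of that follow-on step is shown below; it assumes load_pos() from above is in scope, and the trainer calls follow nltk.tag.hmm.HiddenMarkovModelTrainer.

# Sketch of how load_pos() output is typically consumed (assumption:
# load_pos and its `re` import are available in the current module).
from nltk.tag.hmm import HiddenMarkovModelTrainer

labelled_sequences, tag_set, symbols = load_pos(1000)
trainer = HiddenMarkovModelTrainer(tag_set, symbols)  # states, symbols
hmm_tagger = trainer.train_supervised(labelled_sequences[:900])

# Score on held-out sentences (.accuracy() in newer NLTK releases).
print(hmm_tagger.evaluate(labelled_sequences[900:]))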
Example #3
Source File: glue.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def get_pos_tagger(self):
    from nltk.corpus import brown
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

    return main_tagger
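Once built, the returned tagger is an ordinary NLTK tagger and can be applied to a tokenised sentence. A small, hypothetical usage sketch (not part of glue.py; glue_instance is a placeholder name):

# Hypothetical usage of the tagger returned by get_pos_tagger().
# Words covered by the trigram/bigram/unigram backoff chain keep their
# Brown tags; the outer RegexpTagger only rewrites the quantifier words.
tagger = glue_instance.get_pos_tagger()
print(tagger.tag(['Every', 'dog', 'barked', 'loudly']))
# e.g. [('Every', 'univ_quant'), ('dog', 'NN'), ('barked', 'VBD'), ('loudly', 'RB')]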
Example #4
Source File: tnt.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def demo():
    from nltk.tag import tnt
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = tnt.TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results (Python 2 print statements, as in the original project)
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j+100]
        for i in range(len(s)):
            print s[i], '--', t[i]
        print
Example #5
Source File: hmm.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def load_pos(num_sents):
    from nltk.corpus import brown

    sentences = brown.tagged_sents(categories='news')[:num_sents]

    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
    symbols = set()

    cleaned_sentences = []
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)    # log this word
            # Clean up the tag.
            tag = tag_re.match(tag).group()
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store cleaned-up tagged token
        cleaned_sentences += [sentence]

    return cleaned_sentences, list(tag_set), list(symbols)
Example #6
Source File: glue.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def get_pos_tagger(self):
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

    return main_tagger
Example #7
Source File: pos_tag_dict.py From normalise with GNU General Public License v3.0 | 6 votes |
def store_pos_tag_dicts():
    pos_tag_dict = defaultdict(tuple)
    tagged = treebank.tagged_sents()
    for sent in tagged:
        for tup in sent:
            if not tup[1] in pos_tag_dict[tup[0].lower()]:
                pos_tag_dict[tup[0].lower()] += (tup[1],)

    pos_tag_dict_univ = defaultdict(tuple)
    penn_tagged_univ = treebank.tagged_sents(tagset='universal')
    brown_tagged_univ = brown.tagged_sents(tagset='universal')
    for text in [penn_tagged_univ, brown_tagged_univ]:
        for sent in text:
            for tup in sent:
                if not tup[1] in pos_tag_dict_univ[tup[0].lower()]:
                    pos_tag_dict_univ[tup[0].lower()] += (tup[1],)

    for word in states.values():
        pos_tag_dict[word.lower()] += ('NNP',)
        pos_tag_dict_univ[word.lower()] += ('NOUN',)

    dicts = (pos_tag_dict, pos_tag_dict_univ)
    with open('{}/data/pos_dicts.pickle'.format(mod_path), 'wb') as file:
        pickle.dump(dicts, file, protocol=2)
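The pickled dictionaries written above can later be read back and queried. A minimal sketch, assuming the same mod_path layout used by the example; the lookup word is purely illustrative:

# Sketch: reading the pickle written by store_pos_tag_dicts().
import pickle

with open('{}/data/pos_dicts.pickle'.format(mod_path), 'rb') as f:
    pos_tag_dict, pos_tag_dict_univ = pickle.load(f)

print(pos_tag_dict.get('record', ()))       # Penn Treebank tags seen for 'record'
print(pos_tag_dict_univ.get('record', ()))  # universal tags, e.g. ('NOUN', 'VERB')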
Example #8
Source File: tnt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def demo():
    from nltk.corpus import brown
    sents = list(brown.tagged_sents())
    test = list(brown.sents())

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test[100:120])

    # print results
    for j in range(len(tagged_data)):
        s = tagged_data[j]
        t = sents[j + 100]
        for i in range(len(s)):
            print(s[i], '--', t[i])
        print()
Example #9
Source File: hmm.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def load_pos(num_sents):
    from nltk.corpus import brown

    sentences = brown.tagged_sents(categories='news')[:num_sents]

    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
    symbols = set()

    cleaned_sentences = []
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize
            symbols.add(word)    # log this word
            # Clean up the tag.
            tag = tag_re.match(tag).group()
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store cleaned-up tagged token
        cleaned_sentences += [sentence]

    return cleaned_sentences, list(tag_set), list(symbols)
Example #10
Source File: tnt.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)

    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    for i in range(10):
        tacc = t.evaluate(d[i*100:((i+1)*100)])
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = s.evaluate(d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn))
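In this demo, N is the beam size and C toggles capitalization sensitivity. TnT leaves unknown words untagged unless an unknown-word tagger is supplied; a small sketch of that option follows (not part of the demo above, and the DefaultTagger choice is just illustrative):

# Sketch: giving TnT an unknown-word backoff tagger.
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, tnt

train = treebank.tagged_sents()[:2000]
test = treebank.tagged_sents()[2000:2100]

# DefaultTagger needs no training, so Trained=True tells TnT not to train it.
tagger = tnt.TnT(unk=DefaultTagger('NN'), Trained=True)
tagger.train(train)
print(tagger.evaluate(test))  # .accuracy() in newer NLTK releases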
Example #11
Source File: tnt.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def demo2():
    from nltk import tag
    from nltk.tag import tnt
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = tnt.TnT(N=1000, C=False)
    s = tnt.TnT(N=1000, C=True)

    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    # Python 2 print statements, as in the original project
    for i in range(10):
        tacc = tag.accuracy(t, d[i*100:((i+1)*100)])
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print 'Capitalization off:'
        print 'Accuracy:', tacc
        print 'Percentage known:', tp_kn
        print 'Percentage unknown:', tp_un
        print 'Accuracy over known words:', (tacc / tp_kn)

        sacc = tag.accuracy(s, d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print 'Capitalization on:'
        print 'Accuracy:', sacc
        print 'Percentage known:', sp_kn
        print 'Percentage unknown:', sp_un
        print 'Accuracy over known words:', (sacc / sp_kn)
Example #12
Source File: crf.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    from nltk.corpus import brown
    import textwrap

    # Define a very simple feature detector
    def fd(sentence, index):
        word = sentence[index]
        return dict(word=word, suffix=word[-2:], len=len(word))

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus.  We simplify the tagset a little:
    # just the first 2 chars.
    def strip(corpus):
        return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]
    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])

    crf = MalletCRF.train(fd, brown_train, #'/tmp/crf-model',
                          transduction_type='VITERBI')
    sample_output = crf.tag([w for (w, t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print '\nAccuracy: %.1f%%' % (acc*100)
    print 'Sample output:'
    print textwrap.fill(' '.join('%s/%s' % w for w in sample_output),
                        initial_indent=' ', subsequent_indent=' ') + '\n'

    # Clean up
    print 'Clean-up: deleting', crf.filename
    os.remove(crf.filename)

    return crf
Example #13
Source File: tnt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def demo2():
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)

    t.train(d[(11) * 100 :])
    s.train(d[(11) * 100 :])

    for i in range(10):
        tacc = t.evaluate(d[i * 100 : ((i + 1) * 100)])
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy:', tacc)
        print('Percentage known:', tp_kn)
        print('Percentage unknown:', tp_un)
        print('Accuracy over known words:', (tacc / tp_kn))

        sacc = s.evaluate(d[i * 100 : ((i + 1) * 100)])
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy:', sacc)
        print('Percentage known:', sp_kn)
        print('Percentage unknown:', sp_un)
        print('Accuracy over known words:', (sacc / sp_kn))
Example #14
Source File: glue.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def get_pos_tagger(self):
    from nltk.corpus import brown

    regexp_tagger = RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),               # adjectives
            (r'.*ness$', 'NN'),               # nouns formed from adjectives
            (r'.*ly$', 'RB'),                 # adverbs
            (r'.*s$', 'NNS'),                 # plural nouns
            (r'.*ing$', 'VBG'),               # gerunds
            (r'.*ed$', 'VBD'),                # past tense verbs
            (r'.*', 'NN'),                    # nouns (default)
        ]
    )
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger,
    )

    return main_tagger
Example #15
Source File: tnt.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def demo3():
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d)*0.1)
    e10 = int(len(e)*0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i*d10):((i+1)*d10)]
        etest = e[(i*e10):((i+1)*e10)]

        dtrain = d[:(i*d10)] + d[((i+1)*d10):]
        etrain = e[:(i*e10)] + e[((i+1)*e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / tp_kn)
        tallacc += tacc
        sallacc += sacc

        #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc

    print("brown: acc over words known:", 10 * tknacc)
    print("     : overall accuracy:", 10 * tallacc)
    print("     : words known:", 10 * tknown)
    print("treebank: acc over words known:", 10 * sknacc)
    print("        : overall accuracy:", 10 * sallacc)
    print("        : words known:", 10 * sknown)
Example #16
Source File: tnt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def demo3():
    from nltk.corpus import treebank, brown

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10) : ((i + 1) * d10)]
        etest = e[(i * e10) : ((i + 1) * e10)]

        dtrain = d[: (i * d10)] + d[((i + 1) * d10) :]
        etrain = e[: (i * e10)] + e[((i + 1) * e10) :]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_un = t.unknown / (t.known + t.unknown)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_un = s.unknown / (s.known + s.unknown)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        sknacc += sacc / tp_kn
        tallacc += tacc
        sallacc += sacc

        # print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc

    print("brown: acc over words known:", 10 * tknacc)
    print("     : overall accuracy:", 10 * tallacc)
    print("     : words known:", 10 * tknown)
    print("treebank: acc over words known:", 10 * sknacc)
    print("        : overall accuracy:", 10 * sallacc)
    print("        : words known:", 10 * sknown)