Python nltk.corpus.brown.tagged_sents() Examples

The following are 16 code examples showing how to use nltk.corpus.brown.tagged_sents(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk.corpus.brown, or try the search function.

Example 1
Project: razzy-spinner   Author: rafasashi   File: tnt.py    License: GNU General Public License v3.0 6 votes vote down vote up
def demo():
    """Train a TnT tagger on Brown sentences and print its output next to gold.

    The tagger is trained on tagged sentences 200-999 of the Brown corpus and
    then asked to tag the untagged sentences 100-119.  For every token of
    every tagged sentence, the tagger's output is printed next to the
    corresponding gold-standard token, with a blank line after each sentence.
    """
    from nltk.corpus import brown

    # Only tagged sentences 0-999 and untagged sentences 100-119 are ever
    # used, so slice the corpus views before turning them into lists instead
    # of loading the complete corpus into memory twice.
    sents = list(brown.tagged_sents()[:1000])
    test = list(brown.sents()[100:120])

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test)

    # print results: tagger output on the left, gold token on the right
    for j, predicted_sent in enumerate(tagged_data):
        # test sentence j is corpus sentence j + 100
        gold_sent = sents[j + 100]
        for i, predicted in enumerate(predicted_sent):
            print(predicted, '--', gold_sent[i])
        print()
Example 2
Project: razzy-spinner   Author: rafasashi   File: hmm.py    License: GNU General Public License v3.0 6 votes vote down vote up
def load_pos(num_sents):
    """Load Brown 'news' sentences with lowercased words and simplified tags.

    :param num_sents: upper bound of the slice taken from the tagged
        sentences of the 'news' category.
    :return: a 3-tuple ``(sentences, tags, words)`` holding the cleaned
        sentences (lists of ``(lowercased word, simplified tag)`` pairs), a
        list of the distinct simplified tags and a list of the distinct
        lowercased words.
    """
    from nltk.corpus import brown

    corpus_slice = brown.tagged_sents(categories='news')[:num_sents]

    # A simplified tag is a lone '*', a '--', or the leading run of
    # characters up to the first '+', '*' or '-'.
    tag_pattern = re.compile(r'[*]|--|[^+*-]+')
    seen_tags = set()
    seen_words = set()
    result = []

    for sent in corpus_slice:
        normalised = []
        for raw_word, raw_tag in sent:
            token = raw_word.lower()
            seen_words.add(token)
            base_tag = tag_pattern.match(raw_tag).group()
            seen_tags.add(base_tag)
            normalised.append((token, base_tag))
        # Rewrite the sentence in place so the same list object is returned.
        sent[:] = normalised
        result.append(sent)

    return result, list(seen_tags), list(seen_words)
Example 3
Project: razzy-spinner   Author: rafasashi   File: glue.py    License: GNU General Public License v3.0 6 votes vote down vote up
def get_pos_tagger(self):
        """Build the part-of-speech tagger as a chain of backoff taggers.

        The chain, from first consulted to last, is: a regexp tagger that
        overrides the quantifier words, a trigram, a bigram and a unigram
        tagger (all trained on the Brown 'news' sentences), and finally a
        regexp tagger based on simple suffix rules.

        :return: the outermost ``RegexpTagger`` of the chain.
        """
        from nltk.corpus import brown
        regexp_tagger = RegexpTagger(
            # The decimal point is escaped so that only real numbers such as
            # '3.14' count as cardinals; an unescaped '.' would also accept
            # strings like '1a2'.
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

        return main_tagger
Example 4
Project: luscan-devel   Author: blackye   File: tnt.py    License: GNU General Public License v2.0 6 votes vote down vote up
def demo():
    """Train a TnT tagger on Brown sentences and print its output next to gold.

    The tagger is trained on tagged sentences 200-999 of the Brown corpus and
    then asked to tag the untagged sentences 100-119.  For every token of
    every tagged sentence, the tagger's output is printed next to the
    corresponding gold-standard token, with a blank line after each sentence.

    Output is produced with single-argument ``print(...)`` calls so the
    function runs, and prints the same text, on both Python 2 and Python 3.
    """
    from nltk.tag import tnt
    from nltk.corpus import brown

    # Only tagged sentences 0-999 and untagged sentences 100-119 are ever
    # used, so slice the corpus views before turning them into lists instead
    # of loading the complete corpus into memory twice.
    sents = list(brown.tagged_sents()[:1000])
    test = list(brown.sents()[100:120])

    # create and train the tagger
    tagger = tnt.TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test)

    # print results: tagger output on the left, gold token on the right
    for j, predicted_sent in enumerate(tagged_data):
        # test sentence j is corpus sentence j + 100
        gold_sent = sents[j + 100]
        for i, predicted in enumerate(predicted_sent):
            print('%s -- %s' % (predicted, gold_sent[i]))
        # A bare ``print`` is a no-op expression on Python 3; print an empty
        # string to get the blank separator line on both versions.
        print('')
Example 5
Project: luscan-devel   Author: blackye   File: hmm.py    License: GNU General Public License v2.0 6 votes vote down vote up
def load_pos(num_sents):
    """Return cleaned Brown 'news' sentences plus their tag and word sets.

    :param num_sents: upper bound of the slice taken from the tagged
        sentences of the 'news' category.
    :return: ``(cleaned_sentences, tags, words)`` where every sentence is a
        list of ``(lowercased word, simplified tag)`` pairs and ``tags`` /
        ``words`` are lists of the distinct values encountered.
    """
    from nltk.corpus import brown

    tagged = brown.tagged_sents(categories='news')[:num_sents]

    # Matches a lone '*', a '--', or the leading run of characters that
    # precedes the first '+', '*' or '-' of a tag.
    simplify = re.compile(r'[*]|--|[^+*-]+')
    tags_found = set()
    words_found = set()
    cleaned = []

    for tokens in tagged:
        for position, (word, tag) in enumerate(tokens):
            lowered = word.lower()
            words_found.add(lowered)
            short_tag = simplify.match(tag).group()
            tags_found.add(short_tag)
            # Overwrite the token in place with its cleaned-up form.
            tokens[position] = (lowered, short_tag)
        cleaned.append(tokens)

    return cleaned, list(tags_found), list(words_found)
Example 6
Project: luscan-devel   Author: blackye   File: glue.py    License: GNU General Public License v2.0 6 votes vote down vote up
def get_pos_tagger(self):
        """Build the part-of-speech tagger as a chain of backoff taggers.

        The chain, from first consulted to last, is: a regexp tagger that
        overrides the quantifier words, a trigram, a bigram and a unigram
        tagger (all trained on the Brown 'news' sentences), and finally a
        regexp tagger based on simple suffix rules.

        :return: the outermost ``RegexpTagger`` of the chain.
        """
        regexp_tagger = RegexpTagger(
            # The decimal point is escaped so that only real numbers such as
            # '3.14' count as cardinals; an unescaped '.' would also accept
            # strings like '1a2'.
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

        return main_tagger
Example 7
Project: normalise   Author: EFord36   File: pos_tag_dict.py    License: GNU General Public License v3.0 6 votes vote down vote up
def store_pos_tag_dicts():
    """Build two word -> POS-tag lookup tables and pickle them to disk.

    Each table maps a lowercased word to a tuple of the distinct tags it was
    seen with, in first-seen order:

    * the first is built from ``treebank.tagged_sents()`` with its default
      tagset;
    * the second is built from the treebank and Brown tagged sentences
      requested with ``tagset='universal'``.

    Afterwards every value of ``states`` gets ``'NNP'`` appended in the first
    table and ``'NOUN'`` in the second (unconditionally, without a duplicate
    check).  Both tables are written as one tuple to
    ``<mod_path>/data/pos_dicts.pickle`` using pickle protocol 2.
    """
    pos_tag_dict = defaultdict(tuple)
    for sent in treebank.tagged_sents():
        for token in sent:
            key, tag = token[0].lower(), token[1]
            if tag not in pos_tag_dict[key]:
                pos_tag_dict[key] += (tag,)

    pos_tag_dict_univ = defaultdict(tuple)
    universal_corpora = [
        treebank.tagged_sents(tagset='universal'),
        brown.tagged_sents(tagset='universal'),
    ]
    for corpus in universal_corpora:
        for sent in corpus:
            for token in sent:
                key, tag = token[0].lower(), token[1]
                if tag not in pos_tag_dict_univ[key]:
                    pos_tag_dict_univ[key] += (tag,)

    # Make sure every state name is listed as a proper noun / noun.
    for name in states.values():
        lowered = name.lower()
        pos_tag_dict[lowered] += ('NNP',)
        pos_tag_dict_univ[lowered] += ('NOUN',)

    target = '{}/data/pos_dicts.pickle'.format(mod_path)
    with open(target, 'wb') as handle:
        pickle.dump((pos_tag_dict, pos_tag_dict_univ), handle, protocol=2)
Example 8
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: tnt.py    License: Creative Commons Attribution Share Alike 4.0 International 6 votes vote down vote up
def demo():
    """Train a TnT tagger on Brown sentences and print its output next to gold.

    The tagger is trained on tagged sentences 200-999 of the Brown corpus and
    then asked to tag the untagged sentences 100-119.  For every token of
    every tagged sentence, the tagger's output is printed next to the
    corresponding gold-standard token, with a blank line after each sentence.
    """
    from nltk.corpus import brown

    # Only tagged sentences 0-999 and untagged sentences 100-119 are ever
    # used, so slice the corpus views before turning them into lists instead
    # of loading the complete corpus into memory twice.
    sents = list(brown.tagged_sents()[:1000])
    test = list(brown.sents()[100:120])

    # create and train the tagger
    tagger = TnT()
    tagger.train(sents[200:1000])

    # tag some data
    tagged_data = tagger.tagdata(test)

    # print results: tagger output on the left, gold token on the right
    for j, predicted_sent in enumerate(tagged_data):
        # test sentence j is corpus sentence j + 100
        gold_sent = sents[j + 100]
        for i, predicted in enumerate(predicted_sent):
            print(predicted, '--', gold_sent[i])
        print()
Example 9
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: hmm.py    License: Creative Commons Attribution Share Alike 4.0 International 6 votes vote down vote up
def load_pos(num_sents):
    """Load Brown 'news' sentences with lowercased words and simplified tags.

    :param num_sents: upper bound of the slice taken from the tagged
        sentences of the 'news' category.
    :return: a 3-tuple of the cleaned sentences (lists of
        ``(lowercased word, simplified tag)`` pairs), a list of the distinct
        simplified tags, and a list of the distinct lowercased words.
    """
    from nltk.corpus import brown

    news_sents = brown.tagged_sents(categories='news')[:num_sents]

    # A simplified tag is a lone '*', a '--', or everything before the first
    # '+', '*' or '-' of the original tag.
    tag_matcher = re.compile(r'[*]|--|[^+*-]+')
    vocabulary = set()
    tag_inventory = set()
    output = []

    for sent in news_sents:
        rewritten = []
        for original_word, original_tag in sent:
            word = original_word.lower()  # normalize
            vocabulary.add(word)
            tag = tag_matcher.match(original_tag).group()
            tag_inventory.add(tag)
            rewritten.append((word, tag))
        # Replace the contents in place; the same list object is returned.
        sent[:] = rewritten
        output.append(sent)

    return output, list(tag_inventory), list(vocabulary)
Example 10
Project: razzy-spinner   Author: rafasashi   File: tnt.py    License: GNU General Public License v3.0 5 votes vote down vote up
def demo2():
    """Compare TnT with capitalization off and on, using the treebank corpus.

    Two taggers (``C=False`` and ``C=True``) are trained on the treebank
    tagged sentences from index 1100 onwards.  Each is then evaluated on ten
    consecutive 100-sentence slices (sentences 0-999).  For every slice the
    accuracy, the known and unknown word ratios, and the accuracy divided by
    the known-word ratio are printed, first for the ``C=False`` tagger and
    then for the ``C=True`` one.
    """
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[11 * 100:])
    s.train(d[11 * 100:])

    runs = (('Capitalization off:', t), ('Capitalization on:', s))

    for fold in range(10):
        lo, hi = fold * 100, (fold + 1) * 100
        for heading, tagger in runs:
            accuracy = tagger.evaluate(d[lo:hi])
            unknown_ratio = (float(tagger.unknown)
                             / float(tagger.known + tagger.unknown))
            known_ratio = (float(tagger.known)
                           / float(tagger.known + tagger.unknown))
            # Reset the counters so the next slice starts from zero.
            tagger.unknown = 0
            tagger.known = 0

            print(heading)
            print('Accuracy:', accuracy)
            print('Percentage known:', known_ratio)
            print('Percentage unknown:', unknown_ratio)
            print('Accuracy over known words:', (accuracy / known_ratio))
Example 11
Project: luscan-devel   Author: blackye   File: tnt.py    License: GNU General Public License v2.0 5 votes vote down vote up
def demo2():
    """Compare TnT with capitalization off and on, using the treebank corpus.

    Two taggers (``C=False`` and ``C=True``) are trained on the treebank
    tagged sentences from index 1100 onwards.  Each is then scored with
    ``tag.accuracy`` on ten consecutive 100-sentence slices (sentences
    0-999).  For every slice the accuracy, the known and unknown word ratios,
    and the accuracy divided by the known-word ratio are printed, first for
    the ``C=False`` tagger and then for the ``C=True`` one.

    Output is produced with single-argument ``print(...)`` calls so the
    function runs, and prints the same text, on both Python 2 and Python 3.
    """
    from nltk import tag
    from nltk.tag import tnt
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = tnt.TnT(N=1000, C=False)
    s = tnt.TnT(N=1000, C=True)
    t.train(d[(11)*100:])
    s.train(d[(11)*100:])

    for i in range(10):
        tacc = tag.accuracy(t, d[i*100:((i+1)*100)])
        # float() keeps the ratios fractional under Python 2 integer division
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        # reset the counters so the next slice starts from zero
        t.unknown = 0
        t.known = 0

        print('Capitalization off:')
        print('Accuracy: %s' % (tacc,))
        print('Percentage known: %s' % (tp_kn,))
        print('Percentage unknown: %s' % (tp_un,))
        print('Accuracy over known words: %s' % (tacc / tp_kn,))

        sacc = tag.accuracy(s, d[i*100:((i+1)*100)])
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print('Capitalization on:')
        print('Accuracy: %s' % (sacc,))
        print('Percentage known: %s' % (sp_kn,))
        print('Percentage unknown: %s' % (sp_un,))
        print('Accuracy over known words: %s' % (sacc / sp_kn,))
Example 12
Project: luscan-devel   Author: blackye   File: crf.py    License: GNU General Public License v2.0 5 votes vote down vote up
def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    """Train a Mallet CRF tagger on Brown data, report accuracy, and clean up.

    Output is produced with single-argument ``print(...)`` calls so the
    function runs, and prints the same text, on both Python 2 and Python 3.

    :param train_size: upper bound of the slice of Brown 'news' sentences
        used for training.
    :param test_size: upper bound of the slice of Brown 'editorial' sentences
        used for testing.  The sample output tags test sentence number 5, so
        at least six test sentences are needed.
    :param java_home: passed to ``nltk.internals.config_java``.
    :param mallet_home: passed to ``nltk.classify.mallet.config_mallet``.
    :return: the trained CRF tagger.  Note that the file named by its
        ``filename`` attribute has already been deleted when this returns.
    """
    from nltk.corpus import brown
    import textwrap

    # Define a very simple feature detector
    def fd(sentence, index):
        word = sentence[index]
        return dict(word=word, suffix=word[-2:], len=len(word))

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus.  We simplify the tagset a little:
    # just the first 2 chars.
    def strip(corpus):
        return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]

    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])

    crf = MalletCRF.train(fd, brown_train,  # '/tmp/crf-model',
                          transduction_type='VITERBI')
    sample_output = crf.tag([w for (w, t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print('\nAccuracy: %.1f%%' % (acc*100))
    print('Sample output:')
    print(textwrap.fill(' '.join('%s/%s' % w for w in sample_output),
                        initial_indent='  ', subsequent_indent='  ')+'\n')

    # Clean up
    print('Clean-up: deleting %s' % (crf.filename,))
    os.remove(crf.filename)

    return crf
Example 13
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: tnt.py    License: Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def demo2():
    """Compare TnT with capitalization off and on, using the treebank corpus.

    Two taggers (``C=False`` and ``C=True``) are trained on the treebank
    tagged sentences from index 1100 onwards.  Each is then evaluated on ten
    consecutive 100-sentence slices (sentences 0-999).  For every slice the
    accuracy, the known and unknown word ratios, and the accuracy divided by
    the known-word ratio are printed, first for the ``C=False`` tagger and
    then for the ``C=True`` one.
    """
    from nltk.corpus import treebank

    d = list(treebank.tagged_sents())

    t = TnT(N=1000, C=False)
    s = TnT(N=1000, C=True)
    t.train(d[11 * 100:])
    s.train(d[11 * 100:])

    runs = (('Capitalization off:', t), ('Capitalization on:', s))

    for fold in range(10):
        lo, hi = fold * 100, (fold + 1) * 100
        for heading, tagger in runs:
            accuracy = tagger.evaluate(d[lo:hi])
            unknown_ratio = tagger.unknown / (tagger.known + tagger.unknown)
            known_ratio = tagger.known / (tagger.known + tagger.unknown)
            # Reset the counters so the next slice starts from zero.
            tagger.unknown = 0
            tagger.known = 0

            print(heading)
            print('Accuracy:', accuracy)
            print('Percentage known:', known_ratio)
            print('Percentage unknown:', unknown_ratio)
            print('Accuracy over known words:', (accuracy / known_ratio))
Example 14
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: glue.py    License: Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def get_pos_tagger(self):
        """Build the part-of-speech tagger as a chain of backoff taggers.

        The chain, from first consulted to last, is: a regexp tagger that
        overrides the quantifier words, a trigram, a bigram and a unigram
        tagger (all trained on the Brown 'news' sentences), and finally a
        regexp tagger based on simple suffix rules.

        :return: the outermost ``RegexpTagger`` of the chain.
        """
        from nltk.corpus import brown

        regexp_tagger = RegexpTagger(
            [
                # The decimal point is escaped so that only real numbers such
                # as '3.14' count as cardinals; an unescaped '.' would also
                # accept strings like '1a2'.
                (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
                (r'(The|the|A|a|An|an)$', 'AT'),  # articles
                (r'.*able$', 'JJ'),  # adjectives
                (r'.*ness$', 'NN'),  # nouns formed from adjectives
                (r'.*ly$', 'RB'),  # adverbs
                (r'.*s$', 'NNS'),  # plural nouns
                (r'.*ing$', 'VBG'),  # gerunds
                (r'.*ed$', 'VBD'),  # past tense verbs
                (r'.*', 'NN'),  # nouns (default)
            ]
        )
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
            backoff=trigram_tagger,
        )

        return main_tagger
Example 15
Project: razzy-spinner   Author: rafasashi   File: tnt.py    License: GNU General Public License v3.0 4 votes vote down vote up
def demo3():
    """Run 10-fold cross-validation of TnT on treebank and Brown samples.

    The first 1000 tagged sentences of each corpus are split into ten equal
    folds.  In every round a fresh tagger per corpus is trained on nine folds
    and evaluated on the remaining one.  Per corpus, three figures are
    accumulated over the rounds: accuracy divided by the known-word ratio,
    overall accuracy, and the known-word ratio itself.  Each sum of ten
    values is multiplied by 10 before printing, i.e. the mean is shown as a
    percentage (assuming accuracies are fractions in [0, 1]).
    """
    from nltk.corpus import treebank, brown

    # Only the first 1000 sentences are used, so slice the corpus views
    # before turning them into lists rather than loading everything.
    d = list(treebank.tagged_sents()[:1000])
    e = list(brown.tagged_sents()[:1000])

    d10 = int(len(d)*0.1)
    e10 = int(len(e)*0.1)

    # t* accumulators belong to the treebank tagger, s* to the Brown tagger
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i*d10):((i+1)*d10)]
        etest = e[(i*e10):((i+1)*e10)]

        dtrain = d[:(i*d10)] + d[((i+1)*d10):]
        etrain = e[:(i*e10)] + e[((i+1)*e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        # Normalise by the Brown tagger's own known-word ratio, not the
        # treebank tagger's.
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

    # t was trained and evaluated on treebank data, s on Brown data, so the
    # figures are labelled accordingly.
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)
Example 16
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: tnt.py    License: Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def demo3():
    """Run 10-fold cross-validation of TnT on treebank and Brown samples.

    The first 1000 tagged sentences of each corpus are split into ten equal
    folds.  In every round a fresh tagger per corpus is trained on nine folds
    and evaluated on the remaining one.  Per corpus, three figures are
    accumulated over the rounds: accuracy divided by the known-word ratio,
    overall accuracy, and the known-word ratio itself.  Each sum of ten
    values is multiplied by 10 before printing, i.e. the mean is shown as a
    percentage (assuming accuracies are fractions in [0, 1]).
    """
    from nltk.corpus import treebank, brown

    # Only the first 1000 sentences are used, so slice the corpus views
    # before turning them into lists rather than loading everything.
    d = list(treebank.tagged_sents()[:1000])
    e = list(brown.tagged_sents()[:1000])

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    # t* accumulators belong to the treebank tagger, s* to the Brown tagger
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = TnT(N=1000, C=False)
        s = TnT(N=1000, C=False)

        dtest = d[(i * d10) : ((i + 1) * d10)]
        etest = e[(i * e10) : ((i + 1) * e10)]

        dtrain = d[: (i * d10)] + d[((i + 1) * d10) :]
        etrain = e[: (i * e10)] + e[((i + 1) * e10) :]

        t.train(dtrain)
        s.train(etrain)

        tacc = t.evaluate(dtest)
        tp_kn = t.known / (t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = s.evaluate(etest)
        sp_kn = s.known / (s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += tacc / tp_kn
        # Normalise by the Brown tagger's own known-word ratio, not the
        # treebank tagger's.
        sknacc += sacc / sp_kn
        tallacc += tacc
        sallacc += sacc

    # t was trained and evaluated on treebank data, s on Brown data, so the
    # figures are labelled accordingly.
    print("treebank: acc over words known:", 10 * tknacc)
    print("        : overall accuracy:", 10 * tallacc)
    print("        : words known:", 10 * tknown)
    print("brown: acc over words known:", 10 * sknacc)
    print("     : overall accuracy:", 10 * sallacc)
    print("     : words known:", 10 * sknown)