Python nltk.RegexpTagger() Examples

The following are code examples for showing how to use nltk.RegexpTagger(). They are from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.

Example 1
Project: chattR   Author: patrickstocklin   File: np_extractors.py    GNU General Public License v2.0 6 votes vote down vote up
def train(self):
        """Build the backoff tagger chain and store it on ``self.tagger``.

        A ``BigramTagger`` trained on the ``news`` category of the Brown
        corpus backs off to a ``UnigramTagger`` trained on the same sentences,
        which in turn backs off to a ``RegexpTagger`` built from the
        hand-written rules below.  ``self._trained`` is set to ``True`` once
        the chain has been built.

        Returns:
            None
        """
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            # The decimal point is escaped so that only real numbers such as
            # "3.14" are tagged CD; an unescaped "." also accepted "3x14".
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            # NOTE(review): this pattern only accepts tokens made up entirely
            # of apostrophes (or the empty token) -- confirm 'MD' is intended.
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            # Catch-all: anything not matched above is tagged as a noun.
            (r'.*', 'NN'),
            ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
Example 2
Project: honours_project   Author: JFriel   File: np_extractors.py    GNU General Public License v3.0 6 votes vote down vote up
def train(self):
        """Build the backoff tagger chain and store it on ``self.tagger``.

        A ``BigramTagger`` trained on the ``news`` category of the Brown
        corpus backs off to a ``UnigramTagger`` trained on the same sentences,
        which in turn backs off to a ``RegexpTagger`` built from the
        hand-written rules below.  ``self._trained`` is set to ``True`` once
        the chain has been built.

        Returns:
            None
        """
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            # The decimal point is escaped so that only real numbers such as
            # "3.14" are tagged CD; an unescaped "." also accepted "3x14".
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            # NOTE(review): this pattern only accepts tokens made up entirely
            # of apostrophes (or the empty token) -- confirm 'MD' is intended.
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            # Catch-all: anything not matched above is tagged as a noun.
            (r'.*', 'NN'),
            ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
Example 3
Project: honours_project   Author: JFriel   File: np_extractors.py    GNU General Public License v3.0 6 votes vote down vote up
def train(self):
        """Build the backoff tagger chain and store it on ``self.tagger``.

        A ``BigramTagger`` trained on the ``news`` category of the Brown
        corpus backs off to a ``UnigramTagger`` trained on the same sentences,
        which in turn backs off to a ``RegexpTagger`` built from the
        hand-written rules below.  ``self._trained`` is set to ``True`` once
        the chain has been built.

        Returns:
            None
        """
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            # The decimal point is escaped so that only real numbers such as
            # "3.14" are tagged CD; an unescaped "." also accepted "3x14".
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            # NOTE(review): this pattern only accepts tokens made up entirely
            # of apostrophes (or the empty token) -- confirm 'MD' is intended.
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            # Catch-all: anything not matched above is tagged as a noun.
            (r'.*', 'NN'),
            ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
Example 4
Project: ispassive   Author: cowlicks   File: ispassive.py    GNU General Public License v3.0 6 votes vote down vote up
def create_tagger():
    """Train a tagger from the Brown Corpus and return it.

    This should not be called very often; only in the event that the tagger
    pickle wasn't found.

    The returned ``TrigramTagger`` backs off to a ``BigramTagger``, then a
    ``UnigramTagger`` (all trained on the full set of Brown tagged
    sentences), and finally to a ``RegexpTagger`` built from the rules below.

    Returns:
        The trained ``nltk.TrigramTagger`` at the head of the backoff chain.
    """
    train_sents = brown.tagged_sents()

    # These regexes were lifted from the NLTK book tagger chapter.  The
    # decimal point in the cardinal-number rule is escaped so that tokens such
    # as "3x14" are no longer mistaken for numbers.
    suffix_rules = [
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN'),                     # nouns (default)
    ]
    t0 = nltk.RegexpTagger(suffix_rules)
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    return t3
Example 5
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: OwnTagger.py    MIT License 5 votes vote down vote up
def learnRETagger(simpleSentence):
    """Tag a sentence with a hand-written ``RegexpTagger`` and print the result.

    The sentence is split with ``nltk.word_tokenize`` and each token is
    tagged using the custom patterns below.  Tokens that match none of the
    specific rules fall through to the final catch-all and receive ``None``.

    Args:
        simpleSentence: The sentence to tokenize and tag.

    Returns:
        None. The list of ``(token, tag)`` pairs is printed to stdout.
    """
    customPatterns = [
        (r'.*ing$', 'ADJECTIVE'),               # running
        (r'.*ly$', 'ADVERB'),                   # willingly
        (r'.*ion$', 'NOUN'),                    # intimation
        (r'(.*ate|.*en|is)$', 'VERB'),          # terminate, darken, lighten
        (r'^an$', 'INDEFINITE-ARTICLE'),        # an
        (r'^(with|on|at)$', 'PREPOSITION'),     # on
        # The fractional part is optional so that plain integers are tagged
        # as numbers too, not only decimals.
        (r'^\-?[0-9]+(\.[0-9]+)?$', 'NUMBER'),  # 42, -1.0, 12345.123
        (r'.*$', None),                         # everything else: untagged
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
Example 6
Project: ICE   Author: shahryarabaki   File: pos_tagger.py    Apache License 2.0 5 votes vote down vote up
def _train_tagger(self):
        """Build the backoff tagger chain and store it on ``self.final_tagger``.

        The chain is ``TrigramTagger`` -> ``BigramTagger`` ->
        ``UnigramTagger`` (all trained on the treebank tagged sentences) ->
        ``RegexpTagger`` (rules below) -> ``DefaultTagger('NN')``.

        Returns:
            None
        """
        training_sents = treebank.tagged_sents()
        patterns = [  # for regexp tagger
            # Inside a character class "|" is a literal pipe, not alternation,
            # so the classes below list only the intended characters.
            (r'^[.?!]$', '.'),
            (r'^,$', ','),
            (r'^\'$', '\'\''),
            (r'^\"$', '\"'),
            (r'^\($', '('),
            (r'^\)$', ')'),
            (r'^[=/]$', 'SYM'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'),
            (r'.*\'s$', 'POS'),
            (r'.*s$', 'NNS'),
            # NOTE(review): 'AT' looks like the Brown-corpus article tag while
            # the statistical taggers here are trained on treebank -- confirm
            # whether 'DT' was intended.
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'.*ly$', 'RB'),
            (r'^[0-9][0-9]*$', 'CD'),
            (r'^[0-9]([0-9]*[-.,/][0-9]*)*$', 'CD'),
            (r'^([0-9]*\.[0-9]*)*$', 'CD'),
            (r'^[^a-zA-Z]*$', ':'),
            (r'[A-Z].*', 'NNP'),
            # Catch-all: anything not matched above is tagged as a noun.
            (r'.*', 'NN'),
        ]

        default_tagger = nltk.DefaultTagger('NN')
        regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
        unigram_tagger = nltk.UnigramTagger(training_sents, backoff=regexp_tagger)
        bigram_tagger = nltk.BigramTagger(training_sents, backoff=unigram_tagger)
        trigram_tagger = nltk.TrigramTagger(training_sents, backoff=bigram_tagger)

        self.final_tagger = trigram_tagger
Example 7
Project: ICE   Author: shahryarabaki   File: n_grams_test.py    Apache License 2.0 5 votes vote down vote up
def test_POS_tag_tokenize_words_simple_test(self):
        """Check ``POS_tag_tokenized_phrases`` on one short tokenized phrase.

        A trigram -> bigram -> unigram -> regexp -> default backoff chain is
        trained on the Brown tagged sentences and passed to
        ``POS_tag_tokenized_phrases`` together with a single phrase; the
        result must equal the expected list of ``(token, tag)`` pairs.
        """
        training_sents = brown.tagged_sents()

        # Each suffix rule appears once; a repeated ``.*s$`` entry further
        # down the list could never be reached because the first one wins.
        patterns = [  # for regexp tagger
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*es$', 'VBZ'),
            (r'.*ould$', 'MD'),
            (r'.*\'s$', 'POS'),
            (r'.*s$', 'NNS'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'.*ly$', 'RB'),
            (r'.*', 'NN'),
        ]

        default_tagger = nltk.DefaultTagger('NN')
        regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
        unigram_tagger = nltk.UnigramTagger(training_sents, backoff=regexp_tagger)
        bigram_tagger = nltk.BigramTagger(training_sents, backoff=unigram_tagger)
        final_tagger = nltk.TrigramTagger(training_sents, backoff=bigram_tagger)

        phrases = [['who', 'are', 'your', 'friend', "'s", 'here', '?']]
        expected = [[
            ('who', 'WPS'),
            ('are', 'BER'),
            ('your', 'PP$'),
            ('friend', 'NN'),
            ("'s", 'POS'),
            ('here', 'RB'),
            ('?', '.'),
        ]]

        self.assertEqual(
            expected,
            POS_tag_tokenized_phrases(phrases, final_tagger))
Example 8
Project: BrillPlusPlus   Author: elaheh-sadredini   File: ap-exp.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def gen_ap_regex():
    """Train a Brill tagger and print one regex per learned rule.

    The initial tagger is a trigram -> bigram -> unigram -> regexp backoff
    chain trained on ``my_corpus``.  A ``BrillTaggerTrainer`` then learns up
    to ``n_rules`` transformation rules from the ``fntbl37`` templates, and
    every learned rule is converted with ``rule_to_regex`` and printed.

    All output goes to stdout.  Each ``print`` call takes a single
    pre-formatted string so the text is the same under Python 2 and 3.

    Returns:
        None
    """
    print("============================================================")
    print("Generate Regex from learned Brill tagging rules.")
    # Parameters:
    training = my_corpus.tagged_sents()
    templates = nltk.tag.brill.fntbl37()
    n_rules = 30

    # Taggers:
    print("Initializing ...")
    regex_tagger = nltk.RegexpTagger(
        # The decimal point is escaped so that tokens such as "3x14" are not
        # mistaken for cardinal numbers.
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    u_gram_tag = nltk.UnigramTagger(training, backoff=regex_tagger)
    b_gram_tag = nltk.BigramTagger(training, backoff=u_gram_tag)
    t_gram_tag = nltk.TrigramTagger(training, backoff=b_gram_tag)

    print("Training brill tagger ...")
    tt = BrillTaggerTrainer(t_gram_tag, templates, trace=3)
    brill_tagger = tt.train(training, max_rules=n_rules)
    print("Training finished.")

    print("Template size: %s" % len(templates))
    range_l, range_r = get_template_range(templates)
    print("Template range: %s %s" % (range_l, range_r))
    print("Total rules: %s" % len(brill_tagger.rules()))
    print("Generating Regex for the AP ...")

    for rule in brill_tagger.rules():
        regex, report_tag = rule_to_regex(rule, range_l, range_r)
        print("%s : %s" % (report_tag, regex))

    print("Done.")


# Cross validation