Python nltk.ConditionalFreqDist() Examples

The following are 6 code examples showing how to use nltk.ConditionalFreqDist(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk, or try the search function.

Example 1
Project: nltk_teach   Author: nltk   File: categories.py    License: Apache License 2.0 6 votes vote down vote up
def build_word_associations():
    """Count which Brown-corpus words closely follow which other words.

    Each tagged sentence of the Brown corpus is lower-cased and stripped of
    English stop words. Every remaining token is then paired with the (up to)
    four tokens that follow it in the filtered sentence, and a pair is counted
    only when both tags start with the same character.

    Returns:
        An ``nltk.ConditionalFreqDist`` keyed by token; each condition counts
        the following tokens that were paired with it.
    """
    cfd = nltk.ConditionalFreqDist()

    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    for tagged_sentence in nltk.corpus.brown.tagged_sents():
        # Stop words are removed here, so the loops below never see one and
        # do not need to test for them again.
        sentence = []
        for raw_token, tag in tagged_sentence:
            token = raw_token.lower()
            if token not in stopwords:
                sentence.append((token, tag))

        for index, (token, tag) in enumerate(sentence):
            for window_token, window_tag in sentence[index + 1:index + 5]:
                # Compare tag prefixes by value; `is` only tests object
                # identity and matches equal strings by accident.
                if window_tag[0] == tag[0]:
                    # FreqDist.inc() no longer exists in NLTK 3; item
                    # assignment works across versions.
                    cfd[token][window_token] += 1
    return cfd
Example 2
Project: yenlp   Author: stathius   File: sentiwordnet.py    License: GNU General Public License v3.0 6 votes vote down vote up
def word_sense_cdf(word, context, wn_pos):
    '''Word sense disambiguation in terms of matching words frequency
    between the context and each sense's definition. Adapted from
    www.slideshare.net/faigg/tutotial-of-sentiment-analysis

    word    -- the word to look up with ``wordnet.synsets``
    context -- container tested with ``in`` for each definition word
    wn_pos  -- part-of-speech argument passed through to ``wordnet.synsets``

    Returns the sense whose definition has the most words occurring in
    ``context`` (the earliest such sense on ties, so the first sense wins
    when nothing matches), or None when the word has no senses.
    '''
    senses = wordnet.synsets(word, wn_pos)
    if not senses:
        return None

    cfd = nltk.ConditionalFreqDist(
        (sense, def_word)
        for sense in senses
        for def_word in sense.definition().split()
        if def_word in context
    )

    # Rank senses by how many definition words matched the context.
    # FreqDist.max() returns the most frequent *sample* (a word), not a
    # count, and raises on an empty distribution, so it cannot be used to
    # compare senses; N() is the total count and is 0 for an unseen sense.
    best_sense = senses[0]
    best_overlap = cfd[best_sense].N()
    for sense in senses[1:]:
        overlap = cfd[sense].N()
        if overlap > best_overlap:
            best_sense, best_overlap = sense, overlap
    return best_sense
Example 3
def test_increment(self):
        # a cfd must stay mutable through plain item assignment
        sentence = "cow cat mouse cat tiger"
        cfd = ConditionalFreqDist()

        # file every token under its own length
        for token in tokenize.word_tokenize(sentence):
            cfd[len(token)][token] += 1

        self.assertEqual(cfd.conditions(), [3, 5])

        # a condition that has never been seen can be incremented directly
        cfd[2]['hi'] += 1
        # the new condition shows up next to the existing ones
        self.assertEqual(set(cfd.conditions()), {2, 3, 5})
        # and its sample went from 0 (unseen) to 1
        self.assertEqual(cfd[2]['hi'], 1)
Example 4
Project: deep_disfluency   Author: clp-research   File: hmm.py    License: MIT License 5 votes vote down vote up
def train_markov_model_from_constraint_matrix(self, csv_path, mm_path,
                                                  delim="\t"):
        """Train the tag transition model from a table of counts.

        The file at ``csv_path`` is read as a ``delim``-separated table. The
        first row (minus its first cell) names the range states; in every
        other row the first cell is the domain state and each remaining cell
        is an integer count for the (domain, range state) pair of its column.
        Empty cells are skipped.

        Side effects: sets ``self.cfd_tags``, ``self.cpd_tags`` and
        ``self.tag_set``, prints the tabulated distributions, pickles the
        frequency distribution to ``mm_path`` and calls
        ``self.viterbi_init()``.

        Args:
            csv_path: path of the count table to read.
            mm_path: path the pickled ``ConditionalFreqDist`` is written to.
            delim: column separator used in the table.
        """
        # Close the input file deterministically instead of leaking it.
        with open(csv_path) as csv_file:
            table = [line.split(delim) for line in csv_file]
        tags = []
        # Lines keep their newline after split(), so the last header cell
        # must be stripped just like the data cells below.
        range_states = [state.strip("\n") for state in table.pop(0)[1:]]
        for row in table:
            domain = row[0]
            for i, cell in enumerate(row[1:]):
                s = cell.replace(" ", "").strip("\n")
                if s == '':
                    continue
                # One (domain, range) pair per unit of the count; a zero or
                # negative count yields an empty list and adds nothing.
                tags.extend([(domain, range_states[i])] * int(s))
        self.cfd_tags = nltk.ConditionalFreqDist(tags)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("cfd trained, counts:")
        self.cfd_tags.tabulate()
        print("test:")
        print(tabulate_cfd(self.cfd_tags))
        # save this new cfd for later use
        with open(mm_path, "wb") as mm_file:
            pickle.dump(self.cfd_tags, mm_file)
        # initialize the cpd
        self.cpd_tags = nltk.ConditionalProbDist(self.cfd_tags,
                                                 nltk.MLEProbDist)
        print(tabulate_cfd(self.cpd_tags))
        # Every state seen as a condition or as an outcome; built as a set
        # because dict views cannot be concatenated with `+` on Python 3.
        self.tag_set = set(self.cfd_tags.keys())
        for outcomes in self.cfd_tags.values():
            self.tag_set.update(outcomes.keys())
        self.viterbi_init()  # initialize viterbi
Example 5
def test_tabulate(self):
        # tabulate() on an empty cfd must not register the requested condition
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        try:
            empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added
        except Exception:
            # Whether tabulate() raises is irrelevant here; only the side
            # effect on the conditions is checked. Unlike a bare `except:`,
            # this still lets KeyboardInterrupt/SystemExit stop the run.
            pass
        self.assertEqual(empty.conditions(), [])
Example 6
def test_plot(self):
        # plot() on an empty cfd must not register the requested condition
        empty = ConditionalFreqDist()
        self.assertEqual(empty.conditions(), [])
        try:
            empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added
        except Exception:
            # Whether plot() raises is irrelevant here; only the side effect
            # on the conditions is checked. Unlike a bare `except:`, this
            # still lets KeyboardInterrupt/SystemExit stop the run.
            pass
        self.assertEqual(empty.conditions(), [])