Python nltk.ConditionalFreqDist() Examples

The following code examples show how to use nltk.ConditionalFreqDist(). They are taken from open-source Python projects. You can vote up the examples you like or vote down the ones you don't like.

Example 1
Project: yenlp   Author: stathius   File: sentiwordnet.py    GNU General Public License v3.0 6 votes vote down vote up
def word_sense_cdf(word, context, wn_pos):
    '''Pick the sense of *word* whose definition words occur most often in *context*.

    Word sense disambiguation in terms of matching words frequency
    between the context and each sense's definition. Adapted from
    www.slideshare.net/faigg/tutotial-of-sentiment-analysis

    :param word: surface form to disambiguate
    :param context: container of context words (membership-tested with ``in``)
    :param wn_pos: WordNet part-of-speech tag restricting the candidate synsets
    :return: the best-matching Synset, or None when the word has no synsets
    '''
    senses = wordnet.synsets(word, wn_pos)
    if not senses:
        return None
    # For each sense, count how often each of its definition words
    # appears in the context.
    cfd = nltk.ConditionalFreqDist((sense, def_word)
                                   for sense in senses
                                   for def_word in sense.definition().split()
                                   if def_word in context)
    best_sense = senses[0]
    for sense in senses:
        try:
            if cfd[sense].max() > cfd[best_sense].max():
                best_sense = sense
        except ValueError:
            # FreqDist.max() raises ValueError on an empty distribution
            # (none of this sense's definition words occurred in the
            # context); the former bare except also hid real bugs.
            pass
    return best_sense
Example 2
Project: neuralgae   Author: spikelynch   File: generate_text.py    GNU General Public License v2.0 5 votes vote down vote up
def makeCfd(m):
    """Build a bigram-style ConditionalFreqDist from the definition file for *m*.

    Reads ``DEFINITIONS/<m>.txt``, whitespace-tokenizes it, pairs the tokens
    via makePairs(), and counts the pairs. Exits the process if the file is
    missing.

    :param m: model name; selects the ``<m>.txt`` file under DEFINITIONS
    :return: nltk.ConditionalFreqDist over the pairs produced by makePairs()
    """
    deffile = os.path.join(DEFINITIONS, '%s.txt' % m)
    if not os.path.isfile(deffile):
        # print() call syntax works on Python 2 and 3; the original
        # Python 2 print statement is a SyntaxError under Python 3.
        print("Can't find %s" % deffile)
        sys.exit(-1)
    with open(deffile, 'r') as f:
        corpus = f.read().split()
        pairs = makePairs(corpus)
        cfd = nltk.ConditionalFreqDist(pairs)
    return cfd
Example 3
Project: nltk-on-gae   Author: sivu22   File: nltk_utils.py    Apache License 2.0 5 votes vote down vote up
def findTags(tagPrefix, taggedText, tagResults, useBlacklist=True):
    """Group words by tag for every tag starting with *tagPrefix*.

    :param tagPrefix: tag prefix to match (e.g. ``'NN'``)
    :param taggedText: iterable of ``(word, tag)`` pairs
    :param tagResults: max words to keep per tag; values < 1 mean "all"
    :param useBlacklist: when True, drop words whose lowercase form is in
        the module-level ``blacklist``
    :return: dict mapping each matching tag to a list of its words
    """
    if useBlacklist:
        cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in taggedText
                                       if tag.startswith(tagPrefix)
                                       and word.lower() not in blacklist)
    else:
        cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in taggedText
                                       if tag.startswith(tagPrefix))

    if tagResults < 1:
        # Non-positive limit: return every observed word for each tag.
        return dict((tag, list(cfd[tag])) for tag in cfd.conditions())

    # dict views are not sliceable on Python 3 — the original
    # cfd[tag].keys()[:tagResults] raised TypeError; materialize first.
    return dict((tag, list(cfd[tag])[:tagResults]) for tag in cfd.conditions())
Example 4
Project: keyword-extraction   Author: htetmyet   File: plot_termf.py    MIT License 5 votes vote down vote up
def findtags(tag_prefix, tagged_text):
    """Map each tag beginning with *tag_prefix* to its 5 most common words."""
    matching = ((tag, word)
                for (word, tag) in tagged_text
                if tag.startswith(tag_prefix))
    cfd = nltk.ConditionalFreqDist(matching)
    return {tag: cfd[tag].most_common(5) for tag in cfd.conditions()}
Example 5
Project: arabic-historical-dictionary-backend   Author: mohammedi-haroune   File: fill_documents.py    MIT License 5 votes vote down vote up
def testDoc(request):
    """Debug view: report documents whose name maps to multiple (fileid, category) pairs.

    Returns a JSON array ``[categories, qur, res]`` where ``res`` maps each
    duplicated document name to its (fileid, category) pairs, ``qur`` is the
    subset whose first pair is categorized 'quran', and ``categories`` are the
    distinct categories of the first pair of each duplicate.
    NOTE(review): "first pair" depends on FreqDist iteration order — confirm
    this ordering is the one intended.
    """
    import nltk
    # path = Corpus.objects.filter(name='الجامع الاساسي')[0].path
    documents = Document.objects.all()
    # Condition = document name; observed values = (fileid, category) pairs.
    documents = [(document.name,(document.fileid,document.category)) for document in documents]
    cfd = nltk.ConditionalFreqDist(documents)
    # Names seen with more than one distinct (fileid, category) → duplicates.
    freqs = [cond for cond in cfd.keys() if len(cfd[cond]) > 1]
    res = dict((key,list(cfd[key])) for key in freqs)
    # Category of the first observed pair for each duplicated name.
    categories = list(set(list(cfd[key])[0][1] for key in freqs))
    # Duplicates whose first pair belongs to the 'quran' category.
    qur = dict((key,res[key]) for key in res if res[key][0][1] == 'quran')
    # duplicated = set([(document.name,document.fileid)
    #               for d in documents
    #                 for document in documents
    #                     if document != d and document.name == d.name])
    return JsonResponse([categories,qur,res], safe=False)
Example 6
Project: wntf   Author: tonybaloney   File: algorithm.py    ISC License 5 votes vote down vote up
def findtags(self, tag_prefix, tagged_text):
        '''
        Collect the words for every tag (word type) matching a prefix.

        :param tag_prefix: The tag prefix
        :type  tag_prefix: ``str``

        :param tagged_text: The text to search
        :type  tagged_text: ``list`` of ``dict``

        :return: dict mapping each matching tag to its 50 most common words
        '''
        pairs = ((tag, word)
                 for (word, tag) in tagged_text
                 if tag.startswith(tag_prefix))
        cfd = nltk.ConditionalFreqDist(pairs)
        return {tag: cfd[tag].most_common(50) for tag in cfd.conditions()}
Example 7
Project: trolling_detection   Author: rafaharo   File: feature_extraction.py    Apache License 2.0 5 votes vote down vote up
def language_model(collection):
    """Build a maximum-likelihood bigram language model over *collection*."""
    from nltk import bigrams
    from nltk import ConditionalFreqDist, ConditionalProbDist, MLEProbDist
    tokens = tokenize_collection(collection)
    # Bigram counts conditioned on the first word, then MLE-normalized.
    counts = ConditionalFreqDist(bigrams(tokens))
    return ConditionalProbDist(counts, MLEProbDist)
Example 8
Project: python-scripts   Author: Ventrosky   File: ngram.py    GNU General Public License v3.0 5 votes vote down vote up
def makeTrigramFreq(tokens):
    """Count third words conditioned on word pairs; persist to freqTrigrams.json."""
    trigrams = filterNgrams(list(nltk.trigrams(tokens)))
    firsts, seconds, thirds = list(zip(*trigrams))
    # Condition on the leading word pair of each trigram.
    contexts = list(zip(firsts, seconds))
    cfd = nltk.ConditionalFreqDist(list(zip(contexts, thirds)))
    with open('freqTrigrams.json', 'w') as outfile:
        # Tuple conditions are not valid JSON keys; remapKeys handles that.
        json.dump(remapKeys(cfd), outfile)
    return cfd
Example 9
Project: python-scripts   Author: Ventrosky   File: ngram.py    GNU General Public License v3.0 5 votes vote down vote up
def makeBigramFreq(tokens):
    """Count second words conditioned on the first; persist to freqBigrams.json."""
    filtered = filterNgrams(list(nltk.bigrams(tokens)))
    cfd = nltk.ConditionalFreqDist(filtered)
    with open('freqBigrams.json', 'w') as outfile:
        json.dump(cfd, outfile)
    return cfd
Example 10
Project: facebook-message-analysis   Author: szheng17   File: language_model.py    MIT License 4 votes vote down vote up
def generate_from_trigrams(lm, start_words, n_words):
        """
        Generate text with a trigram backoff model.

        Falls back trigram -> bigram -> unigram when a context is unseen.
        start_words: list of two strings.
        n_words: integer >= 0, number of words to generate, not including start_words
        lm: lowercase_tokens must be nonempty
        """
        # Create probability maps
        trigram_counter = Counter(ngrams(lm.lowercase_tokens, 3))
        trigram_prob = trigram_prob_map(trigram_counter)
        bigram_cfd = nltk.ConditionalFreqDist(ngrams(lm.lowercase_tokens, 2))
        bigram_prob = bigram_prob_map(bigram_cfd)
        unigram_counter = Counter(lm.lowercase_tokens)
        unigram_prob = unigram_prob_map(unigram_counter)

        def _sample(prob_map):
            # numpy.random.choice requires an indexable 1-D sequence;
            # a dict view (Python 3 .keys()) is not one, so materialize it.
            candidates = list(prob_map.keys())
            return choice(candidates, p=[prob_map[w] for w in candidates])

        # Build sentence
        w1, w2 = start_words[0], start_words[1]
        words = [w1, w2]
        for _ in range(n_words):
            if (w1, w2) in trigram_prob:
                # Trigram context observed: sample the next word from it.
                next_word = _sample(trigram_prob[(w1, w2)])
            elif w2 in bigram_prob:
                # Back off to the bigram model conditioned on w2.
                next_word = _sample(bigram_prob[w2])
            else:
                # Last resort: draw from the unigram distribution.
                next_word = _sample(unigram_prob)

            # Slide the two-word context window forward.
            w1, w2 = w2, next_word
            words.append(w2)
        return ' '.join(words)