Python nltk.FreqDist() Examples

The following are 30 code examples showing how to use nltk.FreqDist(). They are extracted from open source projects; the project, author, file, and license appear above each example.

You may also want to check out the other available functions and classes of the nltk module.
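
Before the project examples, here is a minimal, self-contained sketch of the core FreqDist API: construction from an iterable, dict-style lookup, most_common(), N(), and tabulate(). The sample tokens are invented for illustration.

import nltk

# A FreqDist counts any iterable of hashable items.
tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'cat']
fdist = nltk.FreqDist(tokens)

print(fdist['the'])          # 3 -- dict-style lookup of a sample's count
print(fdist.most_common(2))  # [('the', 3), ('cat', 2)]
print(fdist.N())             # 8 -- total number of samples counted
fdist.tabulate()             # prints a plain-text frequency table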

Example 1
Project: BERT   Author: yyht   File: utils.py   License: Apache License 2.0
def bigram_counts(word_list):
	bgs = nltk.bigrams(word_list)
	fdist = nltk.FreqDist(bgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
Example 2
Project: DiSAN   Author: taoshen58   File: analysis.py   License: Apache License 2.0
def do_analysis(dataset_obj):
        # 1. all sample classification distribution
        # 2. all sentence sample classification distribution
        sample_num = dataset_obj.sample_num
        collect = []
        sent_collect = []
        for trees in dataset_obj.nn_data:
            for sample in trees:
                sentiment_float = sample['root_node']['sentiment_label']
                sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
                if sample['is_sent']:
                    sent_collect.append(sentiment_int)
                collect.append(sentiment_int)
        all_pdf = nltk.FreqDist(collect)
        sent_pdf = nltk.FreqDist(sent_collect)
        print('sample_num:', sample_num)
        print('all')
        all_pdf.tabulate()  # tabulate() prints the table itself; wrapping it in print() also prints None
        print('sent')
        sent_pdf.tabulate() 
Example 3
Project: lisc   Author: lisc-tools   File: articles_all.py   License: Apache License 2.0
def create_freq_dist(in_lst, exclude):
        """Create a frequency distribution.

        Parameters
        ----------
        in_lst : list of str
            Words to create the frequency distribution from.
        exclude : list of str
            Words to exclude from the frequency distribution.

        Returns
        -------
        freqs : nltk.FreqDist
            Frequency distribution of the words.

        Examples
        --------
        Compute the frequency distribution of a collection of words:

        >>> ArticlesAll.create_freq_dist(in_lst=['brain', 'brain', 'head', 'body'], exclude=['body'])
        FreqDist({'brain': 2, 'head': 1})

        If you want to visualize a frequency distribution, you can plot them as a wordcloud:

        >>> from lisc.plts.words import plot_wordcloud
        >>> freq_dist = nltk.FreqDist({'frontal': 26, 'brain': 26, 'lobe': 23, 'patients': 19})
        >>> plot_wordcloud(freq_dist, len(freq_dist))
        """

        freqs = nltk.FreqDist(in_lst)

        for excl in exclude:
            try:
                freqs.pop(excl.lower())
            except KeyError:
                pass

        return freqs 
Example 4
Project: atap   Author: foxbook   File: reader.py   License: Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            for sent in para:
                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Return data structure with information
        return {
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
        } 
Example 5
Project: SML-Cogs   Author: smlbiobot   File: tldr.py   License: MIT License
def _calculate_word_scores(self, phrase_list):
        word_freq = nltk.FreqDist()
        word_degree = nltk.FreqDist()
        for phrase in phrase_list:
            # degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
            # SML above cost error
            degree = len(list(filter(lambda x: not isNumeric(x), phrase))) - 1
            for word in phrase:
                # word_freq.inc(word)
                # SML error above:
                word_freq[word] += 1
                # word_degree.inc(word, degree) # other words
                word_degree[word] += degree
        for word in word_freq.keys():
            word_degree[word] = word_degree[word] + word_freq[word] # itself
        # word score = deg(w) / freq(w)
        word_scores = {}
        for word in word_freq.keys():
            word_scores[word] = word_degree[word] / word_freq[word]
        return word_scores 
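
This is the word-scoring step of the RAKE keyword-extraction heuristic: a word's degree (its co-occurrence with other words inside candidate phrases, plus its own frequency) is divided by its frequency. A minimal sketch of the same computation, with invented phrases and without the class plumbing or the isNumeric filter:

import nltk

phrases = [['deep', 'learning'], ['deep', 'neural', 'network'], ['learning']]
word_freq = nltk.FreqDist()
word_degree = nltk.FreqDist()
for phrase in phrases:
    for word in phrase:
        word_freq[word] += 1
        word_degree[word] += len(phrase) - 1  # co-occurring words in this phrase

# deg(w) = co-occurrences + freq(w); score(w) = deg(w) / freq(w)
scores = {w: (word_degree[w] + word_freq[w]) / word_freq[w] for w in word_freq}
print(scores)  # 'neural' and 'network' score 3.0; 'deep' 2.5; 'learning' 1.5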
Example 6
Project: ConvNetPy   Author: benglard   File: dialogue.py   License: MIT License
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ] 
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))

    return data 
Example 7
Project: ConvNetPy   Author: benglard   File: topics.py   License: MIT License
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))

    return data 
Example 8
Project: ConvNetPy   Author: benglard   File: similarity.py   License: MIT License
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word 
                    for dist in freqs 
                    for word in dist.keys()
                    if word not in ENGLISH_STOP_WORDS and
                    word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))

    return data 
Example 9
Project: ConvNetPy   Author: benglard   File: similarity.py   License: MIT License
def test(): 
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print('Query:', documents[-1])

    tokenizer = RegexpTokenizer(r'\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print(m, documents[sims.index(m)]) 
Example 10
Project: DiSAN   Author: taoshen58   File: nlp.py   License: Apache License 2.0
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in question_token:
        question_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    return {'context':context_tf, 'question':question_tf} 
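
A quick usage sketch (the tokens are invented; assumes import nltk and the function above): each token is paired with its count in the context and its count in the question, yielding simple term-frequency match features.

context_token = ['the', 'cat', 'sat', 'on', 'the', 'mat']
question_token = ['where', 'is', 'the', 'cat']
info = gene_token_freq_info(context_token, question_token)
print(info['context'][0])  # 'the' occurs twice in context, once in question: (2, 1)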
Example 11
Project: CrisisLex   Author: sajao   File: lexicon.py   License: MIT License
def __init__(self, documents, terms, classes, class_types, frequency, main_class, min_docs):
        self.terms = terms  # the terms used to build the lexicon
        self.documents = documents
        self.classes = classes
        self.terms_frequency = frequency
        self.terms_frequency_per_class = dict()
        self.main_class = main_class
        # the minimum support for a term, i.e., the number of documents in the class of interest that must contain it for it to be considered
        self.min_docs = min_docs
        self.class_occ = dict()
        for c in class_types:
            self.terms_frequency_per_class[c]=nltk.FreqDist()
            self.class_occ[c] = classes.count(c)
        for i, doc in enumerate(self.documents):
            cls = self.classes[i]
            for t in doc:
                self.terms_frequency_per_class[cls][t] += 1  # FreqDist.inc() was removed in NLTK 3

    # the scoring functions return the list of discriminative terms for the class of interest according to each metric 
Example 12
Project: ReSAN   Author: taoshen58   File: nlp.py   License: Apache License 2.0
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in question_token:
        question_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    return {'context':context_tf, 'question':question_tf} 
Example 13
Project: jakaton_feminicidios   Author: iorch   File: lang_model_2.py   License: MIT License
def __init__(self, order, alpha, sentences):
        self.order = order
        self.alpha = alpha
        if order > 1:
            self.backoff = LangModel(order - 1, alpha, sentences)
            self.lexicon = None
        else:
            self.backoff = None
            self.n = 0
        self.ngramFD = nltk.FreqDist()
        lexicon = set()
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            wordNGrams = nltk.ngrams(words, order)
            for wordNGram in wordNGrams:
                self.ngramFD[wordNGram] += 1
                # self.ngramFD.inc(wordNGram)
                if order == 1:
                    lexicon.add(wordNGram)
                    self.n += 1
        self.v = len(lexicon) 
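
The constructor counts order-n n-grams into a FreqDist and recursively builds lower-order models for backoff smoothing. The counting core, isolated as a hedged sketch with toy sentences (nltk.word_tokenize requires the 'punkt' tokenizer data):

import nltk

sentences = ['the cat sat', 'the cat ran']
ngramFD = nltk.FreqDist()
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    for ngram in nltk.ngrams(words, 2):  # order 2: bigrams
        ngramFD[ngram] += 1

print(ngramFD[('the', 'cat')])  # 2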
Example 14
Project: BiBloSA   Author: taoshen58   File: nlp.py   License: Apache License 2.0
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in question_token:
        question_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    return {'context':context_tf, 'question':question_tf} 
Example 15
Project: BiBloSA   Author: taoshen58   File: analysis.py   License: Apache License 2.0
def do_analysis(dataset_obj):
        # 1. all sample classification distribution
        # 2. all sentence sample classification distribution
        sample_num = dataset_obj.sample_num
        collect = []
        sent_collect = []
        for trees in dataset_obj.nn_data:
            for sample in trees:
                sentiment_float = sample['root_node']['sentiment_label']
                sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
                if sample['is_sent']:
                    sent_collect.append(sentiment_int)
                collect.append(sentiment_int)
        all_pdf = nltk.FreqDist(collect)
        sent_pdf = nltk.FreqDist(sent_collect)
        print('sample_num:', sample_num)
        print('all')
        all_pdf.tabulate()  # tabulate() prints the table itself; wrapping it in print() also prints None
        print('sent')
        sent_pdf.tabulate() 
Example 16
Project: BiBloSA   Author: taoshen58   File: nlp.py   License: Apache License 2.0
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0
    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in question_token:
        question_tf.append((look_up_dict(context_token_dict, token), look_up_dict(question_token_dict, token)))

    return {'context':context_tf, 'question':question_tf} 
Example 17
Project: razzy-spinner   Author: rafasashi   File: textcat.py   License: GNU General Public License v3.0
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 18
Project: hierarchical-attention-networks   Author: tqtg   File: data_prepare.py   License: MIT License
def build_vocab(docs, save_path):
  print('Building vocab ...')

  sents = itertools.chain(*[text.split('<sssss>') for text in docs])
  tokenized_sents = [sent.split() for sent in sents]

  # Count the word frequencies
  word_freq = nltk.FreqDist(itertools.chain(*tokenized_sents))
  print("%d unique words found" % len(word_freq.items()))

  # Cut-off
  retained_words = [w for (w, f) in word_freq.items() if f > WORD_CUT_OFF]
  print("%d words retained" % len(retained_words))

  # Get the most common words and build index_to_word and word_to_index vectors
  # Word index starts from 2, 1 is reserved for UNK, 0 is reserved for padding
  word_to_index = {'PAD': 0, 'UNK': 1}
  for i, w in enumerate(retained_words):
    word_to_index[w] = i + 2
  index_to_word = {i: w for (w, i) in word_to_index.items()}

  print("Vocabulary size = %d" % len(word_to_index))

  with open('{}-w2i.pkl'.format(save_path), 'wb') as f:
    pickle.dump(word_to_index, f)

  with open('{}-i2w.pkl'.format(save_path), 'wb') as f:
    pickle.dump(index_to_word, f)

  return word_to_index 
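
A hedged miniature of the same indexing scheme (WORD_CUT_OFF and the '<sssss>' sentence separator are specific to this project; the data here is invented): indices 0 and 1 stay reserved for padding and unknown words, and everything at or below the cut-off is dropped.

import itertools
import nltk

tokenized_sents = [['good', 'movie'], ['good', 'plot'], ['good']]
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sents))

retained_words = [w for (w, f) in word_freq.items() if f > 1]  # cut-off of 1
word_to_index = {'PAD': 0, 'UNK': 1}
for i, w in enumerate(retained_words):
    word_to_index[w] = i + 2

print(word_to_index)  # {'PAD': 0, 'UNK': 1, 'good': 2}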
Example 19
Project: wordvectors   Author: Kyubyong   File: make_wordvectors.py   License: MIT License
def get_min_count(sents):
    '''
    Args:
      sents: A list of lists. E.g., [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]
     
    Returns:
      min_count: A uint. Should be set as the parameter value of word2vec `min_count`.   
    '''
    global vocab_size
    from itertools import chain
     
    fdist = nltk.FreqDist(chain.from_iterable(sents))
    min_count = fdist.most_common(vocab_size)[-1][1] # the count of the vocab_size-th most common word
    
    return min_count 
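
The trick here: most_common(vocab_size)[-1] is the vocab_size-th most common word, so using its count as word2vec's min_count keeps a vocabulary of roughly that size. A hedged standalone check with toy data:

import nltk
from itertools import chain

sents = [['I', 'am', 'a', 'boy', '.'], ['You', 'are', 'a', 'girl', '.']]
fdist = nltk.FreqDist(chain.from_iterable(sents))

vocab_size = 3
min_count = fdist.most_common(vocab_size)[-1][1]
print(min_count)  # count of the 3rd most common token; 1 for this toy data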
Example 20
Project: Python   Author: Ajinkya-Sonawane   File: sentiment.py   License: MIT License
def get_word_features(wordList):
    wordList = nltk.FreqDist(wordList)
    features = wordList.keys()
    return features

#Function to extract words based on document features 
Example 21
Project: lisc   Author: lisc-tools   File: test_plts_words.py   License: Apache License 2.0
def test_plot_wordcloud():

    freq_dist = FreqDist(['lots', 'of', 'words', 'words'])
    plot_wordcloud(freq_dist, 5) 
Example 22
Project: lisc   Author: lisc-tools   File: test_plts_wordcloud.py   License: Apache License 2.0
def test_conv_freqs():

    freq_dist = FreqDist(['lots', 'of', 'words', 'words'])
    out = conv_freqs(freq_dist, 2)

    assert isinstance(out, dict) 
Example 23
Project: BERT   Author: yyht   File: utils.py   License: Apache License 2.0
def trigram_counts(word_list):
	tgs = nltk.trigrams(word_list)
	fdist = nltk.FreqDist(tgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
Example 24
Project: truecaser   Author: nreimers   File: TrainFunctions.py   License: Apache License 2.0
def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is for example all uppercase, it is recjected"""
    caseDist = nltk.FreqDist()
    
    for token in sentence:
        caseDist[getCasing(token)] += 1
    
    if caseDist.most_common(1)[0][0] != 'allLower':        
        return False
    
    return True 
Example 25
Project: python-examples   Author: jamesacampbell   File: sentiment_analysis_nltk-example.py   License: MIT License
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features 
Example 26
Project: python-examples   Author: jamesacampbell   File: multi-categorization-tweets-example.py   License: MIT License
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features 
Example 27
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: Similarity.py   License: MIT License
def TF(self, sentence):
        words = nltk.word_tokenize(sentence.lower())
        freq = nltk.FreqDist(words)
        dictionary = {}
        for key in freq.keys():
            norm = freq[key]/float(len(words))
            dictionary[key] = norm
        return dictionary 
Example 28
Project: augmented_seq2seq   Author: suriyadeepan   File: data.py   License: GNU General Public License v3.0
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    vocab = [ item for item in vocab if item[1] > 1 ]
    # index2word
    index2word = ['_'] + ['UNK'] + list(POS_TAGS.keys()) + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist 
Example 29
Project: augmented_seq2seq   Author: suriyadeepan   File: data.py   License: GNU General Public License v3.0
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist 
Example 30
Project: conv-emotion   Author: declare-lab   File: vocab.py   License: MIT License
def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Basic Vocabulary object"""

        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer