Python nltk.FreqDist() Examples
The following are 30 code examples showing how to use nltk.FreqDist(). They are extracted from open source projects; to see each snippet in context, follow the link to the original project or source file given above the example. You may also want to check out all available functions and classes of the nltk module.
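Before diving into the examples, here is a minimal sketch of how nltk.FreqDist is typically used on its own: counting hashable items, looking up counts, and listing the most frequent ones. The token list is invented for illustration, and nltk is assumed to be installed.

import nltk

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
fdist = nltk.FreqDist(tokens)       # count occurrences of each token

print(fdist['the'])                 # 2
print(fdist.most_common(2))         # [('the', 2), ('cat', 1)] (ties broken arbitrarily)
fdist['dog'] += 1                   # FreqDist also supports Counter-style updates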
Example 1
Project: BERT Author: yyht File: utils.py License: Apache License 2.0 | 7 votes |
def bigram_counts(word_list):
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
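A brief usage sketch of the same pattern with an invented word list, assuming nltk and collections.Counter are available as in the example above:

import nltk
from collections import Counter

words = ['to', 'be', 'or', 'not', 'to', 'be']
fdist = nltk.FreqDist(nltk.bigrams(words))   # counts adjacent word pairs
counts = Counter(dict(fdist))                # same contents as bigram_counts(words)
print(counts[('to', 'be')])                  # 2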
Example 2
Project: DiSAN Author: taoshen58 File: analysis.py License: Apache License 2.0 | 7 votes |
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)

    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
Example 3
Project: razzy-spinner Author: rafasashi File: textcat.py License: GNU General Public License v3.0 | 6 votes |
def profile(self, text):
    ''' Create FreqDist of trigrams within text '''
    from nltk import word_tokenize, FreqDist

    clean_text = self.remove_punctuation(text)
    tokens = word_tokenize(clean_text)

    fingerprint = FreqDist()
    for t in tokens:
        token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
        token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

        for cur_trigram in token_trigrams:
            if cur_trigram in fingerprint:
                fingerprint[cur_trigram] += 1
            else:
                fingerprint[cur_trigram] = 1

    return fingerprint
Example 4
Project: lisc Author: lisc-tools File: articles_all.py License: Apache License 2.0 | 6 votes |
def create_freq_dist(in_lst, exclude):
    """Create a frequency distribution.

    Parameters
    ----------
    in_lst : list of str
        Words to create the frequency distribution from.
    exclude : list of str
        Words to exclude from the frequency distribution.

    Returns
    -------
    freqs : nltk.FreqDist
        Frequency distribution of the words.

    Examples
    --------
    Compute the frequency distribution of a collection of words:

    >>> ArticlesAll.create_freq_dist(in_lst=['brain', 'brain', 'head', 'body'], exclude=['body'])
    FreqDist({'brain': 2, 'head': 1})

    If you want to visualize a frequency distribution, you can plot them as a wordcloud:

    >>> from lisc.plts.words import plot_wordcloud
    >>> freq_dist = nltk.FreqDist({'frontal': 26, 'brain': 26, 'lobe': 23, 'patients': 19})
    >>> plot_wordcloud(freq_dist, len(freq_dist))
    """

    freqs = nltk.FreqDist(in_lst)

    for excl in exclude:
        try:
            freqs.pop(excl.lower())
        except KeyError:
            pass

    return freqs
Example 5
Project: atap Author: foxbook File: reader.py License: Apache License 2.0 | 6 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        for sent in para:
            for word, tag in sent:
                counts['words'] += 1
                tokens[word] += 1

    # Return data structure with information
    return {
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
    }
Example 6
Project: SML-Cogs Author: smlbiobot File: tldr.py License: MIT License | 6 votes |
def _calculate_word_scores(self, phrase_list):
    word_freq = nltk.FreqDist()
    word_degree = nltk.FreqDist()
    for phrase in phrase_list:
        # degree = len(filter(lambda x: not isNumeric(x), phrase)) - 1
        # SML above cost error
        degree = len(list(filter(lambda x: not isNumeric(x), phrase))) - 1
        for word in phrase:
            # word_freq.inc(word)
            # SML error above:
            word_freq[word] += 1
            # word_degree.inc(word, degree) # other words
            word_degree[word] = degree
    for word in word_freq.keys():
        word_degree[word] = word_degree[word] + word_freq[word]  # itself
    # word score = deg(w) / freq(w)
    word_scores = {}
    for word in word_freq.keys():
        word_scores[word] = word_degree[word] / word_freq[word]
    return word_scores
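This is essentially the RAKE keyword-scoring heuristic, where a word's score is its co-occurrence degree divided by its frequency. Below is a hedged standalone sketch of that computation with an invented phrase list; a plain str.isdigit() check stands in for the project's isNumeric helper, and the degree is accumulated with +=, which is what the commented-out word_degree.inc(word, degree) call would have done.

import nltk

phrase_list = [['deep', 'learning'], ['learning', 'rate'], ['rate']]

word_freq = nltk.FreqDist()
word_degree = nltk.FreqDist()
for phrase in phrase_list:
    degree = len([w for w in phrase if not w.isdigit()]) - 1
    for word in phrase:
        word_freq[word] += 1
        word_degree[word] += degree           # co-occurrence degree from this phrase

for word in word_freq:
    word_degree[word] += word_freq[word]      # add the word itself

word_scores = {w: word_degree[w] / word_freq[w] for w in word_freq}
print(word_scores)                            # {'deep': 2.0, 'learning': 2.0, 'rate': 1.5}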
Example 7
Project: ConvNetPy Author: benglard File: dialogue.py License: MIT License | 6 votes |
def load_data():
    global N, words, labels

    posts = corpus.xml_posts()[:10000]
    freqs = [ FreqDist(post.text) for post in posts ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))
    labels = list(set([ post.get('class') for post in posts ]))

    data = []
    N = len(words)
    for post, dist in zip(posts, freqs):
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, labels.index(post.get('class'))))

    return data
Example 8
Project: ConvNetPy Author: benglard File: topics.py License: MIT License | 6 votes |
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))

    return data
Example 9
Project: ConvNetPy Author: benglard File: similarity.py License: MIT License | 6 votes |
def load_data():
    global N, words

    freqs = [ FreqDist(corpus.words(fileid)) for fileid in corpus.fileids() ]
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and word not in punctuation))

    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))

    return data
Example 10
Project: ConvNetPy Author: benglard File: similarity.py License: MIT License | 6 votes |
def test():
    gt = GetTweets()
    documents = gt.get_hashtag('ferguson', count=20)
    documents += gt.get_hashtag('police', count=21)
    print 'Query:', documents[-1]

    tokenizer = RegexpTokenizer('\w+')
    vols = []
    for doc in documents:
        samples = []
        for token in tokenizer.tokenize(doc):
            word = token.lower()
            if word not in ENGLISH_STOP_WORDS and word not in punctuation:
                samples.append(word)
        vols.append(volumize(FreqDist(samples)))

    vectors = [ doc_code(v) for v in vols[:-1] ]
    query_vec = doc_code(vols[-1])

    sims = [ cos(v, query_vec) for v in vectors ]
    m = max(sims)
    print m, documents[sims.index(m)]
Example 11
Project: DiSAN Author: taoshen58 File: nlp.py License: Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0

    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))

    return {'context': context_tf, 'question': question_tf}
Example 12
Project: CrisisLex Author: sajao File: lexicon.py License: MIT License | 6 votes |
def __init__(self, documents, terms, classes, class_types, frequency, main_class, min_docs):
    self.terms = terms  # the terms used to build the lexicon
    self.documents = documents
    self.classes = classes
    self.terms_frequency = frequency
    self.terms_frequency_per_class = dict()
    self.main_class = main_class
    # the minimum support for a term (i.e., number of documents in the class of interest in order to be considered)
    self.min_docs = min_docs
    self.class_occ = dict()

    for c in class_types:
        self.terms_frequency_per_class[c] = nltk.FreqDist()
        self.class_occ[c] = classes.count(c)

    for i, doc in enumerate(self.documents):
        cls = self.classes[i]
        for t in doc:
            # FreqDist.inc() is the NLTK 2.x API; in NLTK 3 this would be
            # self.terms_frequency_per_class[cls][t] += 1
            self.terms_frequency_per_class[cls].inc(t)

    # the scoring functions return the list of discriminative terms for the class of interest according to each metric
Example 13
Project: ReSAN Author: taoshen58 File: nlp.py License: Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0

    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))

    return {'context': context_tf, 'question': question_tf}
Example 14
Project: jakaton_feminicidios Author: iorch File: lang_model_2.py License: MIT License | 6 votes |
def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
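The core of the model above is a FreqDist over n-grams plus the totals n and v used for smoothing. The following is a rough sketch (not the project's scoring code) of the kind of add-alpha estimate such counts support, using invented sentences and plain whitespace tokenization:

import nltk

sentences = ["the cat sat", "the cat ran"]
alpha = 0.1

unigram_fd = nltk.FreqDist()
for sent in sentences:
    for gram in nltk.ngrams(sent.split(), 1):
        unigram_fd[gram] += 1

n = unigram_fd.N()     # total tokens observed
v = len(unigram_fd)    # vocabulary size

# add-alpha (Lidstone) unigram probability, the kind of quantity a backoff
# model like the one above would fall back to
p_cat = (unigram_fd[('cat',)] + alpha) / (n + alpha * v)
print(round(p_cat, 3))   # 0.328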
Example 15
Project: BiBloSA Author: taoshen58 File: nlp.py License: Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0

    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))

    return {'context': context_tf, 'question': question_tf}
Example 16
Project: BiBloSA Author: taoshen58 File: analysis.py License: Apache License 2.0 | 6 votes |
def do_analysis(dataset_obj):
    # 1. all sample classification distribution
    # 2. all sentence sample classification distribution
    sample_num = dataset_obj.sample_num
    collect = []
    sent_collect = []
    for trees in dataset_obj.nn_data:
        for sample in trees:
            sentiment_float = sample['root_node']['sentiment_label']
            sentiment_int = cfg.sentiment_float_to_int(sentiment_float)
            if sample['is_sent']:
                sent_collect.append(sentiment_int)
            collect.append(sentiment_int)
    all_pdf = nltk.FreqDist(collect)
    sent_pdf = nltk.FreqDist(sent_collect)

    print('sample_num:', sample_num)
    print('all')
    print(all_pdf.tabulate())
    print('sent')
    print(sent_pdf.tabulate())
Example 17
Project: BiBloSA Author: taoshen58 File: nlp.py License: Apache License 2.0 | 6 votes |
def gene_token_freq_info(context_token, question_token):
    def look_up_dict(t_dict, t):
        try:
            return t_dict[t]
        except KeyError:
            return 0

    context_token_dict = dict(nltk.FreqDist(context_token))
    question_token_dict = dict(nltk.FreqDist(question_token))

    # context tokens in context and question dicts
    context_tf = []
    for token in context_token:
        context_tf.append((look_up_dict(context_token_dict, token),
                           look_up_dict(question_token_dict, token)))

    # question tokens in context and question dicts
    question_tf = []
    for token in context_token:
        question_tf.append((look_up_dict(context_token_dict, token),
                            look_up_dict(question_token_dict, token)))

    return {'context': context_tf, 'question': question_tf}
Example 18
Project: hierarchical-attention-networks Author: tqtg File: data_prepare.py License: MIT License | 5 votes |
def build_vocab(docs, save_path):
    print('Building vocab ...')

    sents = itertools.chain(*[text.split('<sssss>') for text in docs])
    tokenized_sents = [sent.split() for sent in sents]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sents))
    print("%d unique words found" % len(word_freq.items()))

    # Cut-off
    retained_words = [w for (w, f) in word_freq.items() if f > WORD_CUT_OFF]
    print("%d words retained" % len(retained_words))

    # Get the most common words and build index_to_word and word_to_index vectors
    # Word index starts from 2, 1 is reserved for UNK, 0 is reserved for padding
    word_to_index = {'PAD': 0, 'UNK': 1}
    for i, w in enumerate(retained_words):
        word_to_index[w] = i + 2
    index_to_word = {i: w for (w, i) in word_to_index.items()}
    print("Vocabulary size = %d" % len(word_to_index))

    with open('{}-w2i.pkl'.format(save_path), 'wb') as f:
        pickle.dump(word_to_index, f)
    with open('{}-i2w.pkl'.format(save_path), 'wb') as f:
        pickle.dump(index_to_word, f)

    return word_to_index
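A condensed sketch of the same cut-off vocabulary-building idea, without the file I/O; the corpus and cut-off value are invented, and CUT_OFF stands in for the module-level WORD_CUT_OFF used above:

import itertools
import nltk

docs = [["the", "movie", "was", "great"], ["the", "movie", "was", "bad"]]
CUT_OFF = 1   # keep words seen more than once

word_freq = nltk.FreqDist(itertools.chain(*docs))
retained = [w for w, f in word_freq.items() if f > CUT_OFF]

word_to_index = {'PAD': 0, 'UNK': 1}
for i, w in enumerate(retained):
    word_to_index[w] = i + 2

print(word_to_index)   # {'PAD': 0, 'UNK': 1, 'the': 2, 'movie': 3, 'was': 4}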
Example 19
Project: wordvectors Author: Kyubyong File: make_wordvectors.py License: MIT License | 5 votes |
def get_min_count(sents):
    '''
    Args:
      sents: A list of lists. E.g., [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]

    Returns:
      min_count: A uint. Should be set as the parameter value of word2vec `min_count`.
    '''
    global vocab_size
    from itertools import chain

    fdist = nltk.FreqDist(chain.from_iterable(sents))
    min_count = fdist.most_common(vocab_size)[-1][1]  # the count of the top-kth word
    return min_count
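To see why most_common(vocab_size)[-1][1] is the right threshold: the last entry returned by most_common(k) is the k-th most frequent word, so its count is the smallest count that still falls inside the top-k vocabulary. A small sketch with invented data:

import nltk
from itertools import chain

sents = [["a", "a", "a", "b", "b", "c"], ["a", "b"]]
fdist = nltk.FreqDist(chain.from_iterable(sents))

print(fdist.most_common(3))         # [('a', 4), ('b', 3), ('c', 1)]
print(fdist.most_common(3)[-1][1])  # 1 -> a word2vec min_count that keeps the top 3 words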
Example 20
Project: Python Author: Ajinkya-Sonawane File: sentiment.py License: MIT License | 5 votes |
def get_word_features(wordList):
    wordList = nltk.FreqDist(wordList)
    features = wordList.keys()
    return features

#Function to extract words based on document features
Example 21
Project: lisc Author: lisc-tools File: test_plts_words.py License: Apache License 2.0 | 5 votes |
def test_plot_wordcloud():
    freq_dist = FreqDist(['lots', 'of', 'words', 'words'])
    plot_wordcloud(freq_dist, 5)
Example 22
Project: lisc Author: lisc-tools File: test_plts_wordcloud.py License: Apache License 2.0 | 5 votes |
def test_conv_freqs():
    freq_dist = FreqDist(['lots', 'of', 'words', 'words'])
    out = conv_freqs(freq_dist, 2)
    assert isinstance(out, dict)
Example 23
Project: BERT Author: yyht File: utils.py License: Apache License 2.0 | 5 votes |
def trigram_counts(word_list):
    tgs = nltk.trigrams(word_list)
    fdist = nltk.FreqDist(tgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
Example 24
Project: truecaser Author: nreimers File: TrainFunctions.py License: Apache License 2.0 | 5 votes |
def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is for example all uppercase, it is rejected"""
    caseDist = nltk.FreqDist()

    for token in sentence:
        caseDist[getCasing(token)] += 1

    if caseDist.most_common(1)[0][0] != 'allLower':
        return False

    return True
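A hedged sketch of the same check with a simplified stand-in for the project's getCasing helper (the real helper distinguishes more casing classes), using an invented all-uppercase sentence:

import nltk

def simple_casing(token):
    # simplified stand-in for the project's getCasing helper
    if token.islower():
        return 'allLower'
    if token.isupper():
        return 'allUpper'
    return 'other'

sentence = ['THIS', 'IS', 'ALL', 'CAPS']
caseDist = nltk.FreqDist(simple_casing(t) for t in sentence)
print(caseDist.most_common(1)[0][0] != 'allLower')   # True -> the sentence would be rejected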
Example 25
Project: python-examples Author: jamesacampbell File: sentiment_analysis_nltk-example.py License: MIT License | 5 votes |
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
Example 26
Project: python-examples Author: jamesacampbell File: multi-categorization-tweets-example.py License: MIT License | 5 votes |
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
Example 27
Project: Natural-Language-Processing-with-Python-Cookbook Author: PacktPublishing File: Similarity.py License: MIT License | 5 votes |
def TF(self, sentence):
    words = nltk.word_tokenize(sentence.lower())
    freq = nltk.FreqDist(words)
    dictionary = {}
    for key in freq.keys():
        norm = freq[key] / float(len(words))
        dictionary[key] = norm
    return dictionary
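The same normalized term frequency can also be obtained with FreqDist.freq(), which divides a sample's count by the total number of observed tokens (FreqDist.N()). A small sketch with an invented sentence and a plain whitespace split standing in for word_tokenize:

import nltk

words = "the cat sat on the mat".lower().split()
freq = nltk.FreqDist(words)
tf = {w: freq.freq(w) for w in freq}   # freq.freq(w) == freq[w] / freq.N()
print(tf['the'])                       # 2 / 6 ≈ 0.333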
Example 28
Project: augmented_seq2seq Author: suriyadeepan File: data.py License: GNU General Public License v3.0 | 5 votes |
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    vocab = [ item for item in vocab if item[1] > 1 ]
    # index2word
    index2word = ['_'] + ['UNK'] + list(POS_TAGS.keys()) + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w, i) for i, w in enumerate(index2word)])
    return index2word, word2index, freq_dist
Example 29
Project: augmented_seq2seq Author: suriyadeepan File: data.py License: GNU General Public License v3.0 | 5 votes |
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w, i) for i, w in enumerate(index2word)])
    return index2word, word2index, freq_dist
Example 30
Project: conv-emotion Author: declare-lab File: vocab.py License: MIT License | 5 votes |
def __init__(self, tokenizer=None, max_size=None, min_freq=1):
    """Basic Vocabulary object"""
    self.vocab_size = 0
    self.freqdist = FreqDist()
    self.tokenizer = tokenizer