Python nltk.trigrams() Examples
The following are 7 code examples showing how to use nltk.trigrams(), extracted from open source projects. Each example is headed by its project, author, source file, and license.
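As a quick orientation before the project examples, here is a minimal sketch of nltk.trigrams() itself: it takes any sequence of tokens and yields consecutive 3-tuples.

import nltk

tokens = ['the', 'quick', 'brown', 'fox', 'jumps']
# nltk.trigrams() yields every consecutive 3-tuple from the sequence
for trigram in nltk.trigrams(tokens):
    print(trigram)
# ('the', 'quick', 'brown')
# ('quick', 'brown', 'fox')
# ('brown', 'fox', 'jumps')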
Example 1
Project: jakaton_feminicidios | Author: iorch | File: lang_model_2.py | License: MIT License | 6 votes
import pickle
import nltk
# deviger is a project-local helper module (not shown here)

def test():
    # load a pickled trigram language model and score each tweet sentence
    lm1 = pickle.load(open("lm.bin", 'rb'))
    tweets_list = deviger.load_dataset('test.txt')
    for line in tweets_list:
        sentences = nltk.sent_tokenize(line.strip())
        print("Tweet sentences:", sentences)
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            word_trigrams = nltk.trigrams(words)
            sum_log_prob = 0
            for trigram in word_trigrams:
                logprob = lm1.log_prob(trigram)
                sum_log_prob += logprob
            # report the average log-probability per word
            print("(", sum_log_prob / len(words), ")")
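The score printed for each sentence is the summed trigram log-probability normalized by sentence length, so scores are comparable across sentences of different lengths. A minimal sketch of that idea with a stand-in scoring function (lm.bin and deviger are project files not shown here, so the lambda below is a hypothetical placeholder):

import math
import nltk

def score_sentence(sent, log_prob):
    # average per-word trigram log-probability, as in test() above
    words = nltk.word_tokenize(sent)
    return sum(log_prob(t) for t in nltk.trigrams(words)) / len(words)

# stand-in log_prob for illustration; the real one comes from the pickled model
print(score_sentence("the cat sat on the mat", lambda t: math.log(0.1)))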
Example 2
Project: DeepLearn | Author: GauravBh1010tt | File: lex_sem_ft.py | License: MIT License | 5 votes
from collections import defaultdict
from nltk import trigrams

def train_trigram(lst):
    # map each bigram context (w1, w2) to counts of the following word w3
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1
    # normalize counts into conditional probabilities P(w3 | w1, w2)
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    return model
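A usage sketch for train_trigram as fixed above, on a made-up two-sentence corpus: the returned model maps a bigram context to a next-word distribution, with None as the padding token.

corpus = ["the cat sat", "the cat ran"]   # toy data, illustration only
model = train_trigram(corpus)
print(model[('the', 'cat')]['sat'])       # 0.5
print(model[(None, None)]['the'])         # 1.0, both sentences start with "the"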
Example 3
Project: BERT | Author: yyht | File: utils.py | License: Apache License 2.0 | 5 votes
from collections import Counter
import nltk

def trigram_counts(word_list):
    # count every consecutive trigram in the token list
    tgs = nltk.trigrams(word_list)
    fdist = nltk.FreqDist(tgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
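A quick usage sketch: trigram_counts simply materializes the FreqDist into a plain Counter keyed by trigram tuples, so the usual Counter API applies.

words = "to be or not to be".split()
counts = trigram_counts(words)
print(counts[('to', 'be', 'or')])   # 1
print(counts.most_common(2))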
Example 4
Project: DL-text | Author: GauravBh1010tt | File: lex_sem_ft.py | License: MIT License | 5 votes
from collections import defaultdict
from nltk import trigrams

def train_trigram(lst):
    # map each bigram context (w1, w2) to counts of the following word w3
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1
    # normalize counts into conditional probabilities P(w3 | w1, w2)
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    return model
Example 5
Project: codenn | Author: sriniiyer | File: SVM.py | License: MIT License | 5 votes
import re
import nltk
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()  # module-level stemmer assumed by this snippet

def tokenize(text):
    # text = NB.remove_punctuation(text)
    # Python 2-style bytes/str normalization to ASCII
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except:
        text = text.encode('ascii', 'replace').strip().lower()
    # split punctuation into separate tokens, but keep apostrophes
    # inside words like "don't"
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
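The regex is the heart of this tokenizer. A small Python 3 sketch of just the split step, without the stemming, showing how punctuation becomes a separate token while in-word apostrophes survive:

import re

text = "don't panic, it's fine!"
print(re.findall(r"[\w'-]+|[^\s\w]", text))
# ["don't", 'panic', ',', "it's", 'fine', '!']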
Example 6
Project: words2map | Author: overlap-ai | File: words2map.py | License: MIT License | 5 votes
from operator import itemgetter
import numpy as np
from nltk import FreqDist, everygrams

def get_collocations(words):
    # return n-grams up to trigrams that appear more than `minimum_frequency`
    # times, pruning grams that are redundant
    minimum_frequency = 3
    ngrams = {"_".join(ngram): frequency / len(words)
              for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items()
              if frequency > minimum_frequency}
    collocations = dict(ngrams)
    for ngram, likelihood in ngrams.items():  # .iteritems() in the Python 2 original
        grams = ngram.split("_")
        if len(grams) != 1:
            # a multi-word gram must beat half the geometric mean of its parts'
            # scores; if it does, its parts are dropped as redundant,
            # otherwise the gram itself is dropped
            gram_likelihoods = [ngrams[gram] for gram in grams]
            if likelihood < 0.5 * np.prod(gram_likelihoods) ** (1.0 / len(grams)):
                collocations.pop(ngram, None)
            else:
                for gram in grams:
                    collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)
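A hedged usage sketch, assuming an already tokenized word list (corpus.txt is a hypothetical file): the returned scores are relative frequencies, highest first.

words = open('corpus.txt').read().split()   # hypothetical corpus file
for ngram, score in get_collocations(words)[:10]:
    print(ngram, score)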
Example 7
Project: qb | Author: Pinafore | File: dataset.py | License: MIT License | 4 votes
import re
import nltk

def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            # regex_pattern is defined elsewhere in the qb project
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}'
                               for w0, w1, w2 in nltk.trigrams(tokens)])
            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams
    return tokenizer
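A usage sketch: with strip_qb_patterns=False the factory can be exercised standalone, since regex_pattern (defined elsewhere in the qb project) is then never touched.

tokenize = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenize('name this author of Hamlet'))
# ['name', 'this', 'author', 'of', 'Hamlet',
#  'name++this', 'this++author', 'author++of', 'of++Hamlet']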