Python nltk.trigrams() Examples

The following are 7 code examples for showing how to use nltk.trigrams(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk, or try the search function.

Example 1
Project: jakaton_feminicidios   Author: iorch   File: lang_model_2.py    License: MIT License 6 votes vote down vote up
def test():
    """Print a per-sentence language-model score for every test tweet.

    Loads a pickled language model from ``lm.bin`` and the tweets returned by
    ``deviger.load_dataset('test.txt')``. Each tweet is split into sentences;
    for each sentence the values of ``lm1.log_prob(trigram)`` over its word
    trigrams are summed and the sum divided by the sentence's token count is
    printed.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load model files that come from a trusted source.
    with open("lm.bin", 'rb') as model_file:
        lm1 = pickle.load(model_file)

    tweets_list = deviger.load_dataset('test.txt')

    for line in tweets_list:
        sentences = nltk.sent_tokenize(line.strip())
        print("Tweet sentences:", sentences)
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            if not words:
                # Nothing to score, and the normalisation below would
                # otherwise divide by zero.
                continue
            sum_log_prob = 0
            for trigram in nltk.trigrams(words):
                sum_log_prob += lm1.log_prob(trigram)
            print("(", sum_log_prob / len(words), ")")
Example 2
Project: qb   Author: Pinafore   File: dataset.py    License: MIT License 5 votes vote down vote up
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    """Build a tokenizer function that turns text into word n-gram features.

    Args:
        unigrams: include the individual tokens.
        bigrams: include adjacent token pairs, joined as ``w0++w1``.
        trigrams: include adjacent token triples, joined as ``w0++w1++w2``.
        zero_length_token: placeholder returned (as a one-element list) when
            the text yields no tokens or no n-grams.
        strip_qb_patterns: when true, replace matches of ``regex_pattern``
            (case-insensitively) with a space, collapse whitespace runs,
            strip, and capitalize the text before tokenizing.

    Returns:
        A function mapping a text string to a non-empty list of strings.
    """
    def tokenizer(text):
        if strip_qb_patterns:
            without_patterns = re.sub(
                regex_pattern, ' ', text, flags=re.IGNORECASE)
            # Raw string: '\s' in a plain literal is an invalid escape
            # sequence and triggers a warning on modern Python.
            text = re.sub(r'\s+', ' ', without_patterns).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if not tokens:
            return [zero_length_token]

        ngrams = []
        if unigrams:
            ngrams.extend(tokens)
        if bigrams:
            ngrams.extend(f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens))
        if trigrams:
            ngrams.extend(
                f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens))

        # All n-gram kinds may be disabled, or the text may be too short for
        # the enabled ones; never hand back an empty feature list.
        if not ngrams:
            ngrams.append(zero_length_token)
        return ngrams

    return tokenizer
Example 3
Project: DeepLearn   Author: GauravBh1010tt   File: lex_sem_ft.py    License: MIT License 5 votes vote down vote up
def train_trigram(lst):
    """Estimate trigram probabilities from whitespace-separated sentences.

    Args:
        lst: iterable of sentence strings; each one is split on whitespace.

    Returns:
        A nested mapping where ``model[(w1, w2)][w3]`` is the relative
        frequency of ``w3`` following the pair ``(w1, w2)``. Trigrams are
        generated with padding requested on both sides of every sentence.
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        for w1, w2, w3 in trigrams(sent.split(), pad_right=True, pad_left=True):
            # Count the third word given the two preceding ones. Indexing by
            # w2 here would leave every context with a single entry.
            model[(w1, w2)][w3] += 1

    # Turn the raw counts of each context into relative frequencies.
    for followers in model.values():
        total_count = float(sum(followers.values()))
        for w3 in followers:
            followers[w3] /= total_count

    # Hand the trained model back; otherwise the work above is discarded.
    return model

# Total Sum Of Trigram Probability Of A Sentence [Returns Float]:
Example 4
Project: BERT   Author: yyht   File: utils.py    License: Apache License 2.0 5 votes vote down vote up
def trigram_counts(word_list):
    """Return a ``Counter`` mapping each trigram of *word_list* to its count."""
    trigram_freqs = nltk.FreqDist(nltk.trigrams(word_list))
    # Copy the frequency distribution's entries into a plain Counter.
    return Counter({gram: count for gram, count in trigram_freqs.items()})
Example 5
Project: DL-text   Author: GauravBh1010tt   File: lex_sem_ft.py    License: MIT License 5 votes vote down vote up
def train_trigram(lst):
    """Estimate trigram probabilities from whitespace-separated sentences.

    Args:
        lst: iterable of sentence strings; each one is split on whitespace.

    Returns:
        A nested mapping where ``model[(w1, w2)][w3]`` is the relative
        frequency of ``w3`` following the pair ``(w1, w2)``. Trigrams are
        generated with padding requested on both sides of every sentence.
    """
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        for w1, w2, w3 in trigrams(sent.split(), pad_right=True, pad_left=True):
            # Count the third word given the two preceding ones. Indexing by
            # w2 here would leave every context with a single entry.
            model[(w1, w2)][w3] += 1

    # Turn the raw counts of each context into relative frequencies.
    for followers in model.values():
        total_count = float(sum(followers.values()))
        for w3 in followers:
            followers[w3] /= total_count

    # Hand the trained model back; otherwise the work above is discarded.
    return model

# Total Sum Of Trigram Probability Of A Sentence [Returns Float]:
Example 6
Project: codenn   Author: sriniiyer   File: SVM.py    License: MIT License 5 votes vote down vote up
def tokenize(text):
    """Split *text* into Porter-stemmed word and punctuation tokens.

    The text is first reduced to ASCII (characters that cannot be encoded
    are replaced), stripped of surrounding whitespace and lower-cased.

    Returns:
        A list with one stemmed token per word or punctuation mark.
    """
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except (UnicodeError, AttributeError):
        # Input that cannot be decoded as UTF-8, or that has no ``decode``
        # method, is encoded to ASCII directly.
        text = text.encode('ascii', 'replace').strip().lower()
    # Punctuation marks become separate tokens, but apostrophes and hyphens
    # stay inside words such as "don't".
    return [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
Example 7
Project: words2map   Author: overlap-ai   File: words2map.py    License: MIT License 5 votes vote down vote up
def get_collocations(words):
    """Return frequent n-grams (up to trigrams) ranked by relative frequency.

    Every n-gram of length 1 to 3 that occurs more than ``minimum_frequency``
    (3) times is kept, with its count divided by ``len(words)`` as its
    likelihood. Redundant entries are then pruned: a multi-word n-gram whose
    likelihood is below half the geometric mean of its component words'
    likelihoods is dropped; otherwise its component words are dropped.

    Args:
        words: sequence of word strings.

    Returns:
        A list of ``(ngram, likelihood)`` pairs sorted by descending
        likelihood, where the words of an n-gram are joined with ``"_"``.
    """
    minimum_frequency = 3
    # Explicit float so the ratio is not truncated by Python 2 integer division.
    word_count = float(len(words))
    ngrams = {
        "_".join(ngram): frequency / word_count
        for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items()
        if frequency > minimum_frequency
    }
    collocations = dict(ngrams)
    # items() works on both Python 2 and 3; only ``collocations`` is mutated
    # below, so iterating ``ngrams`` directly is safe.
    for ngram, likelihood in ngrams.items():
        # NOTE: relies on the words themselves not containing "_".
        grams = ngram.split("_")
        if len(grams) == 1:
            continue
        gram_likelihoods = [ngrams[gram] for gram in grams]
        # Geometric mean of the component likelihoods (float exponent for
        # the same integer-division reason as above).
        geometric_mean = np.prod(gram_likelihoods) ** (1.0 / len(grams))
        if likelihood < 0.5 * geometric_mean:
            collocations.pop(ngram, None)
        else:
            for gram in grams:
                collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)