Python nltk.trigrams() Examples

The following are 7 code examples of nltk.trigrams(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
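nltk.trigrams() simply yields each consecutive 3-tuple from a sequence of tokens. A minimal sketch of the call itself (the sample sentence is only an illustration):

import nltk

tokens = "the quick brown fox jumps".split()
for w1, w2, w3 in nltk.trigrams(tokens):
    print(w1, w2, w3)
# the quick brown
# quick brown fox
# brown fox jumps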
Example #1
Source File: lex_sem_ft.py    From DeepLearn with MIT License
from collections import defaultdict
from nltk import trigrams

def train_trigram(lst):
    # Maps each (w1, w2) context to a distribution over the next word w3.
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1
    # Normalize counts into conditional probabilities P(w3 | w1, w2).
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    return model

# Total Sum Of Trigram Probability Of A Sentence [Returns Float]:
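A quick usage sketch for train_trigram above; the sample sentences are illustrative, and the original file truncates right after the comment above:

model = train_trigram(["the cat sat", "the cat ran"])
# P(w3="sat" | w1="the", w2="cat") after normalization:
print(model[("the", "cat")]["sat"])  # 0.5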
Example #2
Source File: utils.py    From BERT with Apache License 2.0
import nltk
from collections import Counter

def trigram_counts(word_list):
    # Count every trigram in word_list; FreqDist is a Counter subclass,
    # so its counts copy straight into a plain Counter.
    tgs = nltk.trigrams(word_list)
    fdist = nltk.FreqDist(tgs)
    return Counter(fdist)
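For example, with an illustrative word list:

counts = trigram_counts("to be or not to be".split())
print(counts[("to", "be", "or")])  # 1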
Example #3
Source File: lex_sem_ft.py    From DL-text with MIT License
from collections import defaultdict
from nltk import trigrams

def train_trigram(lst):
    # Maps each (w1, w2) context to a distribution over the next word w3.
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1
    # Normalize counts into conditional probabilities P(w3 | w1, w2).
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    return model

# Total Sum Of Trigram Probability Of A Sentence [Returns Float]:
Example #4
Source File: SVM.py    From codenn with MIT License
import re
import nltk
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenize(text):
    # text = NB.remove_punctuation(text)
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Replace non-ASCII characters, then normalize case and whitespace.
    text = text.encode('ascii', 'replace').decode('ascii').strip().lower()
    # Split punctuation into separate tokens, but don't split single
    # quotes inside words like "don't".
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
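A usage sketch; the exact output depends on the stemmer version, so the shown result is approximate:

print(tokenize("Don't panic, Mr. Adams!"))
# e.g. ["don't", 'panic', ',', 'mr', '.', 'adam', '!']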
Example #5
Source File: words2map.py    From words2map with MIT License
import numpy as np
from operator import itemgetter
from nltk import FreqDist
from nltk.util import everygrams

def get_collocations(words):
    # Returns n-grams up to trigrams that appear more than 3 times,
    # pruning grams that are redundant with the grams they contain.
    minimum_frequency = 3
    ngrams = {"_".join(ngram): frequency / len(words)
              for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items()
              if frequency > minimum_frequency}
    collocations = dict(ngrams)
    for ngram, likelihood in ngrams.items():
        grams = ngram.split("_")
        if len(grams) != 1:
            # Keep a multi-word gram only if its likelihood is at least half
            # the geometric mean of its component grams' likelihoods.
            gram_likelihoods = [ngrams[gram] for gram in grams]
            if likelihood < 0.5 * np.prod(gram_likelihoods) ** (1 / len(grams)):
                collocations.pop(ngram, None)
            else:
                for gram in grams:
                    collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)
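A usage sketch with an illustrative word list; only grams appearing more than three times survive, and the surviving bigram absorbs its component unigrams:

words = ("new york is big and new york is busy and "
         "new york is loud and new york sleeps").split()
print(get_collocations(words))  # [('new_york', 0.2105...)]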
Example #6
Source File: lang_model_2.py    From jakaton_feminicidios with MIT License
import pickle
import nltk
import deviger  # project module that provides load_dataset()

def test():
    lm1 = pickle.load(open("lm.bin", 'rb'))

    tweets_list = deviger.load_dataset('test.txt')

    for line in tweets_list:
        sentences = nltk.sent_tokenize(line.strip())
        print("Tweet sentences:", sentences)
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            word_trigrams = nltk.trigrams(words)
            sum_log_prob = 0
            for trigram in word_trigrams:
                logprob = lm1.log_prob(trigram)
                sum_log_prob += logprob
            # Average log-probability per word for the sentence.
            print("(", sum_log_prob / len(words), ")")
Example #7
Source File: dataset.py    From qb with MIT License
import re
import nltk

def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            # regex_pattern is defined elsewhere in the original module.
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer
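A usage sketch; passing strip_qb_patterns=False sidesteps regex_pattern, which this snippet does not define:

tokenize = create_qb_tokenizer(unigrams=True, trigrams=True, strip_qb_patterns=False)
print(tokenize("the quick brown fox"))
# ['the', 'quick', 'brown', 'fox', 'the++quick++brown', 'quick++brown++fox']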