import math
import os.path

import pickle

import glob
import nltk

import deviger

class LangModel:
    """Backoff n-gram language model with Laplace-smoothed unigrams.

    An order-k model recursively builds an order-(k-1) model over the same
    sentences and uses it for backoff: an unseen n-gram falls back to its
    shorter suffix n-gram, discounted by ``alpha`` (stupid-backoff style).
    The order-1 base model applies add-one (Laplace) smoothing so unknown
    words still get non-zero probability.
    """

    def __init__(self, order, alpha, sentences):
        """Count n-grams of the given order over the tokenized sentences.

        :param order: n-gram order (1 = unigram model, the recursion base)
        :param alpha: backoff discount applied once per backoff step
        :param sentences: iterable of sentence strings
        """
        self.order = order
        self.alpha = alpha
        if order > 1:
            # Recursively build the lower-order model used for backoff.
            self.backoff = LangModel(order - 1, alpha, sentences)
            self.lexicon = None
        else:
            self.backoff = None
            self.n = 0  # total token count (unigram denominator)
        self.ngramFD = nltk.FreqDist()
        lexicon = set()
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            for wordNGram in nltk.ngrams(words, order):
                self.ngramFD[wordNGram] += 1
                if order == 1:
                    self.n += 1
                    # BUG FIX: the lexicon was never populated, so self.v
                    # was always 0 and Laplace smoothing used n alone.
                    lexicon.add(wordNGram[0])
        self.v = len(lexicon)  # vocabulary size for Laplace smoothing

    def log_prob(self, ngram):
        """Return the natural-log probability of ``ngram``."""
        return math.log(self.prob(ngram))

    def prob(self, ngram):
        """Return the (smoothed, backed-off) probability of ``ngram``."""
        if self.backoff is not None:
            freq = self.ngramFD[ngram]
            if freq == 0:
                # Unseen n-gram: back off to the shorter suffix, discounted.
                return self.alpha * self.backoff.prob(ngram[1:])
            # BUG FIX: the conditional P(w_n | w_1..w_{n-1}) divides by the
            # count of the *context* ngram[:-1]; the original divided by the
            # suffix ngram[1:]. count(ngram) <= count(context), so no /0.
            return freq / self.backoff.ngramFD[ngram[:-1]]
        # Unigram base case: add-one smoothing handles unknown words.
        return (self.ngramFD[ngram] + 1) / (self.n + self.v)

def train():
    """Build a trigram language model from the tweets and pickle it.

    Tokenizes every tweet in the module-level ``tweets_list`` into
    sentences, fits a ``LangModel(order=3, alpha=0.4)`` on them, and
    serializes the model to ``lm.bin``.

    NOTE(review): ``tweets_list`` is not defined anywhere in this file —
    it is assumed to be supplied at module level; confirm against the
    surrounding project.
    """
    sentences = []
    for i, tw in enumerate(tweets_list):
        # Progress report every 500 tweets.
        if i > 0 and i % 500 == 0:
            print("%d/%d tweet loaded, #-sentences: %d"
                  % (i, len(tweets_list), len(sentences)))
        sentences.extend(nltk.sent_tokenize(tw))
    lm = LangModel(3, 0.4, sentences)
    # BUG FIX: the original leaked the file handle returned by open();
    # a context manager guarantees it is closed even on error.
    with open("lm.bin", "wb") as fh:
        pickle.dump(lm, fh)

def test():
    """Score every tweet sentence with the pickled trigram model.

    Loads the model that :func:`train` wrote to ``lm.bin``, then prints a
    length-normalized log-probability for each sentence of each tweet in
    the module-level ``tweets_list``.

    BUG FIX: the original referenced an undefined global ``lm1`` (never
    assigned anywhere in this file); the model is now loaded from the
    ``lm.bin`` pickle produced by train().
    """
    # NOTE: pickle.load must only ever read the trusted file we wrote.
    with open("lm.bin", "rb") as fh:
        lm1 = pickle.load(fh)
    for line in tweets_list:
        sentences = nltk.sent_tokenize(line.strip())
        print("Tweet sentences:", sentences)
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            sum_log_prob = 0.0
            for trigram in nltk.trigrams(words):
                sum_log_prob += lm1.log_prob(trigram)
            # Normalize by token count to compare sentences of different length.
            print("(", sum_log_prob / len(words), ")")

def main():
    """Entry point: build the language model, then score the tweets."""
    for stage in (train, test):
        stage()


if __name__ == "__main__":
    main()