from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.ldamodel import LdaModel
from gensim import corpora, models


class LDA:
    """LDA topic extractor: fits a gensim LDA model on raw document strings
    and returns the top words of the dominant topic."""

    def __init__(self):
        print("Initializing topic extractor")
        # After get_topic() this holds the fitted LdaModel (not a plain list).
        self.topics = []

    def __tokenize(self, docs):
        """Lower-case each document and keep only words of 5+ characters.

        Returns one list of tokens per input document.
        """
        # Hoisted out of the loop: the tokenizer is stateless and reusable.
        # \w{5,} is equivalent to the original \w\w\w\w\w+ pattern.
        tokenizer = RegexpTokenizer(r'\w{5,}')
        return [tokenizer.tokenize(doc.lower()) for doc in docs]

    def __remove_stop_words(self, docs):
        """Remove English stop words from each token list."""
        # Fetch the stop-word list once (was re-fetched per document) and
        # use a set for O(1) membership tests instead of a list scan.
        en_stop = set(get_stop_words('en'))
        return [[tok for tok in doc if tok not in en_stop] for doc in docs]

    def __lemmatizer(self, docs):
        """Stem every token with the Porter stemmer, reducing words to a
        common base form."""
        stemmer = PorterStemmer()
        return [[stemmer.stem(tok) for tok in doc] for doc in docs]

    def __dt_matrix(self, terms):
        """Build the gensim dictionary and bag-of-words document-term matrix.

        Returns [corpus, dictionary].
        """
        gen_dict = corpora.Dictionary(terms)
        corpus = [gen_dict.doc2bow(term) for term in terms]
        return [corpus, gen_dict]

    def get_topic(self, doc_set, num_topics=1, num_words=3, passes=50):
        """Fit an LDA model on *doc_set* and return the top topic words.

        Parameters
        ----------
        doc_set : list[str]
            Raw documents, one string each.
        num_topics : int
            Number of topics to fit (default 1, the original behaviour).
        num_words : int
            Number of words to report for the first topic (default 3).
        passes : int
            Training passes over the corpus (default 50).

        Returns a list of the top words of the first topic.
        """
        tokens = self.__tokenize(doc_set)
        stopped = self.__remove_stop_words(tokens)
        # Stemming is intentionally disabled (as in the original);
        # re-enable the next line if base word forms are wanted.
        # stopped = self.__lemmatizer(stopped)
        corpus, dictionary = self.__dt_matrix(stopped)
        self.topics = LdaModel(corpus, num_topics=num_topics,
                               id2word=dictionary, passes=passes)
        output = self.topics.show_topics(num_topics=num_topics,
                                         num_words=num_words,
                                         log=False, formatted=True)
        # show_topics yields strings like '0.042*"word" + 0.031*"other"';
        # split on '+', take the quoted word after '*', drop the quotes,
        # and strip the surrounding whitespace the original parsing left
        # in the returned words (bug fix).
        return [part.split("*")[1].replace('"', '').strip()
                for part in output[0][1].split("+")]