from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.ldamodel import LdaModel
from gensim import corpora, models


class LDA:
    """LDA topic extractor: fits a gensim LDA model on raw document strings
    and returns the top words of the dominant topic."""

    def __init__(self):
        print("Initializing topic extractor")
        # After get_topic() this holds the fitted LdaModel (not a plain list).
        self.topics = []

    def __tokenize(self, docs):
        """Lower-case each document and keep only words of 5+ characters.

        Returns one list of tokens per input document.
        """
        # Hoisted out of the loop: the tokenizer is stateless and reusable.
        # \w{5,} is equivalent to the original \w\w\w\w\w+ pattern.
        tokenizer = RegexpTokenizer(r'\w{5,}')
        return [tokenizer.tokenize(doc.lower()) for doc in docs]

    def __remove_stop_words(self, docs):
        """Remove English stop words from each token list."""
        # Fetch the stop-word list once (was re-fetched per document) and
        # use a set for O(1) membership tests instead of a list scan.
        en_stop = set(get_stop_words('en'))
        return [[tok for tok in doc if tok not in en_stop] for doc in docs]

    def __lemmatizer(self, docs):
        """Stem every token with the Porter stemmer, reducing words to a
        common base form."""
        stemmer = PorterStemmer()
        return [[stemmer.stem(tok) for tok in doc] for doc in docs]

    def __dt_matrix(self, terms):
        """Build the gensim dictionary and bag-of-words document-term matrix.

        Returns [corpus, dictionary].
        """
        gen_dict = corpora.Dictionary(terms)
        corpus = [gen_dict.doc2bow(term) for term in terms]
        return [corpus, gen_dict]

    def get_topic(self, doc_set, num_topics=1, num_words=3, passes=50):
        """Fit an LDA model on *doc_set* and return the top topic words.

        Parameters
        ----------
        doc_set : list[str]
            Raw documents, one string each.
        num_topics : int
            Number of topics to fit (default 1, the original behaviour).
        num_words : int
            Number of words to report for the first topic (default 3).
        passes : int
            Training passes over the corpus (default 50).

        Returns a list of the top words of the first topic.
        """
        tokens = self.__tokenize(doc_set)
        stopped = self.__remove_stop_words(tokens)
        # Stemming is intentionally disabled (as in the original);
        # re-enable the next line if base word forms are wanted.
        # stopped = self.__lemmatizer(stopped)
        corpus, dictionary = self.__dt_matrix(stopped)
        self.topics = LdaModel(corpus, num_topics=num_topics,
                               id2word=dictionary, passes=passes)
        output = self.topics.show_topics(num_topics=num_topics,
                                         num_words=num_words,
                                         log=False, formatted=True)
        # show_topics yields strings like '0.042*"word" + 0.031*"other"';
        # split on '+', take the quoted word after '*', drop the quotes,
        # and strip the surrounding whitespace the original parsing left
        # in the returned words (bug fix).
        return [part.split("*")[1].replace('"', '').strip()
                for part in output[0][1].split("+")]