"""Parse episode transcripts into structured scene/turn data.

Reads corpus.json (a mapping of episode keys to raw transcript text),
segments each transcript into scenes and speaker turns, annotates turns
with POS tags, dialogue-act tags and LDA topics, and serialises the
results to final.json, vocabulary.json and dialogues.json.
"""

import collections
import json
import operator  # kept: possibly used by other parts of the project
import os        # kept: possibly used by other parts of the project
import re
from os import path  # kept: possibly used by other parts of the project

import nltk
from nltk.stem.porter import PorterStemmer  # kept: referenced in commented-out code
from nltk.tokenize import RegexpTokenizer

from generate_act_tag import Act_Tag
from stop_words import get_stop_words
from topic_extractor import LDA

# Matches a parenthesised stage direction inside a speaker label,
# e.g. "Sheldon (laughing)".
attribute = re.compile(r"\((.*)\)")


class turn:
    """One speaker turn: the speaker, their POS-tagged words and annotations."""

    def __init__(self, speaker, words, scene, act_tag):
        self.speaker = speaker
        self.addresse = []   # recipients; filled in later by processText
        self.topic = []      # scene topic(s); filled in later by processText
        self.words = []      # list of (token, POS-tag) pairs
        self.scene = scene
        self.act_tag = act_tag
        for token, pos in nltk.pos_tag(words):
            self.words.append((token, pos))
        self.speaker_attribute = None  # never assigned elsewhere in this file

    def addAddresse(self, p_list):
        """Set the list of people this turn is addressed to."""
        self.addresse = p_list


class Event:
    """A single scene: its description, episode/season and ordered turns."""

    def __init__(self, scene_desc, episode, season):
        self.episode = episode
        self.season = season
        self.scene_desc = scene_desc
        self.turn_list = []    # ordered `turn` objects
        self.cntr = -1
        self.participants = []

    def addConversation(self, conv):
        """Append one `turn` to the scene, preserving dialogue order."""
        self.turn_list.append(conv)

    def addParticipants(self, actors):
        """Record the speakers appearing in this scene."""
        self.participants = list(actors)


class getEpisodeTranscripts:
    """Drives transcript parsing, vocabulary building and serialisation."""

    def __init__(self):
        self.episodeInfo = {}
        self.Info = []  # list of Event, in processing order
        self.allTranscripts = {}
        self.vocabulary = collections.defaultdict(int)  # token -> frequency
        self.Stopwords = get_stop_words('en')
        # Main-cast speakers are kept verbatim; all others become "Others".
        self.impactActors = ["Leonard", "Sheldon", "Penny",
                             "Howard", "Raj", "Amy", "Bernadette"]

    def processText(self, transcript, season, episode):
        """Map one raw transcript to Event objects appended to self.Info.

        Each scene is split on "Scene:"; each subsequent line is expected
        to look like "Speaker: dialogue".
        """
        # Hoisted out of the loop: the tokenizer is invariant per line.
        tokenizer = RegexpTokenizer(r'\w+')
        for s in transcript.split("Scene:"):
            tt = s.split("\n")
            scene_desc = tt[0].strip()
            sc = Event(scene_desc, episode, season)
            actors = []
            doc_set = []
            for t in tt[1:]:
                try:
                    # maxsplit=1: dialogue may itself contain colons; the old
                    # bare split(":") raised ValueError on such lines and the
                    # broad except silently discarded them.
                    (speaker, dialogue) = t.split(":", 1)
                except ValueError:
                    continue  # line is not in "speaker: dialogue" form
                speaker = speaker.strip()
                # Remove a stage direction from the speaker label,
                # e.g. "Sheldon (laughing)" -> "Sheldon".
                if attribute.search(speaker):
                    speaker = speaker.split("(")[0].strip()
                doc_set.append(dialogue)
                words = tokenizer.tokenize(dialogue.lower())
                act_tag = Act.get_act_tag([dialogue])
                words = self.addtoVocab(words)
                if speaker not in self.impactActors:
                    speaker = "Others"
                conv = turn(speaker, words, scene_desc, act_tag)
                actors.append(speaker)
                sc.addConversation(conv)
            actors = set(actors)  # unique speakers only
            topic = None
            if doc_set:
                try:
                    topic = lda.get_topic(doc_set)
                except ValueError:
                    topic = None
            sc.addParticipants(actors)
            for c in sc.turn_list:
                # Everyone in the scene except the speaker is a recipient.
                c.addresse = [a for a in actors if a != c.speaker]
                c.topic = topic
            self.Info.append(sc)

    def addtoVocab(self, words):
        """Update global token counts and return the stop-word-filtered list."""
        w_list = self.removeStopWords(words)
        for word in w_list:
            self.vocabulary[word] += 1
        return w_list

    def removeStopWords(self, words):
        """Return `words` with English stop words removed, order preserved."""
        return [word for word in words if word not in self.Stopwords]

    def dumpValues(self):
        """Serialise scenes to final.json, counts to vocabulary.json and a
        flat text rendering to dialogues.json."""
        info = {}
        dialogue_list = []
        for scene in self.Info:
            # NOTE(review): key order here is season_episode, while
            # readTranscripts unpacks the corpus key as episode_season —
            # confirm which order the corpus actually uses.
            key = scene.season + "_" + scene.episode
            sc = {
                "Scene": scene.scene_desc,
                # NOTE(review): key is misspelled, but kept verbatim so
                # existing consumers of final.json do not break.
                "Partcipiants": scene.participants,
                "Turns": [],
            }
            dialogue_list.append(scene.scene_desc)
            for c in scene.turn_list:
                # Renamed from `turn` to avoid shadowing the `turn` class.
                turn_info = {
                    "Speaker": c.speaker,
                    "Scene": c.scene,
                    "Words": c.words,
                    "Topics": c.topic,
                    "Act_Tag": c.act_tag,
                    "Recipients": c.addresse,
                }
                dialogue_list.append(
                    c.speaker + " : " + " ".join(x[0] for x in c.words))
                sc["Turns"].append(turn_info)
            info.setdefault(key, []).append(sc)
        # `with` closes each file; the old explicit fh.close() calls were
        # redundant no-ops on already-closed handles.
        with open("final.json", "w") as fh:
            json.dump(info, fh, indent=4)
        with open("vocabulary.json", "w") as fh:
            json.dump(self.vocabulary, fh, indent=4)
        with open("dialogues.json", "w") as fh:
            json.dump(dialogue_list, fh)

    def readTranscripts(self):
        """Load corpus.json, process every episode, then dump all outputs."""
        corpus_path = "corpus.json"
        with open(corpus_path, "r") as fhandle:
            Transcripts = json.load(fhandle)
        for k in Transcripts:
            # Corpus keys look like "<episode>_<season>" — see the key-order
            # note in dumpValues.
            (episode, season) = k.split("_")
            self.processText(Transcripts[k], season, episode)
        self.dumpValues()

    def getEpisodes(self, season):
        """Stub — not implemented; returns None."""
        return


# Module-level collaborators referenced by processText.
lda = LDA()
Act = Act_Tag()

if __name__ == "__main__":
    # Guard added so importing this module no longer runs the full pipeline;
    # running it as a script behaves exactly as before.
    t = getEpisodeTranscripts()
    t.readTranscripts()