from __future__ import print_function

import os
import pickle
import pprint
import time

import numpy as np
import pandas as pd

from keras.callbacks import RemoteMonitor
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
from keras.optimizers import SGD, Adam, RMSprop
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from kerasPlotter import Plotter
from mboxConvert import parseEmails, parseEmailsCSV, getEmailStats, mboxToBinaryCSV


def get_word_features(emails, verbose=True, nb_words=5000, skip_top=0, maxlen=None,
                      as_matrix=True, matrix_type='count', label_cutoff=0.01, max_n=1):
    (totalWordsCount, fromCount, domainCount, labels) = getEmailStats(emails)
    if verbose:
        print('Creating email dataset with labels %s' % str(labels))
        print('Label word breakdown:')
        total = 0
        for label in labels:
            count = sum(totalWordsCount[label].values())
            total += count
            print('\t%s:%d' % (label, count))
        print('Total word count: %d' % total)
    labelCounts = {label: 0 for label in labels}
    for email in emails:
        labelCounts[email.label] += 1
    cutoff = int(len(emails) * label_cutoff)
    removed = 0
    # Drop labels below the count cutoff, plus Gmail's built-in pseudo-labels.
    for label in labels[:]:
        if labelCounts[label] < cutoff or label == 'Important' or label == 'Unread' or label == 'Sent':
            removed += 1
            labels.remove(label)
    labelNums = {labels[i]: i for i in range(len(labels))}
    if verbose:
        print('Found %d labels below count threshold of %d' % (removed, cutoff))
        print('Creating email dataset with labels %s' % str(labels))
        print('Label email count breakdown:')
        for label in labels:
            print('\t%s:%d' % (label, labelCounts[label]))
        print('Total emails: %d' % sum([labelCounts[label] for label in labels]))
    texts = []
    emailLabels = []
    for email in emails:
        if email.label not in labels:
            continue
        # Concatenate sender, subject, domain, recipients and body into one text blob.
        text = email.sender + " " + str(email.subject) + " "
        text += email.fromDomain
        if email.to is not None:
            text += email.to + " "
        if email.cc is not None:
            text += email.cc + " "
        text += email.content
        texts.append(text.replace('\n', '').replace('\r', ''))
        emailLabels.append(labelNums[email.label])
    emailLabels = np.array(emailLabels)
    if max_n == 1 or not as_matrix:
        tokenizer = Tokenizer(nb_words)
        tokenizer.fit_on_texts(texts)
        reverse_word_index = {tokenizer.word_index[word]: word for word in tokenizer.word_index}
        # Word indices start at 1; guard against vocabularies smaller than nb_words.
        word_list = [reverse_word_index[i + 1] for i in range(min(nb_words, len(reverse_word_index)))]
        if as_matrix:
            feature_matrix = tokenizer.texts_to_matrix(texts, mode=matrix_type)
            return feature_matrix, emailLabels, word_list, labels
        else:
            sequences = tokenizer.texts_to_sequences(texts)
            return sequences, emailLabels, word_list, labels
    else:
        if matrix_type == 'tfidf':
            vectorizer = TfidfVectorizer(ngram_range=(1, max_n), max_features=nb_words)
        else:
            vectorizer = CountVectorizer(ngram_range=(1, max_n), max_features=nb_words,
                                         binary=(matrix_type == 'binary'))
        feature_matrix = vectorizer.fit_transform(texts)
        word_list = vectorizer.get_feature_names()
        return feature_matrix, emailLabels, word_list, labels


def write_csv(csvfile, feature_matrix, labels, feature_names=None, verbose=True):
    dataframe = pd.DataFrame(data=feature_matrix, columns=feature_names)
    dataframe['label'] = labels
    dataframe.to_csv(csvfile)
    if verbose:
        print('Wrote CSV with columns %s to %s' % (str(dataframe.columns), csvfile))
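
# A minimal usage sketch of the featurizer plus the CSV writer above. Hedged:
# the 'emails.csv' path, the output filename, and this helper's name are
# illustrative assumptions, not part of the pipeline below.
def _example_build_word_matrix():
    emails = parseEmailsCSV('emails.csv')  # assumed CSV export path
    features, labels, words, label_names = get_word_features(
        emails, nb_words=1000, matrix_type='binary', max_n=1)
    write_csv('example_features.csv', features, labels, feature_names=words)
    return label_names
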
def read_csv(csvfile, verbose=True):
    dataframe = pd.read_csv(csvfile, header=0)
    labels = dataframe[u'label'].tolist()
    if verbose:
        print('Read CSV with columns %s' % str(dataframe.columns))
    dataframe.drop(u'label', inplace=True, axis=1)
    # Drop the unnamed index column pandas writes by default.
    if u'Unnamed: 0' in dataframe.columns:
        dataframe.drop(u'Unnamed: 0', inplace=True, axis=1)
    feature_matrix = dataframe.values
    feature_names = dataframe.columns
    return feature_matrix, labels, feature_names


def write_info(txtfile, label_names, verbose=True):
    with open(txtfile, 'w') as writeto:
        writeto.write(','.join(label_names))


def read_info(txtfile, verbose=True):
    with open(txtfile, 'r') as readfrom:
        label_names = readfrom.readline().strip().split(',')
    return label_names


def write_sequences(txtfile, sequences, labels, verbose=True):
    with open(txtfile, 'w') as writeto:
        for seq, label in zip(sequences, labels):
            # lol random demarcation markers so fun amirite
            writeto.write(','.join([str(x) for x in seq]) + ';;;' + str(label) + '\n')
    if verbose:
        print('Wrote txt with %d lines' % len(sequences))


def read_sequences(txtfile, verbose=True):
    sequences = []
    labels = []
    linesnum = 0
    with open(txtfile, 'r') as readfrom:
        for line in readfrom:
            linesnum += 1
            parts = line.split(';;;')
            split = parts[0].split(',')
            if len(split) <= 1:
                continue
            sequences.append(np.asarray(split, dtype=int))
            labels.append(int(parts[1]))
    if verbose:
        print('Read txt with %d lines' % linesnum)
    return sequences, labels


def make_dataset(features, labels, num_labels, test_split=0.1, nb_words=1000):
    if type(features) == list:
        # Sequence data arrives as a plain list of variable-length arrays.
        num_examples = len(features)
        random_order = np.random.permutation(num_examples)
        index_split = int(test_split * num_examples)
        train_indices = random_order[index_split:]
        test_indices = random_order[:index_split]
        X_train = [features[i] for i in train_indices]
        X_test = [features[i] for i in test_indices]
        Y_train = [labels[i] for i in train_indices]
        Y_test = [labels[i] for i in test_indices]
    else:
        # Matrix data can be row-indexed directly.
        num_examples = features.shape[0]
        random_order = np.random.permutation(num_examples)
        index_split = int(test_split * num_examples)
        train_indices = random_order[index_split:]
        test_indices = random_order[:index_split]
        X_train = features[train_indices]
        X_test = features[test_indices]
        Y_train = [labels[i] for i in train_indices]
        Y_test = [labels[i] for i in test_indices]
    Y_train_c = np_utils.to_categorical(Y_train, num_labels)
    Y_test_c = np_utils.to_categorical(Y_test, num_labels)
    return ((X_train, Y_train_c), (X_test, Y_test_c)), Y_train, Y_test


def get_emails(emailsFilePath, verbose=True):
    picklefile = 'pickled_emails.pickle'
    if os.path.isfile(picklefile):
        with open(picklefile, 'rb') as load_from:
            emails = pickle.load(load_from)
    else:
        # Uncomment to parse .mbox exported from Gmail
        # emails = parseEmails('.', printInfo=verbose)
        # Uncomment to parse CSV
        emails = parseEmailsCSV(emailsFilePath)
        with open(picklefile, 'wb') as store_to:
            pickle.dump(emails, store_to)
    return emails
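
# A minimal sketch of make_dataset on toy inputs (the shapes and values are
# illustrative assumptions): 10 examples, 3 features, 2 classes.
def _example_split_dataset():
    toy_features = np.random.rand(10, 3)
    toy_labels = [i % 2 for i in range(10)]
    dataset, Y_train, Y_test = make_dataset(toy_features, toy_labels,
                                            num_labels=2, test_split=0.2)
    (X_train, Y_train_c), (X_test, Y_test_c) = dataset
    print(X_train.shape, Y_train_c.shape)  # expected: (8, 3) (8, 2)
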
def get_ngram_data(emailsFilePath, num_words=1000, matrix_type='binary', verbose=True, max_n=1):
    # yeah yeah these can be separate functions, but lets just bundle it all up
    csvfile = 'keras_data_%d_%s.csv' % (num_words, str(matrix_type))
    infofile = 'data_info.txt'
    if os.path.isfile(csvfile):
        features, labels, feature_names = read_csv(csvfile, verbose=verbose)
        label_names = read_info(infofile)
    else:
        emails = get_emails(emailsFilePath, verbose=verbose)
        features, labels, feature_names, label_names = get_word_features(
            emails, nb_words=num_words, matrix_type=matrix_type, verbose=verbose, max_n=max_n)
        if max_n == 1:
            # Sparse n-gram matrices (max_n > 1) are not cached to CSV.
            write_csv(csvfile, features, labels, feature_names, verbose=verbose)
            write_info(infofile, label_names)
    return features, labels, feature_names, label_names


def get_my_data(per_label=False):
    csvfile = 'my_data_%s.csv' % str(per_label)
    infofile = 'data_info.txt'
    if os.path.isfile(csvfile):
        features, labels, feature_names = read_csv(csvfile)
        label_names = read_info(infofile)
    else:
        mboxToBinaryCSV('.', csvfile, perLabel=per_label)
        features, labels, feature_names = read_csv(csvfile)  # legacy code etc.
        label_names = list(set(labels))
        write_info(infofile, label_names)
    return features, labels, feature_names, label_names


def get_sequence_data():
    txtfile = 'sequence_data.txt'
    infofile = 'data_info.txt'
    if os.path.isfile(txtfile):
        features, labels = read_sequences(txtfile)
        label_names = read_info(infofile)
    else:
        emails = parseEmails('.')
        features, labels, words, label_names = get_word_features(emails, as_matrix=False)
        write_sequences(txtfile, features, labels)
        write_info(infofile, label_names)
    return features, labels, label_names


def evaluate_mlp_model(dataset, num_classes, extra_layers=0, num_hidden=512,
                       dropout=0.5, graph_to=None, verbose=True):
    (X_train, Y_train), (X_test, Y_test) = dataset
    batch_size = 32
    nb_epoch = 7
    if verbose:
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)
        print('Y_train shape:', Y_train.shape)
        print('Y_test shape:', Y_test.shape)
        print('Building model...')
    model = Sequential()
    model.add(Dense(num_hidden))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    for i in range(extra_layers):
        model.add(Dense(num_hidden))
        model.add(Activation('relu'))
        model.add(Dropout(dropout))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    plotter = Plotter(save_to_filepath=graph_to, show_plot_window=True)
    callbacks = [plotter] if graph_to else []
    history = model.fit(X_train, Y_train, epochs=nb_epoch, batch_size=batch_size,
                        verbose=1 if verbose else 0, validation_split=0.1,
                        callbacks=callbacks)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size,
                           verbose=1 if verbose else 0)
    if verbose:
        print('Test score:', score[0])
        print('Test accuracy:', score[1])
    predictions = model.predict_classes(X_test, verbose=1 if verbose else 0)
    return predictions, score[1]
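
# A minimal end-to-end sketch for the bag-of-words path. Hedged: the
# 'emails.csv' path is an assumption and the hyperparameters are arbitrary.
def _example_train_mlp():
    features, labels, feature_names, label_names = get_ngram_data(
        'emails.csv', num_words=1000, matrix_type='binary')
    dataset, Y_train, Y_test = make_dataset(features, labels, len(label_names))
    predictions, accuracy = evaluate_mlp_model(dataset, len(label_names))
    print('MLP accuracy: %f' % accuracy)
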
def evaluate_recurrent_model(dataset, num_classes, verbose=True):
    (X_train, Y_train), (X_test, Y_test) = dataset
    max_features = 20000
    maxlen = 125  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print("Pad sequences (samples x time) with maxlen %d" % maxlen)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(GRU(512))  # try using an LSTM instead, for fun
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    print("Train...")
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=15,
              validation_data=(X_test, Y_test))
    score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)
    if verbose:
        print('Test score:', score)
        print('Test accuracy:', acc)
    return acc


def evaluate_conv_model(dataset, num_classes, maxlen=125, embedding_dims=250,
                        max_features=5000, nb_filter=300, filter_length=3,
                        num_hidden=250, dropout=0.25, verbose=True,
                        pool_length=2, with_lstm=False):
    (X_train, Y_train), (X_test, Y_test) = dataset
    batch_size = 32
    nb_epoch = 7
    if verbose:
        print('Loading data...')
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    if verbose:
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)
        print('Build model...')
    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    model.add(Dropout(dropout))
    # we add a Convolution1D, which will learn nb_filter
    # word group filters of size filter_length:
    model.add(Conv1D(activation="relu", filters=nb_filter,
                     kernel_size=filter_length, strides=1, padding="valid"))
    if pool_length:
        # we use standard max pooling (halving the output of the previous layer):
        model.add(MaxPooling1D(pool_size=pool_length))
    if with_lstm:
        model.add(LSTM(125))
    else:
        # We flatten the output of the conv layer,
        # so that we can add a vanilla dense layer:
        model.add(Flatten())
        # We add a vanilla hidden layer:
        model.add(Dense(num_hidden))
        model.add(Activation('relu'))
        model.add(Dropout(dropout))
    # We project onto a num_classes-unit output layer, and squash it with a softmax:
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epoch,
              validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size,
                           verbose=1 if verbose else 0)
    if verbose:
        print('Test score:', score[0])
        print('Test accuracy:', score[1])
    predictions = model.predict_classes(X_test, verbose=1 if verbose else 0)
    return predictions, score[1]
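
# A minimal sketch of the sequence path, wiring the loaders above into the
# conv model. Hedged: it assumes sequence_data.txt / data_info.txt can be
# built from local data, and keeps the default hyperparameters.
if __name__ == '__main__':
    features, labels, label_names = get_sequence_data()
    num_labels = max(labels) + 1
    dataset, Y_train, Y_test = make_dataset(features, labels, num_labels)
    predictions, accuracy = evaluate_conv_model(dataset, num_labels)
    print('Conv model accuracy: %f' % accuracy)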