# Loops through ML models for classification
# Basic code borrowed from RAYID GHANI, with extensive edits.
## https://github.com/rayidghani/magicloops/blob/master/magicloops.py

from __future__ import division
import pandas as pd
import numpy as np
import random
import os
from scipy.sparse import isspmatrix_csc, csc_matrix
from sklearn import svm, metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import *
import spacy
import traceback
from transform_features import get_feature_transformer
from model import Model


class ModelLoop():
    '''
    Fits a configurable set of scikit-learn classifiers over grids of
    hyperparameters and writes each fitted model's performance to output_dir.
    '''

    # Maps each accepted `setting` to the feature-generation runs it triggers,
    # as (run_grammar, run_tfidf, label) tuples. The label is passed to Model.
    _FEATURE_SETTINGS = {
        'all': [(True, False, 'no_tfidf'),
                (True, True, 'both'),
                (False, True, 'no_grammar')],
        'both_only': [(True, True, 'both')],
        'grammar_only': [(True, False, 'no_tfidf')],
        'tfidf_only': [(False, True, 'no_grammar')],
    }

    def __init__(self, X_train, X_test, y_train, y_test, models, iterations,
                 output_dir, thresholds=None, ks=None, ignore_columns=None,
                 method='pandas', report='full', pickle=False, roc=False,
                 unique_identifiers=None, setting='both_only'):
        '''
        Constructor for the ModelLoop.

        Inputs:
            - X_train: raw training inputs handed to the feature transformer
              (must support .duplicated() when method is 'pandas')
            - X_test: raw testing inputs, same form as X_train
            - y_train: training labels
            - y_test: testing labels
            - models: list of keys into self.clfs naming the models to run
            - iterations: maximum number of parameter sets tried per model
            - output_dir: directory that receives the model performance files
            - thresholds: list of thresholds passed to Model (default [.1])
            - ks: list of k values passed to Model (default [])
            - ignore_columns: stored as self.ignore (default [])
            - method: 'pandas' runs the data checks; 'csc' converts the
              inputs to CSC sparse matrices
            - report: type of reporting, options are 'simple' and 'full'
            - pickle: whether to pickle models (not currently supported)
            - roc: if truthy, a ROC image is written for every fitted model
            - unique_identifiers: stored on the instance (default ['id'])
            - setting: which feature sets to generate; one of 'both_only',
              'all', 'tfidf_only', 'grammar_only'

        Raises:
            - NameError: if both thresholds and ks are empty, or if setting
              is not one of the accepted values
            - AssertionError: if report is neither 'simple' nor 'full'
        '''
        # None sentinels avoid sharing one mutable default list across
        # every ModelLoop instance.
        if thresholds is None:
            thresholds = [.1]
        if ks is None:
            ks = []
        if ignore_columns is None:
            ignore_columns = []
        if unique_identifiers is None:
            unique_identifiers = ['id']

        self.raw_X_train = X_train
        self.raw_X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.models_to_run = models
        self.iterations_max = iterations
        self.output_dir = output_dir
        self.params_iter_max = 50
        self.thresholds = thresholds
        self.ks = ks
        if len(self.thresholds) == 0 and len(self.ks) == 0:
            raise NameError('Either thresholds or ks must contain values.')
        self.method = method
        self.clfs = None
        self.params = None
        self.ignore = ignore_columns
        self.X_variables = []
        self.define_clfs_params()
        self.report = report
        assert report in ('simple', 'full'), \
            "report must be 'simple' or 'full'"
        self.pickle = pickle  # Not currently supported
        self.roc = roc
        self.unique_identifiers = unique_identifiers
        self.setting = setting
        if self.setting not in list(self._FEATURE_SETTINGS):
            raise NameError('Incorrect feature setting.')

    def define_clfs_params(self):
        '''
        Defines all relevant parameters and classes for classifier objects.
        Edit these if you wish to change parameters.

        Populates self.clfs (model key -> unfitted estimator) and
        self.params (model key -> parameter grid for ParameterGrid).
        '''
        # These are the classifiers
        self.clfs = {
            'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
            'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1,
                                       criterion='entropy'),
            # max_depth must be a single integer; a list of depths here made
            # the base estimator unusable at fit time. Depth 1 is the decision
            # stump AdaBoost uses by default.
            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                     algorithm="SAMME", n_estimators=200),
            'LR': LogisticRegression(penalty='l1', C=1e5),
            'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                             max_depth=6, n_estimators=10),
            'NB': GaussianNB(),
            'DT': DecisionTreeClassifier(),
            'SGD': SGDClassifier(loss='log', penalty='l2'),
            'KNN': KNeighborsClassifier(n_neighbors=3)
        }
        # These are the parameters which will be run through
        self.params = {
            'RF': {'n_estimators': [1, 10, 100, 1000],
                   'max_depth': [10, 15, 20, 30, 40, 50, 60, 70, 100],
                   'max_features': ['sqrt', 'log2'],
                   'min_samples_split': [2, 5, 10],
                   'random_state': [1]},
            'LR': {'penalty': ['l1', 'l2'],
                   'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
                   'random_state': [1]},
            'SGD': {'loss': ['log'],
                    'penalty': ['l2', 'l1', 'elasticnet'],
                    'random_state': [1]},
            'ET': {'n_estimators': [1, 10, 100, 1000],
                   'criterion': ['gini', 'entropy'],
                   'max_depth': [1, 3, 5, 10, 15],
                   'max_features': ['sqrt', 'log2'],
                   'min_samples_split': [2, 5, 10],
                   'random_state': [1]},
            'AB': {'algorithm': ['SAMME', 'SAMME.R'],
                   'n_estimators': [1, 10, 100, 1000],
                   'random_state': [1]},
            'GB': {'n_estimators': [1, 10, 100, 1000],
                   'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
                   'subsample': [0.1, 0.5, 1.0],
                   'max_depth': [1, 3, 5, 10, 20, 50, 100],
                   'random_state': [1]},
            'NB': {},
            'DT': {'criterion': ['gini', 'entropy'],
                   'max_depth': [1, 2, 15, 20, 30, 40, 50],
                   'max_features': ['sqrt', 'log2'],
                   'min_samples_split': [2, 5, 10],
                   'random_state': [1]},
            'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
                    'kernel': ['linear'],
                    'random_state': [1]},
            'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree']}
        }

    def clf_loop(self, X_train, X_test, y_train, y_test, individuals, setting):
        '''
        Runs each model named in models_to_run with up to iterations_max
        parameter sets drawn from its grid in self.params.

        When a grid holds more parameter sets than iterations_max, a random
        subset is drawn without replacement; otherwise the grid is walked in
        order. A parameter set that raises IndexError, RuntimeError or
        AttributeError is logged and skipped.

        Inputs:
            - X_train, X_test: feature matrices passed through to Model
            - y_train, y_test: labels passed through to Model
            - individuals: passed through to Model
            - setting: feature-setting label passed through to Model
        '''
        N = 0  # running id, advanced once per successfully reported model
        self.prepare_report()
        # Resolve every classifier up front so an unknown model key fails
        # before any fitting starts.
        selected = [(name, self.clfs[name]) for name in self.models_to_run]
        for model_name, clf in selected:
            print('Running {}.'.format(model_name))
            grid = list(ParameterGrid(self.params[model_name]))
            n_runs = max(0, min(self.iterations_max, len(grid)))
            if len(grid) > self.iterations_max:
                # Sample without replacement so no iteration is spent
                # refitting a parameter set that was already tried.
                candidates = random.sample(grid, n_runs)
            else:
                candidates = grid
            # Iterating a fixed candidate list guarantees progress: a failing
            # parameter set consumes its iteration instead of being retried
            # forever.
            for iteration, p in enumerate(candidates):
                print(' Running Iteration {} of {}...'.format(
                    iteration + 1, self.iterations_max))
                try:
                    m = Model(clf, X_train, y_train, X_test, y_test, p, N,
                              model_name, iteration, self.output_dir,
                              thresholds=self.thresholds, ks=self.ks,
                              report=self.report, label='label',
                              individuals=individuals, setting=setting)
                    m.run()
                    print(' Printing to file...')
                    if not self.roc:
                        m.performance_to_file()
                    else:
                        roc_name = 'ROC_{}_{}-{}.png'.format(
                            model_name, N, iteration)
                        # join() places the image inside output_dir whether
                        # or not the directory was given a trailing slash.
                        m.performance_to_file(
                            roc=os.path.join(self.output_dir, roc_name))
                except (IndexError, RuntimeError, AttributeError) as e:
                    print(p)
                    print(N)
                    print('{}: {}'.format(type(e).__name__, e))
                    print(traceback.format_exc())
                else:
                    N += 1

    def prepare_report(self):
        '''
        Prepares the output file(s).

        Creates output_dir (including missing parent directories) if needed
        and (re)writes evaluations.csv with only its header row.
        '''
        if not os.path.isdir(self.output_dir):
            os.makedirs(self.output_dir)
        measure = 'threshold' if len(self.thresholds) > 0 else 'k'
        with open(os.path.join(self.output_dir, 'evaluations.csv'), 'w') as f:
            f.write('model_id, model_type, iteration, auc, {}, precision, recall, accuracy, params\n'.format(measure))

    def data_checks(self, dataframe):
        '''
        Checks that data are all present and there are no infinite values.

        Only active when method is 'pandas'.

        Inputs:
            - dataframe: pandas object to check; it is not modified

        Raises:
            - AssertionError: if the raw train or test inputs contain
              duplicates, or a column in X_variables has a value above 1
            - NameError: if a column in X_variables holds missing or
              infinite values
        '''
        if self.method == 'pandas':
            # Feature generation assumes that each article text is unique.
            assert self.raw_X_train.duplicated().sum() == 0, \
                'Duplicate rows found in training inputs.'
            assert self.raw_X_test.duplicated().sum() == 0, \
                'Duplicate rows found in testing inputs.'
            # Remove any infinities, replace with missing
            dataframe = dataframe.replace([np.inf, -np.inf], np.nan)
            # Find any columns with missing values
            missing_cols = []
            for column in self.X_variables:
                # Every value must be <= 1. The previous form,
                # max(column <= 1), passed as soon as ANY value was <= 1.
                # Missing values compare False here and are reported below.
                assert not (dataframe[column] > 1).any(), \
                    'Values above 1 found in column {}.'.format(column)
                if dataframe[column].isnull().any():
                    missing_cols.append(column)
            if len(missing_cols) > 0:
                raise NameError('Missing or infinite X values detected: {}'.format(missing_cols))

    def run(self):
        '''
        Executes basic data checks, generates features for each configured
        feature setting and runs the model loop on them.

        If roc is truthy, a ROC image is written to output_dir for every
        fitted model.
        '''
        # Run Data checks
        if self.method == 'pandas':
            self.data_checks(self.raw_X_train)
            self.data_checks(self.raw_X_test)
        if self.method == 'csc':
            if not isspmatrix_csc(self.raw_X_train):
                # NOTE(review): X_train/X_test assigned here are replaced by
                # the feature transformer output below; only the label
                # conversion survives.
                self.X_train = csc_matrix(self.raw_X_train)
                self.X_test = csc_matrix(self.raw_X_test)
                self.y_test = csc_matrix(self.y_test)
                self.y_train = csc_matrix(self.y_train)
        individuals = self.y_test

        # Generate features
        parser = spacy.load('en')
        feature_runs = self._FEATURE_SETTINGS[self.setting]
        for run_grammar, run_tfidf, label in feature_runs:
            print('Feature generation set to {} for this run'.format(label))
            f = get_feature_transformer(parser, run_grammar=run_grammar,
                                        run_tfidf=run_tfidf)
            self.X_train = f.fit_transform(self.raw_X_train).todense()
            self.X_test = f.transform(self.raw_X_test).todense()
            # Run the loop
            # NOTE(review): clf_loop rewrites evaluations.csv on every call,
            # so with setting 'all' confirm Model keeps results per setting.
            self.clf_loop(self.X_train, self.X_test, self.y_train,
                          self.y_test, individuals, label)