# -*- coding: utf-8 -*- """ Copyright 2016 William La Cava license: GNU/GPLv3 """ import argparse from ._version import __version__ from .evaluation import EvaluationMixin from .population import PopMixin, node from .variation import VariationMixin from .selection import SurvivalMixin from sklearn.base import BaseEstimator from sklearn.linear_model import LassoLarsCV, LogisticRegression, SGDClassifier from sklearn.svm import SVR, LinearSVR, SVC, LinearSVC from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier from sklearn.tree import export_graphviz from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.pipeline import Pipeline from sklearn.model_selection import cross_val_score, train_test_split, KFold from sklearn.metrics import r2_score, accuracy_score, roc_auc_score from sklearn.preprocessing import Imputer, StandardScaler from sklearn.utils import check_random_state from DistanceClassifier import DistanceClassifier import numpy as np import pandas as pd import warnings import copy import itertools as it import pdb from collections import defaultdict # from update_checker import update_check from sklearn.externals.joblib import Parallel, delayed from tqdm import tqdm import uuid # from profilehooks import profile # import multiprocessing as mp # NUM_THREADS = mp.cpu_count() class FEW(SurvivalMixin, VariationMixin, EvaluationMixin, PopMixin, BaseEstimator): """FEW uses GP to find a set of transformations from the original feature space that produces the best performance for a given machine learner. """ update_checked = False def __init__(self, population_size=50, generations=100, mutation_rate=0.5, crossover_rate=0.5, ml = None, min_depth = 1, max_depth = 2, max_depth_init = 2, sel = 'epsilon_lexicase', tourn_size = 2, fit_choice = None, op_weight = False, max_stall=100, seed_with_ml = True, erc = False, random_state=None, verbosity=0, scoring_function=None, disable_update_check=False, elitism=True, boolean = False,classification=False,clean=False, track_diversity=False,mdr=False,otype='f',c=True, weight_parents=True,operators=None, lex_size=False,normalize=True, names=None,dtypes=None): # sets up GP. # Save params to be recalled later by get_params() self.params = locals() # placed before any local variable definitions self.params.pop('self') # # Do not prompt the user to update during this session if they # ever disabled the update check # if disable_update_check: # FEW.update_checked = True # # # Prompt the user if their version is out of date # if not disable_update_check and not FEW.update_checked: # update_check('FEW', __version__) # FEW.update_checked = True self._best_estimator = None self._training_features = None self._training_labels = None self._best_inds = None self.population_size = population_size self.generations = generations self.mutation_rate = mutation_rate self.crossover_rate = crossover_rate self.min_depth = min_depth self.max_depth = max_depth self.max_depth_init = max_depth_init self.sel = sel self.tourn_size = tourn_size self.fit_choice = fit_choice self.op_weight = op_weight self.max_stall = max_stall self.weight_parents = weight_parents self.lex_size = lex_size self.seed_with_ml = seed_with_ml self.erc = erc self.random_state = check_random_state(random_state) self.verbosity = verbosity self.scoring_function = scoring_function self.gp_generation = 0 self.elitism = elitism self.max_fit = 99999999.666 self.boolean = boolean self.classification = classification self.clean = clean self.ml = ml #self.pipeline = Pipeline([('standardScaler',StandardScaler()), ('ml', ml)]) self.ml_type = type(self.ml).__name__ self.track_diversity = track_diversity self.mdr = mdr self.otype = otype self.normalize = normalize self.names = names # variable names if dtypes is not None: self.dtypes = dtypes.split(',') # variable data types else: self.dtypes = None # if otype is b, boolean functions must be turned on if self.otype=='b': self.boolean = True # instantiate sklearn estimator according to specified machine learner if self.ml is None: if self.classification: self.ml = LogisticRegression(solver='sag') else: self.ml = LassoLarsCV() if not self.scoring_function: if self.classification: self.scoring_function = accuracy_score else: self.scoring_function = r2_score # set default fitness metrics for various learners if not self.fit_choice: tmp_dict = defaultdict(lambda: 'r2', { #regression type(LassoLarsCV()): 'mse', type(SVR()): 'mae', type(LinearSVR()): 'mae', type(KNeighborsRegressor()): 'mse', type(DecisionTreeRegressor()): 'mse', type(RandomForestRegressor()): 'mse', #classification type(DistanceClassifier()): 'silhouette', }) self.fit_choice = tmp_dict[type(self.ml)] # Columns to always ignore when in an operator self.non_feature_columns = ['label', 'group', 'guess'] # function set if operators is None: self.func_set = [node('+'), node('-'), node('*'), node('/'), node('sin'), node('cos'), node('exp'), node('log'), node('^2'), node('^3'), node('sqrt')] else: self.func_set = [node(s) for s in operators.split(',')] # terminal set self.term_set = [] # diversity self.diversity = [] # use cython self.c = c # @profile def fit(self, features, labels): """Fit model to data""" # setup data # imputation if self.clean: features = self.impute_data(features) # save the number of features self.n_features = features.shape[1] self.n_samples = features.shape[0] # set population size if type(self.population_size) is str: if 'x' in self.population_size: # set pop size prop to features self.population_size = int( float(self.population_size[:-1])*features.shape[1]) else: self.population_size = int(self.population_size) if self.verbosity >0: print("population size:",self.population_size) # re-initialize pipeline (needs to be here rather than init for GridSearchCV) if self.normalize: self.pipeline = Pipeline([('standardScaler',StandardScaler()), ('ml', self.ml)]) else: self.pipeline = Pipeline([('ml',self.ml)]) # set variable names if they haven't been set if self.names is None: self.names = ['x_'+str(i) for i in np.arange(features.shape[1])] # set variable data types if they haven't been set if self.dtypes is None: self.dtypes = ['f' for i in np.arange(features.shape[1])] # create terminal set for i in np.arange(self.n_features): self.term_set.append(node('x',loc=i,otype=self.dtypes[i])) # features # add ephemeral random constants if flag if self.erc: # ephemeral random constants self.term_set.append(node('k',value=self.random_state.rand())) # edit function set if boolean if self.boolean or self.otype=='b': # include boolean functions self.func_set += [node('!'), node('&'), node('|'), node('=='), node('>_f'), node('<_f'), node('>=_f'), node('<=_f'), node('>_b'), node('<_b'), node('>=_b'), node('<=_b'), node('xor_b'), node('xor_f')] # add mdr if specified if self.mdr: self.func_set += [node('mdr2')] # print few settings if self.verbosity > 1: for arg in self.get_params(): print('{}\t=\t{}'.format(arg, self.get_params()[arg])) print('') ######################################################### initial model # fit to original data with warnings.catch_warnings(): warnings.simplefilter("ignore") if self.scoring_function == roc_auc_score: self._best_score = self.roc_auc_cv(features,labels) else: self._best_score = np.mean( [self.scoring_function(labels[test], self.pipeline.fit(features[train],labels[train]). predict(features[test])) for train, test in KFold().split(features, labels)]) initial_score = self._best_score if self.verbosity > 0: print("initial ML CV: {:1.3f}".format(self._best_score)) ############################################# Create initial population # for now, force seed_with_ml to be off if otype is 'b', since data # types are assumed to be float if self.otype=='b': self.seed_with_ml = False self.pop = self.init_pop() # check that uuids are unique in population uuids = [p.id for p in self.pop.individuals] if len(uuids) != len(set(uuids)): pdb.set_trace() # Evaluate the entire population # X represents a matrix of the population outputs (number of samples x # population size) # single thread self.X = self.transform(features,self.pop.individuals,labels).transpose() # pdb.set_trace() # parallel: # X = np.asarray(Parallel(n_jobs=-1)( # delayed(out)(I,features,self.otype,labels) for I in self.pop.individuals), # order = 'F') # calculate fitness of individuals # fitnesses = list(map(lambda I: fitness(I,labels,self.pipeline),X)) self.F = self.calc_fitness(self.X,labels,self.fit_choice,self.sel) #pdb.set_trace() #with Parallel(n_jobs=10) as parallel: #################### self.diversity=[] # progress bar pbar = tqdm(total=self.generations,disable = self.verbosity==0, desc='Internal CV: {:1.3f}'.format(self._best_score)) stall_count = 0 ########################################################### main GP loop for g in np.arange(self.generations): if stall_count == self.max_stall: if self.verbosity > 0: print('max stall count reached.') break if self.track_diversity: self.get_diversity(self.X) # mid verbosity printouts if self.verbosity > 1: print("generation", str(g)) print("median fitness pop: %0.2f" % np.median( [np.mean(f) for f in self.F])) print("best fitness pop: %0.2f" % np.min( [np.mean(f) for f in self.F])) if self.track_diversity: print("feature diversity: %0.2f" % self.diversity[-1]) # high verbosity printouts if self.verbosity > 2: eqns = self.stacks_2_eqns(self.pop.individuals) fs = [np.mean(f) for f in self.F] print("population:",[("%0.2f" % f, eqns[i]) for f,i in zip(np.sort(fs), np.argsort(fs))]) #print("pop fitnesses:", ["%0.2f" % np.mean(f) for f in self.F]) ####################################################### fit ml model if self.verbosity > 1: print("ml fitting...") tmp_score=0 with warnings.catch_warnings(): warnings.simplefilter("ignore") try: if self.valid_loc(): if self.scoring_function == roc_auc_score: tmp_score = self.roc_auc_cv(self.X[self.valid_loc(),:].transpose(), labels) else: tmp_score = np.mean( [self.scoring_function(labels[test], self.pipeline.fit( self.X[self.valid_loc(),:].transpose()[train], labels[train]). predict(self.X[self.valid_loc(),:].transpose()[test])) for train, test in KFold().split(features, labels)]) except ValueError as detail: print("warning: ValueError in ml fit. X.shape:", self.X[:,self.valid_loc()].transpose().shape, "labels shape:",labels.shape) print("First ten entries X:", self.X[self.valid_loc(),:].transpose()[:10]) print("First ten entries labels:",labels[:10]) print("equations:",self.stacks_2_eqns(self.pop.individuals)) print("FEW parameters:",self.get_params()) print("---\ndetailed error message:", detail) raise(detail) if self.verbosity > 1: print("current ml validation score:",tmp_score) #################################################### save best model if self.valid_loc() and tmp_score > self._best_score: self._best_estimator = copy.deepcopy(self.pipeline) self._best_score = tmp_score stall_count = 0; self._best_inds = copy.deepcopy(self.valid()) if self.verbosity > 1: print("updated best internal CV:",self._best_score) else: stall_count = stall_count + 1 ########################################################## variation if self.verbosity > 2: print("variation...") offspring,elite,elite_index = self.variation(self.pop.individuals) ################################################# evaluate offspring if self.verbosity > 2: print("output...") X_offspring = self.transform(features,offspring).transpose() if self.verbosity > 2: print("fitness...") F_offspring = self.calc_fitness(X_offspring, labels,self.fit_choice,self.sel) ########################################################### survival if self.verbosity > 2: print("survival..") survivors,survivor_index = self.survival(self.pop.individuals, offspring, elite, elite_index, X = self.X, X_O=X_offspring, F=self.F, F_O=F_offspring) # set survivors self.pop.individuals[:] = survivors self.X = np.vstack((self.X, X_offspring))[survivor_index] if 'lexicase' in self.sel: self.F = np.asarray( np.vstack((self.F, F_offspring))[survivor_index], order='F') else: self.F = np.asarray( np.hstack((self.F,F_offspring))[survivor_index], order='F') if self.verbosity > 2: print("median fitness survivors: %0.2f" % np.median( [np.mean(f) for f in self.F])) if self.verbosity>2: print("best features:", self.stacks_2_eqns(self._best_inds) if self._best_inds else 'original') pbar.set_description('Internal CV: {:1.3f}'.format(self._best_score)) pbar.update(1) # end of main GP loop #################### if self.verbosity > 0: print('finished. best internal val score:' ' {:1.3f}'.format(self._best_score)) if self.verbosity > 0: print("final model:\n",self.print_model()) if not self._best_estimator: # if no better model found, just return underlying method fit to the # training data with warnings.catch_warnings(): warnings.simplefilter("ignore") self._best_estimator = self.pipeline.fit(features,labels) else: # fit final estimator to all the training data with warnings.catch_warnings(): warnings.simplefilter("ignore") self._best_estimator.fit(self.transform(features),labels) return self def transform(self,x,inds=None,labels = None): """return a transformation of x using population outputs""" if inds: # return np.asarray(Parallel(n_jobs=10)(delayed(self.out)(I,x,labels,self.otype) # for I in inds)).transpose() return np.asarray( [self.out(I,x,labels,self.otype) for I in inds]).transpose() elif self._best_inds: # return np.asarray(Parallel(n_jobs=10)(delayed(self.out)(I,x,labels,self.otype) # for I in self._best_inds)).transpose() return np.asarray( [self.out(I,x,labels,self.otype) for I in self._best_inds]).transpose() else: return x def impute_data(self,x): """Imputes data set containing Nan values""" imp = Imputer(missing_values='NaN', strategy='mean', axis=0) return imp.fit_transform(x) def clean(self,x): """remove nan and inf rows from x""" return x[~np.any(np.isnan(x) | np.isinf(x),axis=1)] def clean_with_zeros(self,x): """ set nan and inf rows from x to zero""" x[~np.any(np.isnan(x) | np.isinf(x),axis=1)] = 0 return x def predict(self, testing_features): """predict on a holdout data set.""" # print("best_inds:",self._best_inds) # print("best estimator size:",self._best_estimator.coef_.shape) if self.clean: testing_features = self.impute_data(testing_features) if self._best_inds: X_transform = self.transform(testing_features) try: return self._best_estimator.predict(self.transform(testing_features)) except ValueError as detail: # pdb.set_trace() print('shape of X:',testing_features.shape) print('shape of X_transform:',X_transform.transpose().shape) print('best inds:',self.stacks_2_eqns(self._best_inds)) print('valid locs:',self.valid_loc(self._best_inds)) raise ValueError(detail) else: return self._best_estimator.predict(testing_features) def fit_predict(self, features, labels): """Convenience function that fits a pipeline then predicts on the provided features Parameters ---------- features: array-like {n_samples, n_features} Feature matrix labels: array-like {n_samples} List of class labels for prediction Returns ---------- array-like: {n_samples} Predicted labels for the provided features """ self.fit(features, labels) return self.predict(features) def score(self, testing_features, testing_labels): """estimates accuracy on testing set""" # print("test features shape:",testing_features.shape) # print("testing labels shape:",testing_labels.shape) yhat = self.predict(testing_features) return self.scoring_function(testing_labels,yhat) def export(self, output_file_name): """exports engineered features Parameters ---------- output_file_name: string String containing the path and file name of the desired output file Returns ------- None """ if self._best_estimator is None: raise ValueError('A model has not been optimized. Please call fit()' ' first.') # Write print_model() to file with open(output_file_name, 'w') as output_file: output_file.write(self.print_model()) # if decision tree, print tree into dot file if 'DecisionTree' in self.ml_type: export_graphviz(self._best_estimator, out_file=output_file_name+'.dot', feature_names = self.stacks_2_eqns(self._best_inds) if self._best_inds else None, class_names=['True','False'], filled=False,impurity = True,rotate=True) def print_model(self,sep='\n'): """prints model contained in best inds, if ml has a coefficient property. otherwise, prints the features generated by FEW.""" model = '' # print('ml type:',self.ml_type) # print('ml:',self._best_estimator) if self._best_inds: if self.ml_type == 'GridSearchCV': ml = self._best_estimator.named_steps['ml'].best_estimator_ else: ml = self._best_estimator.named_steps['ml'] if self.ml_type != 'SVC' and self.ml_type != 'SVR': # this is need because svm has a bug that throws valueerror on # attribute check if hasattr(ml,'coef_'): if len(ml.coef_.shape)==1: s = np.argsort(np.abs(ml.coef_))[::-1] scoef = ml.coef_[s] bi = [self._best_inds[k] for k in s] model = (' +' + sep).join( [str(round(c,3))+'*'+self.stack_2_eqn(f) for i,(f,c) in enumerate(zip(bi,scoef)) if round(scoef[i],3) != 0]) else: # more than one decision function is fit. print all. for j,coef in enumerate(ml.coef_): s = np.argsort(np.abs(coef))[::-1] scoef = coef[s] bi =[self._best_inds[k] for k in s] model += sep + 'class'+str(j)+' :'+' + '.join( [str(round(c,3))+'*'+self.stack_2_eqn(f) for i,(f,c) in enumerate(zip(bi,coef)) if coef[i] != 0]) elif hasattr(ml,'feature_importances_'): s = np.argsort(ml.feature_importances_)[::-1] sfi = ml.feature_importances_[s] bi = [self._best_inds[k] for k in s] # model = 'importance:feature'+sep model += sep.join( [str(round(c,3))+':'+self.stack_2_eqn(f) for i,(f,c) in enumerate(zip(bi,sfi)) if round(sfi[i],3) != 0]) else: return sep.join(self.stacks_2_eqns(self._best_inds)) else: return sep.join(self.stacks_2_eqns(self._best_inds)) else: return 'original features' return model def representation(self): """return stacks_2_eqns output""" return self.stacks_2_eqns(self._best_inds) def valid_loc(self,F=None): """returns the indices of individuals with valid fitness.""" if F is not None: return [i for i,f in enumerate(F) if np.all(f < self.max_fit) and np.all(f >= 0)] else: return [i for i,f in enumerate(self.F) if np.all(f < self.max_fit) and np.all(f >= 0)] def valid(self,individuals=None,F=None): """returns the sublist of individuals with valid fitness.""" if F: valid_locs = self.valid_loc(F) else: valid_locs = self.valid_loc(self.F) if individuals: return [ind for i,ind in enumerate(individuals) if i in valid_locs] else: return [ind for i,ind in enumerate(self.pop.individuals) if i in valid_locs] def get_params(self, deep=None): """Get parameters for this estimator This function is necessary for FEW to work as a drop-in feature constructor in, e.g., sklearn.model_selection.cross_val_score Parameters ---------- deep: unused Only implemented to maintain interface for sklearn Returns ------- params: mapping of string to any Parameter names mapped to their values """ return self.params def get_diversity(self,X): """compute mean diversity of individual outputs""" # diversity in terms of cosine distances between features feature_correlations = np.zeros(X.shape[0]-1) for i in np.arange(1,X.shape[0]-1): feature_correlations[i] = max(0.0,r2_score(X[0],X[i])) # pdb.set_trace() self.diversity.append(1-np.mean(feature_correlations)) def roc_auc_cv(self,features,labels): """returns an roc auc score depending on the underlying estimator.""" if callable(getattr(self.ml, "decision_function", None)): return np.mean([self.scoring_function(labels[test], self.pipeline.fit(features[train],labels[train]). decision_function(features[test])) for train, test in KFold().split(features, labels)]) elif callable(getattr(self.ml, "predict_proba", None)): return np.mean([self.scoring_function(labels[test], self.pipeline.fit(features[train],labels[train]). predict_proba(features[test])[:,1]) for train, test in KFold().split(features, labels)]) else: raise ValueError("ROC AUC score won't work with " + self.ml_type + ". No " "decision_function or predict_proba method found for this learner.") def positive_integer(value): """Ensures that the provided value is a positive integer; throws an exception otherwise Parameters ---------- value: int The number to evaluate Returns ------- value: int Returns a positive integer """ try: value = int(value) except Exception: raise argparse.ArgumentTypeError( 'Invalid int value: \'{}\''.format(value)) if value < 0: raise argparse.ArgumentTypeError( 'Invalid positive int value: \'{}\''.format(value)) return value def float_range(value): """Ensures that the provided value is a float integer in the range (0., 1.) throws an exception otherwise Parameters ---------- value: float The number to evaluate Returns ------- value: float Returns a float in the range (0., 1.) """ try: value = float(value) except: raise argparse.ArgumentTypeError( 'Invalid float value: \'{}\''.format(value)) if value < 0.0 or value > 1.0: raise argparse.ArgumentTypeError( 'Invalid float value: \'{}\''.format(value)) return value # dictionary of ml options ml_dict = { 'lasso': LassoLarsCV(), 'svr': SVR(), 'lsvr': LinearSVR(), 'lr': LogisticRegression(solver='sag'), 'sgd': SGDClassifier(loss='log',penalty='l1'), 'svc': SVC(), 'lsvc': LinearSVC(), 'rfc': RandomForestClassifier(), 'rfr': RandomForestRegressor(), 'dtc': DecisionTreeClassifier(), 'dtr': DecisionTreeRegressor(), 'dc': DistanceClassifier(), 'knc': KNeighborsClassifier(), 'knr': KNeighborsRegressor(), None: None } # main functions def main(): """Main function that is called when FEW is run on the command line""" parser = argparse.ArgumentParser(description='A feature engineering wrapper' ' for machine learning algorithms.', add_help=False) parser.add_argument('INPUT_FILE', type=str, help='Data file to run FEW on; ensure that the ' 'target/label column is labeled as "label" or "class".') parser.add_argument('-h', '--help', action='help', help='Show this help message and exit.') parser.add_argument('-is', action='store',dest='INPUT_SEPARATOR', default=None,type=str, help='Character separating columns in the input file.') parser.add_argument('-o', action='store', dest='OUTPUT_FILE', default='', type=str, help='File to export the final model.') parser.add_argument('-g', action='store', dest='GENERATIONS', default=100, type=positive_integer, help='Number of generations to run FEW.') parser.add_argument('-p', action='store', dest='POPULATION_SIZE',default=50, help='Number of individuals in the GP population. ' 'Follow the number with x to set population size as a' 'multiple of raw feature size.') parser.add_argument('-mr', action='store', dest='MUTATION_RATE',default=0.5, type=float_range, help='GP mutation rate in the range [0.0, 1.0].') parser.add_argument('-xr', action='store', dest='CROSSOVER_RATE', default=0.5,type=float_range, help='GP crossover rate in the range [0.0, 1.0].') parser.add_argument('-ml', action='store', dest='MACHINE_LEARNER', default=None, choices = ['lasso','svr','lsvr','lr','svc','rfc','rfr', 'dtc','dtr','dc','knc','knr','sgd'], type=str, help='ML algorithm to pair with features. ' 'Default: Lasso (regression), LogisticRegression ' '(classification)') parser.add_argument('-min_depth', action='store', dest='MIN_DEPTH', default=1,type=positive_integer, help='Minimum length of GP programs.') parser.add_argument('-max_depth', action='store', dest='MAX_DEPTH', default=2,type=positive_integer, help='Maximum number of nodes in GP programs.') parser.add_argument('-max_depth_init', action='store',dest='MAX_DEPTH_INIT', default=2,type=positive_integer, help='Maximum nodes in initial programs.') parser.add_argument('-op_weight', action='store',dest='OP_WEIGHT',default=1, type=bool, help='Weight attributes for incuded in' ' features based on ML scores. Default: off') parser.add_argument('-ms', action='store', dest='MAX_STALL',default=100, type=positive_integer, help='If model CV does not ' 'improve for this many generations, end optimization.') parser.add_argument('--weight_parents', action='store_true', dest='WEIGHT_PARENTS',default=True, help='Feature importance weights parent selection.') parser.add_argument('--lex_size', action='store_true',dest='LEX_SIZE',default=False, help='Size mediated parent selection for lexicase survival.') parser.add_argument('-sel', action='store', dest='SEL', default='epsilon_lexicase', choices = ['tournament','lexicase','epsilon_lexicase', 'deterministic_crowding','random'], type=str, help='Selection method (Default: tournament)') parser.add_argument('-tourn_size', action='store', dest='TOURN_SIZE', default=2, type=positive_integer, help='Tournament size (Default: 2)') parser.add_argument('-fit', action='store', dest='FIT_CHOICE', default=None, choices = ['mse','mae','r2','vaf','mse_rel','mae_rel', 'r2_rel','vaf_rel','silhouette','inertia', 'separation','fisher','random','relief'], type=str, help='Fitness metric (Default: dependent on ml used)') parser.add_argument('--no_seed', action='store_false', dest='SEED_WITH_ML', default=True, help='Turn off initial GP population seeding.') parser.add_argument('--elitism', action='store_true', dest='ELITISM', default=False, help='Force survival of best feature in GP population.') parser.add_argument('--erc', action='store_true', dest='ERC', default=False, help='Use random constants in GP feature construction.') parser.add_argument('--bool', action='store_true', dest='BOOLEAN', default=False, help='Include boolean operators in features.') parser.add_argument('-otype', action='store', dest='OTYPE', default='f', choices=['f','b'], type=str, help='Feature output type. f: float, b: boolean.') parser.add_argument('-ops', action='store', dest='OPS', default=None, type=str, help='Specify operators separated by commas') parser.add_argument('-dtypes', action='store', dest='DTYPES', default=None, type=str, help='Specify datafile types separated by a comma') parser.add_argument('--class', action='store_true', dest='CLASSIFICATION', default=False, help='Conduct classification rather than regression.') parser.add_argument('--mdr', action='store_true',dest='MDR',default=False, help='Use MDR nodes.') parser.add_argument('--nonorm', action='store_false',dest='NORMALIZE',default=True, help='Disable standard scaler preprocessor.') parser.add_argument('--diversity', action='store_true', dest='TRACK_DIVERSITY', default=False, help='Store diversity of feature transforms each gen.') parser.add_argument('--clean', action='store_true', dest='CLEAN', default=False, help='Clean input data of missing values.') parser.add_argument('--no_lib', action='store_false', dest='c', default=True, help='Don''t use optimized c libraries.') parser.add_argument('-s', action='store', dest='RANDOM_STATE', default=None, type=int, help='Random number generator seed for reproducibility.' 'Note that using multi-threading may make exact results' ' impossible to reproduce.') parser.add_argument('-v', action='store', dest='VERBOSITY', default=1, choices=[0, 1, 2, 3], type=int, help='How much information FEW communicates while it is' ' running: 0 = none, 1 = minimal, 2 = lots, 3 = all.') parser.add_argument('--no-update-check', action='store_true', dest='DISABLE_UPDATE_CHECK', default=False, help='Don''t check the FEW version.') parser.add_argument('--version', action='version', version='FEW {version}'.format(version=__version__), help='Show FEW\'s version number and exit.') args = parser.parse_args() # if args.VERBOSITY >= 2: # print('\nFEW settings:') # for arg in sorted(args.__dict__): # if arg == 'DISABLE_UPDATE_CHECK': # continue # print('{}\t=\t{}'.format(arg, args.__dict__[arg])) # print('') # load data from csv file if args.INPUT_SEPARATOR is None: input_data = pd.read_csv(args.INPUT_FILE, sep=args.INPUT_SEPARATOR, engine='python') else: # use c engine for read_csv is separator is specified input_data = pd.read_csv(args.INPUT_FILE, sep=args.INPUT_SEPARATOR) # if 'Label' in input_data.columns.values: input_data.rename(columns={'Label': 'label','Class':'label','class':'label', 'target':'label'}, inplace=True) RANDOM_STATE = args.RANDOM_STATE train_i, test_i = train_test_split(input_data.index, stratify = None, #stratify=input_data['label'].values, train_size=0.75, test_size=0.25, random_state=RANDOM_STATE) training_features = input_data.loc[train_i].drop('label', axis=1).values training_labels = input_data.loc[train_i, 'label'].values testing_features = input_data.loc[test_i].drop('label', axis=1).values testing_labels = input_data.loc[test_i, 'label'].values learner = FEW(generations=args.GENERATIONS, population_size=args.POPULATION_SIZE, mutation_rate=args.MUTATION_RATE, crossover_rate=args.CROSSOVER_RATE, ml = ml_dict[args.MACHINE_LEARNER], min_depth = args.MIN_DEPTH,max_depth = args.MAX_DEPTH, sel = args.SEL, tourn_size = args.TOURN_SIZE, seed_with_ml = args.SEED_WITH_ML, op_weight = args.OP_WEIGHT, max_stall = args.MAX_STALL, erc = args.ERC, random_state=args.RANDOM_STATE, verbosity=args.VERBOSITY, disable_update_check=args.DISABLE_UPDATE_CHECK, fit_choice = args.FIT_CHOICE,boolean=args.BOOLEAN, classification=args.CLASSIFICATION,clean = args.CLEAN, track_diversity=args.TRACK_DIVERSITY,mdr=args.MDR, otype=args.OTYPE,c=args.c, lex_size = args.LEX_SIZE, weight_parents = args.WEIGHT_PARENTS,operators=args.OPS, normalize=args.NORMALIZE, dtypes = args.DTYPES) learner.fit(training_features, training_labels) # pdb.set_trace() if args.VERBOSITY >= 1: print('\nTraining accuracy: {:1.3f}'.format( learner.score(training_features, training_labels))) print('Test accuracy: {:1.3f}'.format( learner.score(testing_features, testing_labels))) if args.OUTPUT_FILE != '': learner.export(args.OUTPUT_FILE) if __name__ == '__main__': main()