# Functions performing various input/output operations for the ChaLearn AutoML challenge # Main contributor: Arthur Pesah, August 2014 # Edits: Isabelle Guyon, October 2014 # ALL INFORMATION, SOFTWARE, DOCUMENTATION, AND DATA ARE PROVIDED "AS-IS". # ISABELLE GUYON, CHALEARN, AND/OR OTHER ORGANIZERS OR CODE AUTHORS DISCLAIM # ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY PARTICULAR PURPOSE, AND THE # WARRANTY OF NON-INFRIGEMENT OF ANY THIRD PARTY'S INTELLECTUAL PROPERTY RIGHTS. # IN NO EVENT SHALL ISABELLE GUYON AND/OR OTHER ORGANIZERS BE LIABLE FOR ANY SPECIAL, # INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF SOFTWARE, DOCUMENTS, MATERIALS, # PUBLICATIONS, OR INFORMATION MADE AVAILABLE FOR THE CHALLENGE. import data_converter import data_io from data_io import vprint import numpy as np try: import cPickle as pickle except: import pickle import os import time class DataManager: ''' This class aims at loading and saving data easily with a cache and at generating a dictionary (self.info) in which each key is a feature (e.g. : name, format, feat_num,...). Methods defined here are : __init__ (...) x.__init__([(feature, value)]) -> void Initialize the info dictionary with the tuples (feature, value) given as argument. It recognizes the type of value (int, string) and assign value to info[feature]. An unlimited number of tuple can be sent. getInfo (...) x.getInfo (filename) -> void Fill the dictionary with an info file. Each line of the info file must have this format 'feature' : value The information is obtained from the public.info file if it exists, or inferred from the data files getInfoFromFile (...) x.getInfoFromFile (filename) -> void Fill the dictionary with an info file. Each line of the info file must have this format 'feature' : value getFormatData (...) x.getFormatData (filename) -> str Get the format of the file ('dense', 'sparse' or 'sparse_binary') either using the 'is_sparse' feature if it exists (for example after a call of getInfoFromFile function) and then determing if it's binary or not, or determining it alone. getNbrFeatures (...) x.getNbrFeatures (*filenames) -> int Get the number of features, using the data files given. It first checks the format of the data. If it's a matrix, the number of features is trivial. If it's a sparse file, it gets the max feature index given in every files. getTypeProblem (...) x.getTypeProblem (filename) -> str Get the kind of problem ('binary.classification', 'multiclass.classification', 'multilabel.classification', 'regression'), using the solution file given. ''' def __init__(self, basename, input_dir, verbose=False, replace_missing=True, filter_features=False): '''Constructor''' self.use_pickle = False # Turn this to true to save data as pickle (inefficient) self.basename = basename if basename in input_dir: self.input_dir = input_dir else: self.input_dir = input_dir + "/" + basename + "/" if self.use_pickle: if os.path.exists ("tmp"): self.tmp_dir = "tmp" elif os.path.exists ("../tmp"): self.tmp_dir = "../tmp" else: os.makedirs("tmp") self.tmp_dir = "tmp" info_file = os.path.join (self.input_dir, basename + '_public.info') self.info = {} self.getInfo (info_file) self.feat_type = self.loadType (os.path.join(self.input_dir, basename + '_feat.type'), verbose=verbose) self.data = {} Xtr = self.loadData (os.path.join(self.input_dir, basename + '_train.data'), verbose=verbose, replace_missing=replace_missing) Ytr = self.loadLabel (os.path.join(self.input_dir, basename + '_train.solution'), verbose=verbose) Xva = self.loadData (os.path.join(self.input_dir, basename + '_valid.data'), verbose=verbose, replace_missing=replace_missing) Xte = self.loadData (os.path.join(self.input_dir, basename + '_test.data'), verbose=verbose, replace_missing=replace_missing) # Normally, feature selection should be done as part of a pipeline. # However, here we do it as a preprocessing for efficiency reason idx=[] if filter_features: # add hoc feature selection, for the example... fn = min(Xtr.shape[1], 1000) idx = data_converter.tp_filter(Xtr, Ytr, feat_num=fn, verbose=verbose) Xtr = Xtr[:,idx] Xva = Xva[:,idx] Xte = Xte[:,idx] self.feat_idx = np.array(idx).ravel() self.data['X_train'] = Xtr self.data['Y_train'] = Ytr self.data['X_valid'] = Xva self.data['X_test'] = Xte def __repr__(self): return "DataManager : " + self.basename def __str__(self): val = "DataManager : " + self.basename + "\ninfo:\n" for item in self.info: val = val + "\t" + item + " = " + str(self.info[item]) + "\n" val = val + "data:\n" val = val + "\tX_train = array" + str(self.data['X_train'].shape) + "\n" val = val + "\tY_train = array" + str(self.data['Y_train'].shape) + "\n" val = val + "\tX_valid = array" + str(self.data['X_valid'].shape) + "\n" val = val + "\tX_test = array" + str(self.data['X_test'].shape) + "\n" val = val + "feat_type:\tarray" + str(self.feat_type.shape) + "\n" val = val + "feat_idx:\tarray" + str(self.feat_idx.shape) + "\n" return val def loadData (self, filename, verbose=True, replace_missing=True): ''' Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse''' if verbose: print("========= Reading " + filename) start = time.time() if self.use_pickle and os.path.exists (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")): with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "r") as pickle_file: vprint (verbose, "Loading pickle file : " + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")) return pickle.load(pickle_file) if 'format' not in self.info.keys(): self.getFormatData(filename) if 'feat_num' not in self.info.keys(): self.getNbrFeatures(filename) data_func = {'dense':data_io.data, 'sparse':data_io.data_sparse, 'sparse_binary':data_io.data_binary_sparse} data = data_func[self.info['format']](filename, self.info['feat_num']) # INPORTANT: when we replace missing values we double the number of variables if self.info['format']=='dense' and replace_missing and np.any(map(np.isnan,data)): vprint (verbose, "Replace missing values by 0 (slow, sorry)") data = data_converter.replace_missing(data) if self.use_pickle: with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "wb") as pickle_file: vprint (verbose, "Saving pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")) p = pickle.Pickler(pickle_file) p.fast = True p.dump(data) end = time.time() if verbose: print( "[+] Success in %5.2f sec" % (end - start)) return data def loadLabel (self, filename, verbose=True): ''' Get the solution/truth values''' if verbose: print("========= Reading " + filename) start = time.time() if self.use_pickle and os.path.exists (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")): with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "r") as pickle_file: vprint (verbose, "Loading pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")) return pickle.load(pickle_file) if 'task' not in self.info.keys(): self.getTypeProblem(filename) # IG: Here change to accommodate the new multiclass label format if self.info['task'] == 'multilabel.classification': label = data_io.data(filename) elif self.info['task'] == 'multiclass.classification': label = data_converter.convert_to_num(data_io.data(filename)) else: label = np.ravel(data_io.data(filename)) # get a column vector #label = np.array([np.ravel(data_io.data(filename))]).transpose() # get a column vector if self.use_pickle: with open (os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle"), "wb") as pickle_file: vprint (verbose, "Saving pickle file : " + os.path.join (self.tmp_dir, os.path.basename(filename) + ".pickle")) p = pickle.Pickler(pickle_file) p.fast = True p.dump(label) end = time.time() if verbose: print( "[+] Success in %5.2f sec" % (end - start)) return label def loadType (self, filename, verbose=True): ''' Get the variable types''' if verbose: print("========= Reading " + filename) start = time.time() type_list = [] if os.path.isfile(filename): type_list = data_converter.file_to_array (filename, verbose=False) else: n=self.info['feat_num'] type_list = [self.info['feat_type']]*n type_list = np.array(type_list).ravel() end = time.time() if verbose: print( "[+] Success in %5.2f sec" % (end - start)) return type_list def getInfo (self, filename, verbose=True): ''' Get all information {attribute = value} pairs from the filename (public.info file), if it exists, otherwise, output default values''' if filename==None: basename = self.basename input_dir = self.input_dir else: basename = os.path.basename(filename).split('_')[0] input_dir = os.path.dirname(filename) if os.path.exists(filename): self.getInfoFromFile (filename) vprint (verbose, "Info file found : " + os.path.abspath(filename)) # Finds the data format ('dense', 'sparse', or 'sparse_binary') self.getFormatData(os.path.join(input_dir, basename + '_train.data')) else: vprint (verbose, "Info file NOT found : " + os.path.abspath(filename)) # Hopefully this never happens because this is done in a very inefficient way # reading the data multiple times... self.info['usage'] = 'No Info File' self.info['name'] = basename # Get the data format and sparsity self.getFormatData(os.path.join(input_dir, basename + '_train.data')) # Assume no categorical variable and no missing value (we'll deal with that later) self.info['has_categorical'] = 0 self.info['has_missing'] = 0 # Get the target number, label number, target type and task self.getTypeProblem(os.path.join(input_dir, basename + '_train.solution')) if self.info['task']=='regression': self.info['metric'] = 'r2_metric' else: self.info['metric'] = 'auc_metric' # Feature type: Numerical, Categorical, or Binary # Can also be determined from [filename].type self.info['feat_type'] = 'Mixed' # Get the number of features and patterns self.getNbrFeatures(os.path.join(input_dir, basename + '_train.data'), os.path.join(input_dir, basename + '_test.data'), os.path.join(input_dir, basename + '_valid.data')) self.getNbrPatterns(basename, input_dir, 'train') self.getNbrPatterns(basename, input_dir, 'valid') self.getNbrPatterns(basename, input_dir, 'test') # Set default time budget self.info['time_budget'] = 600 return self.info def getInfoFromFile (self, filename): ''' Get all information {attribute = value} pairs from the public.info file''' with open (filename, "r") as info_file: lines = info_file.readlines() features_list = list(map(lambda x: tuple(x.strip("\'").split(" = ")), lines)) for (key, value) in features_list: self.info[key] = value.rstrip().strip("'").strip(' ') if self.info[key].isdigit(): # if we have a number, we want it to be an integer self.info[key] = int(self.info[key]) return self.info def getFormatData(self,filename): ''' Get the data format directly from the data file (in case we do not have an info file)''' if 'format' in self.info.keys(): return self.info['format'] if 'is_sparse' in self.info.keys(): if self.info['is_sparse'] == 0: self.info['format'] = 'dense' else: data = data_converter.read_first_line (filename) if ':' in data[0]: self.info['format'] = 'sparse' else: self.info['format'] = 'sparse_binary' else: data = data_converter.file_to_array (filename) if ':' in data[0][0]: self.info['is_sparse'] = 1 self.info['format'] = 'sparse' else: nbr_columns = len(data[0]) for row in range (len(data)): if len(data[row]) != nbr_columns: self.info['format'] = 'sparse_binary' if 'format' not in self.info.keys(): self.info['format'] = 'dense' self.info['is_sparse'] = 0 return self.info['format'] def getNbrFeatures (self, *filenames): ''' Get the number of features directly from the data file (in case we do not have an info file)''' if 'feat_num' not in self.info.keys(): self.getFormatData(filenames[0]) if self.info['format'] == 'dense': data = data_converter.file_to_array(filenames[0]) self.info['feat_num'] = len(data[0]) elif self.info['format'] == 'sparse': self.info['feat_num'] = 0 for filename in filenames: sparse_list = data_converter.sparse_file_to_sparse_list (filename) last_column = [sparse_list[i][-1] for i in range(len(sparse_list))] last_column_feature = [a for (a,b) in last_column] self.info['feat_num'] = max(self.info['feat_num'], max(last_column_feature)) elif self.info['format'] == 'sparse_binary': self.info['feat_num'] = 0 for filename in filenames: data = data_converter.file_to_array (filename) last_column = [int(data[i][-1]) for i in range(len(data))] self.info['feat_num'] = max(self.info['feat_num'], max(last_column)) return self.info['feat_num'] def getNbrPatterns (self, basename, info_dir, datatype): ''' Get the number of patterns directly from the data file (in case we do not have an info file)''' line_num = data_converter.num_lines(os.path.join(info_dir, basename + '_' + datatype + '.data')) self.info[datatype+'_num'] = line_num return line_num def getTypeProblem (self, solution_filename): ''' Get the type of problem directly from the solution file (in case we do not have an info file)''' if 'task' not in self.info.keys(): solution = np.array(data_converter.file_to_array(solution_filename)) target_num = solution.shape[1] self.info['target_num']=target_num if target_num == 1: # if we have only one column solution = np.ravel(solution) # flatten nbr_unique_values = len(np.unique(solution)) if nbr_unique_values < len(solution)/8: # Classification self.info['label_num'] = nbr_unique_values if nbr_unique_values == 2: self.info['task'] = 'binary.classification' self.info['target_type'] = 'Binary' else: self.info['task'] = 'multiclass.classification' self.info['target_type'] = 'Categorical' else: # Regression self.info['label_num'] = 0 self.info['task'] = 'regression' self.info['target_type'] = 'Numerical' else: # Multilabel or multiclass self.info['label_num'] = target_num self.info['target_type'] = 'Binary' if any(item > 1 for item in map(np.sum,solution.astype(int))): self.info['task'] = 'multilabel.classification' else: self.info['task'] = 'multiclass.classification' return self.info['task']