import itertools
from operator import itemgetter
import os

from colour import Color, color_scale, hsl2hex
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import PathPatch
from matplotlib.lines import Line2D
plt.style.use(['fivethirtyeight'])
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler

from .benchmark import generate_tests


def set_print_options(rows=None, cols=None):
    """Sets the pandas display limits for rows and columns

    Args:
        rows (int): Maximum number of rows to print, None shows all rows
        cols (int): Maximum number of columns to print, None shows all columns
    """
    # Always apply the requested value; previously an explicit limit was
    # silently ignored and only the "show everything" default had an effect.
    pd.set_option('display.max_rows', rows)
    pd.set_option('display.max_columns', cols)


def square_fac(n):
    """Gets the factors closest to square of a number n

    Args:
        n (int): The number to factor

    Returns:
        A tuple (small, large) with small * large == n and small <= large
    """
    upper_bound = int(n**0.5) + 1
    # Walk downwards from just above sqrt(n); c == 1 always divides n so the
    # loop is guaranteed to break with a valid divisor.
    for c in range(upper_bound, 0, -1):
        if n % c == 0:
            break
    # Floor division keeps the arithmetic exact for large integers.
    rslts = [c, int(n // c)]
    return min(rslts), max(rslts)


def compute_missing_runs(runs_df):
    """Computes the runs which don't have results

    Args:
        runs_df (pd.Dataframe): A list of all the runs

    Returns:
        A tuple of (pandas dataframe of all the missing runs, total number of
        expected runs as produced by generate_tests)
    """
    tests = generate_tests()
    # Build the lookup once instead of re-materialising the ID column for
    # every candidate test id.
    completed_ids = set(runs_df['ID'].tolist())
    missing = [t for test_id, t in enumerate(tests) if test_id not in completed_ids]
    missing_df = pd.DataFrame(missing, columns=['ID', 'MODEL', 'DATASET_ID', 'TYPE', 'SEED'])
    missing_df['DATASET_ID'] = missing_df['DATASET_ID'].astype(int)
    return missing_df, len(tests)


def drop_missing_datasets(runs_df, missing_df, missing_thresh):
    """If a dataset is missing more than or equal to the missing_thresh for
    a specific combination of model and dataset, the dataset and its data
    is dropped from all models

    Args:
        runs_df (pd.Dataframe): A list of all computed runs
        missing_df (pd.Dataframe): A list of all missing runs
        missing_thresh (int): missing threshold (0-10)

    Returns:
        A tuple of (runs_df without the dropped datasets, number of dropped
        datasets that were actually present in runs_df)
    """
    counts = missing_df.groupby(['TYPE', 'MODEL'])['DATASET_ID'].value_counts()
    counts = counts[counts >= missing_thresh]
    drop_datasets = counts.index.get_level_values('DATASET_ID').values
    drop_dids = pd.unique(drop_datasets).tolist()
    # Only count datasets that have at least one completed run to remove.
    drop_num = len(set(runs_df['DATASET_ID'].values.tolist()) & set(drop_dids))
    runs_df = runs_df[~runs_df['DATASET_ID'].isin(drop_dids)]
    return runs_df, drop_num


def drop_missing_runs(runs_df, missing_df):
    """In order to make the comparisons even across models, all runs that
    did not complete in one model are removed from all models

    Args:
        runs_df (pd.Dataframe): A list of all computed runs
        missing_df (pd.Dataframe): A list of all missing runs

    Returns:
        A tuple of (runs_df without the incomplete (dataset, seed) pairs,
        number of distinct missing (dataset, seed) pairs)
    """
    drop_tuples = list(set(missing_df.set_index(['DATASET_ID', 'SEED']).index.values.tolist()))
    runs_df = runs_df.set_index(['DATASET_ID', 'SEED'])
    # Some pairs may already be gone (e.g. their whole dataset was dropped
    # earlier); those need no removal and must not raise a KeyError.
    runs_df = runs_df.drop(index=drop_tuples, errors='ignore').reset_index()
    runs_df = runs_df[['ID', 'MODEL', 'DATASET_ID', 'TYPE', 'SEED',
                       'MSE', 'R2_SCORE', 'LOGLOSS', 'F1_SCORE']]
    return runs_df, len(drop_tuples)


def split_by_type(runs_df):
    """Splits the runs into classification and regression runs

    Args:
        runs_df (pd.Dataframe): A list of runs with a 'TYPE' column

    Returns:
        A tuple of (classification runs without the regression metric
        columns, regression runs without the classification metric columns)
    """
    runs_grouped = runs_df.groupby('TYPE')
    return (runs_grouped.get_group('classification').drop(columns=['MSE', 'R2_SCORE']),
            runs_grouped.get_group('regression').drop(columns=['F1_SCORE', 'LOGLOSS']))


def data_distributions(data_df, target):
    """Plots the spread of multiple runs (seeds) across a dataframe

    Args:
        data_df (pd.Dataframe): A dataframe holding the results of the runs
        target (str): a pandas column header to represent the response variable
    """
    grouped = data_df.groupby(['MODEL', 'DATASET_ID'])
    for (model, dataset_id), df in grouped:
        # Plot the requested column (it used to be hard-coded to F1_SCORE)
        # and give every histogram a single string label.
        plt.hist(df[target].values, alpha=0.5, label='{} / {}'.format(model, dataset_id))
    plt.legend(loc='upper right')
    plt.show()


def correlation_viz(mu_df, targets):
    """Creates line plots of correlation between dataset stats and model performance

    Reads the dataset csv files and the study info csv files from the
    'datasets' directory and writes 'figures/DatasetPerformance.pdf'.

    Args:
        mu_df (pd.DataFrame): A dataframe of runs with 'TYPE', 'MODEL',
            'DATASET_ID' and the metric columns named in targets
        targets (dict(str,tuple(str,list(str)))): Maps a run type to a tuple
            of (metric column, list of dataset stat columns to plot against)
    """
    def get_true_features(d_id):
        # Dimensionality = unique values over all categorical columns plus
        # the number of numerical columns of the dataset.
        df = pd.read_csv('datasets/{}.csv'.format(d_id))
        df_types = pd.read_csv('datasets/{}_types.csv'.format(d_id))
        # Get categorical encoded column count
        df_types_cat = df_types.loc[df_types['TYPE'] == 'categorical']['NAME']
        df_cat = df[df.columns.intersection(df_types_cat.values)]
        uniques = [len(df_cat[col].unique()) for col in df_cat]
        df_types_num = df_types.loc[df_types['TYPE'] == 'numerical']['NAME']
        df_num = df[df.columns.intersection(df_types_num.values)]
        return np.sum(uniques) + len(df_num.columns)

    plt.gcf().set_size_inches(20, 15)
    meta_c_df = pd.read_csv('datasets/study_classification_info.csv')
    meta_r_df = pd.read_csv('datasets/study_regression_info.csv')
    meta_df = pd.concat([meta_c_df, meta_r_df])
    meta_df['DIMENSIONALITY'] = meta_df.apply(
        lambda row: get_true_features(row['DATASET_ID']), axis=1)
    # No explicit key: pandas merges on the columns both frames share.
    full_data = pd.merge(mu_df, meta_df, how='left')
    models = full_data['MODEL'].unique()
    # One subplot column per dataset stat (not per element of the
    # (metric, stats) tuple, which only coincidentally had the same length).
    row_size = max(len(bases) for _, bases in targets.values())
    base_colors = [hsl2hex(c) for c in color_scale((0., 0.8, 0.6), (0.8, 0.8, 0.6), len(models))]
    lines = None
    for j, TYPE in enumerate(targets):
        metric, bases = targets[TYPE]
        for i, BASE in enumerate(bases):
            all_data = full_data.loc[full_data['TYPE'] == TYPE]
            all_data = all_data[['MODEL', 'DATASET_ID', BASE, metric]]
            all_data = all_data.groupby(['MODEL', 'DATASET_ID', BASE], as_index=False).mean()
            all_data = all_data.sort_values(BASE)
            plt.subplot(len(targets), row_size, row_size*j+i+1)
            ylabel_str = metric
            if ylabel_str.lower() == 'mse':
                # The descriptive label was previously stored in an unused
                # variable and never reached the axis.
                ylabel_str = 'standardized negated mse'
            plt.xlabel(BASE.replace('_', ' ').capitalize())
            plt.ylabel("{} {}".format(TYPE.replace('_', ' ').capitalize(),
                                      ylabel_str.replace('_', ' ').capitalize()))
            local_lines = []
            for k, m in enumerate(models):
                ss = all_data.loc[all_data['MODEL'] == m]
                # Smooth both axes with a rolling median over half of the
                # points; keep the window at least 1 for tiny groups.
                window = max(1, int(len(ss) / 2))
                x = ss[BASE].rolling(window).median()
                y = ss[metric].rolling(window).median()
                line, = plt.plot(x, y, color=base_colors[k], alpha=0.7, label=m)
                local_lines.append(line)
            if lines is None:
                # The legend handles of the first subplot serve all subplots.
                lines = local_lines
    plt.figlegend(lines, models, fancybox=True, framealpha=0.0)
    plt.gcf().suptitle('Dataset Dependent Performance Analysis')
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/DatasetPerformance.pdf', dpi=plt.gcf().dpi, transparent=True)


def dataset_viz(mu_df, targets):
    """Creates histograms for given dataset, type filter and targets

    Reads the study info csv files from the 'datasets' directory and writes
    'figures/DatasetShapes.pdf'.

    Args:
        mu_df (pd.DataFrame): A dataframe of runs with a 'TYPE' column that
            can be merged with the study info files
        targets (dict(str,list(str))): Maps a run type to the study info
            column names to draw a histogram for
    """
    rows = len(targets)
    cols = max(len(bases) for bases in targets.values())
    # squeeze=False keeps a 2D axes array even for a single row or column.
    fig, axes_list = plt.subplots(rows, cols, squeeze=False)
    fig.set_size_inches(18, 8)
    meta_c_df = pd.read_csv('datasets/study_classification_info.csv')
    meta_r_df = pd.read_csv('datasets/study_regression_info.csv')
    meta_df = pd.concat([meta_c_df, meta_r_df])
    base_colors = [hsl2hex(c) for c in color_scale((0., 0.8, 0.6), (0.8, 0.8, 0.6), cols)]
    # The bins span all run types so that the rows share the same x range.
    full_data = pd.merge(mu_df, meta_df, how='left')
    for j, TYPE in enumerate(targets):
        all_data = pd.merge(mu_df.loc[mu_df['TYPE'] == TYPE], meta_df, how='left')
        for i, BASE in enumerate(targets[TYPE]):
            ax = axes_list[j][i]
            ax.set_xlabel(BASE.capitalize() + " Count (Log Scale)")
            ax.set_ylabel("{} Frequency".format(BASE.capitalize()))
            bins = np.logspace(np.log10(np.min(full_data[BASE])),
                               np.log10(np.max(full_data[BASE])), 30)
            ax.hist(all_data[BASE], bins=bins, stacked=True, color=base_colors[i],
                    alpha=0.7, edgecolor='black', linewidth=0.6)
            ax.set_xscale('log')
        # Hide only the grid cells of this row that have no histogram.
        for i in range(len(targets[TYPE]), cols):
            axes_list[j][i].axis('off')
    fig.suptitle('Content Analysis of Datasets')
    fig.subplots_adjust(hspace=0.4, wspace=0.3)
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/DatasetShapes.pdf', dpi=fig.dpi, transparent=True)


def pairwise_comp_viz(mu_df, target):
    """Creates a pairwise interaction visualization plot comparing each
    model against the other

    Writes 'figures/DatasetMean<metric>.pdf'.

    Args:
        mu_df (pd.Dataframe): A dataframe of valid runs with type and model
            as indices with aggregated means across runs
        target (str): The metric column of mu_df to compare

    Raises:
        ValueError: If mu_df holds fewer than two models
    """
    class MidpointNormalize(mpl.colors.Normalize):
        """Maps vmin/midpoint/vmax linearly onto 0/0.5/1"""

        def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
            self.midpoint = midpoint
            super().__init__(vmin, vmax, clip)

        def __call__(self, value, clip=None):
            x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
            return np.ma.masked_array(np.interp(value, x, y))

    def get_color_range(c1, c2, bins):
        # Fade from c1 up to a shared lightness and back down to c2 so the
        # middle of the colormap is the lightest.
        MAX_L = 0.7
        c1h, c1s, c1l = c1.hsl
        c2h, c2s, c2l = c2.hsl
        c1_bins = [Color(hsl=(c1h, c1s, var_l))
                   for var_l in np.linspace(c1l, MAX_L, int(bins/2))]
        c2_bins = [Color(hsl=(c2h, c2s, var_l))
                   for var_l in np.linspace(MAX_L, c2l, int(bins/2))]
        return [c.hex_l for c in (c1_bins + c2_bins)]

    def model_values(m):
        return mu_df.xs(m, level='MODEL').values

    def plot_comp(m1, m2, vmin, vmax, cmap, ax):
        m1_values = model_values(m1)
        m2_values = model_values(m2)
        # difference from y=x color mapping (not magnitude because independent)
        colors = m2_values - m1_values
        sc = ax.scatter(m1_values, m2_values, alpha=0.7, s=15, c=colors, cmap=cmap, zorder=10,
                        norm=MidpointNormalize(vmin=vmin, vmax=vmax, midpoint=0))
        ax.set_xlabel(m1)
        ax.set_ylabel(m2)
        lims = [np.min([ax.get_xlim(), ax.get_ylim()]),
                np.max([ax.get_xlim(), ax.get_ylim()])]
        # y = x reference line; points above it favour m2.
        ax.plot(lims, lims, 'k-', lw=0.7, alpha=0.7, zorder=0)
        ax.set_aspect('equal')
        ax.set_xlim(lims)
        ax.set_ylim(lims)
        return sc

    sort_order = {'auto-sklearn': 1, 'tpot': 2, 'h2o': 3, 'auto_ml': 4}
    mu_df = mu_df[target]
    # Known models keep their fixed order; unknown models sort last instead
    # of raising a KeyError.
    models = sorted(pd.unique(mu_df.index.get_level_values('MODEL').values),
                    key=lambda x: sort_order.get(x, len(sort_order) + 1))
    # combinations() already yields the pairs in the order of `models`.
    sorted_combos = list(itertools.combinations(models, 2))
    plot_count = len(sorted_combos)
    if plot_count == 0:
        raise ValueError('pairwise comparison needs at least two models')
    rows, cols = square_fac(plot_count)
    # squeeze=False keeps a 2D axes array even for a single comparison.
    fig, ax_list = plt.subplots(rows, cols, squeeze=False)
    fig.set_size_inches(17, 8)
    metric_name = target.replace('_', ' ').title() if target == 'F1_SCORE' else target
    # Never generate fewer colors than models (two models give one combo).
    base_colors = [hsl2hex(c) for c in color_scale((0, 0.7, 0.4), (1, 0.7, 0.4),
                                                   max(plot_count, len(models)))]
    model_colors = {m: c for m, c in zip(models, base_colors)}
    color_bins = 10
    scatters = []

    # get min-max of differences so every subplot shares one color scale
    vmin = np.inf
    vmax = -np.inf
    for m1, m2 in sorted_combos:
        colors = model_values(m2) - model_values(m1)
        if np.max(colors) > vmax:
            vmax = np.max(colors)
        if np.min(colors) < vmin:
            vmin = np.min(colors)

    for (m1, m2), ax in zip(sorted_combos, ax_list.ravel()):
        color_range = get_color_range(Color(model_colors[m1]), Color(model_colors[m2]), color_bins)
        cmap = mpl.colors.ListedColormap(color_range)
        scatters.append(plot_comp(m1, m2, vmin, vmax, cmap, ax))

    # The comparison used the misspelled 'F1_score' before, which never
    # matched and labelled F1 plots as "Standardized Inverted".
    ax_str = '{} Difference' if target == 'F1_SCORE' else 'Standardized Inverted {} Difference'
    for sc, ax in zip(scatters, ax_list.ravel()):
        cbar = fig.colorbar(sc, ax=ax, fraction=0.046, pad=0.08)
        cbar.ax.tick_params(labelsize=10)
        cbar.set_label(ax_str.format(metric_name), rotation=90, fontsize=8, labelpad=-57)

    fig.suptitle('Dataset Mean {} Across Frameworks'.format(metric_name))
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/DatasetMean{}.pdf'.format(metric_name.replace(' ', '')),
                dpi=fig.dpi, transparent=True)


def boxplot_viz(clean_df, target):
    """Creates a horizontal boxplot of a metric with one box per model

    Writes 'figures/RawDataBoxPlot<target>.pdf'.

    Args:
        clean_df (pd.Dataframe): A dataframe of runs indexed by model
        target (str): The metric column of clean_df to plot
    """
    clean_df = clean_df[target]
    models = pd.unique(clean_df.index.values)
    # A list of arrays also works when models have different run counts.
    data = [clean_df[m].values for m in models]
    base_colors = [hsl2hex(c) for c in color_scale((0., 0.8, 0.6), (0.8, 0.8, 0.6), len(models))]
    plt.figure(figsize=(7, 3.5))
    title_str = "Raw Per Model {} Comparison ({})".format(
        'Classification' if target == 'F1_SCORE' else 'Regression', target)
    plt.title(title_str, size=12)
    # One blank tick label per box: the legend names the models instead.
    bplot = plt.boxplot(data, vert=False, patch_artist=True, notch=True,
                        labels=[' '] * len(models),
                        positions=list(reversed(range(1, len(models)+1))))
    for p, c in zip(bplot['boxes'], base_colors):
        p.set_facecolor(c)
    plt.legend(bplot['boxes'], models, loc='lower left', prop={'size': 8},
               fancybox=True, framealpha=0.6)
    plt.setp(bplot['fliers'], markeredgecolor='grey')
    plt.setp(bplot['medians'], color='black')
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/RawDataBoxPlot{}.pdf'.format(target),
                dpi=plt.gcf().dpi, transparent=True)


def standardize_scale(runs_df, target, invert=False):
    """Standardizes and min-max scales a metric per dataset

    Args:
        runs_df (pd.Dataframe): A list of all the runs (not modified)
        target (str): The metric column to transform, 'F1_SCORE' selects the
            classification datasets, anything else the regression datasets
        invert (bool): If True the scaled values are flipped (1 - value)

    Returns:
        A copy of runs_df with the transformed target column
    """
    runs_df = runs_df.copy()
    print('Standardizing and scaling {}...'.format(target))
    m_type = 'classification' if target == 'F1_SCORE' else 'regression'
    d_ids = pd.unique(runs_df[runs_df['TYPE'] == m_type]['DATASET_ID'].values)
    for d_id in d_ids:
        in_dataset = runs_df['DATASET_ID'] == d_id
        standardized = zscore(runs_df[in_dataset][target].values).reshape((-1, 1))
        transformation = MinMaxScaler().fit_transform(standardized).ravel()
        runs_df.loc[in_dataset, target] = (1 - transformation) if invert else transformation
    return runs_df


def per_model_median_confidence(runs_df):
    """Computes the grouped median and scaled iqr by model type

    Args:
        runs_df (pd.Dataframe): A list of all the runs

    Returns:
        A tuple of pandas Dataframes that represent the median and
        1.57 * iqr / sqrt(N) of each model
    """
    overall = runs_df.drop(columns=['SEED', 'ID']).groupby(
        ['TYPE', 'MODEL', 'DATASET_ID'], as_index=False).mean()
    collected = overall.drop(columns=['DATASET_ID']).groupby(['TYPE', 'MODEL'])
    # NOTE(review): the divisor 4 presumably is the number of benchmarked
    # models, making N the number of runs per model - confirm.
    N = len(runs_df)/4
    iqr = collected.quantile(0.75) - collected.quantile(0.25)
    return collected.median(), 1.57*iqr/np.sqrt(N)


def per_model_mean(runs_df):
    """Computes the grouped mean by model type

    Args:
        runs_df (pd.Dataframe): A list of all the runs

    Returns:
        A pandas Dataframe with the mean over the per dataset means of each model
    """
    overall = runs_df.drop(columns=['SEED', 'ID']).groupby(
        ['TYPE', 'MODEL', 'DATASET_ID'], as_index=False).mean()
    collected = overall.drop(columns=['DATASET_ID']).groupby(['TYPE', 'MODEL'])
    return collected.mean()


def per_dataset_mean_std(runs_df):
    """Computes the mean and standard deviation of each dataset grouped by model

    Args:
        runs_df (pd.Dataframe): A list of all the runs

    Returns:
        A tuple of pandas Dataframes that represent the mean and standard
        deviation of each dataset by model
    """
    processed = runs_df.drop(columns=['SEED', 'ID']).groupby(['TYPE', 'MODEL', 'DATASET_ID'])
    return processed.mean(), processed.std()


def original_dataset_clean(runs_df):
    """Drops the seed and id columns and indexes the runs by type and model

    Args:
        runs_df (pd.Dataframe): A list of all the runs

    Returns:
        A pandas Dataframe indexed by ['TYPE', 'MODEL']
    """
    return runs_df.drop(columns=['SEED', 'ID']).set_index(['TYPE', 'MODEL'])


def analysis_suite():
    """An automatic suite that performs analysis on the computed results of
    the benchmarking process

    Reads './compiled_results.csv', prints summary statistics and writes the
    visualizations into the 'figures' directory.
    """
    runs_df = pd.read_csv('./compiled_results.csv')
    runs_df['R2_SCORE'] = runs_df['R2_SCORE'].abs()
    missing_df, total_run_count = compute_missing_runs(runs_df)
    runs_df, drop_d_count = drop_missing_datasets(runs_df, missing_df, 10)
    runs_df, drop_r_count = drop_missing_runs(runs_df, missing_df)

    # MSE is inverted so that, like F1, larger scaled values are better.
    scaled_df = standardize_scale(runs_df, 'MSE', invert=True)
    scaled_df = standardize_scale(scaled_df, 'F1_SCORE')
    c_df, r_df = split_by_type(scaled_df)
    raw_c_df, raw_r_df = split_by_type(runs_df)

    c_mu, c_std = per_dataset_mean_std(c_df)
    r_mu, r_std = per_dataset_mean_std(r_df)
    c_median, c_iqr = per_model_median_confidence(c_df)
    r_median, r_iqr = per_model_median_confidence(r_df)
    raw_c_mu = per_model_mean(raw_c_df)
    raw_r_mu = per_model_mean(raw_r_df)

    # NOTE(review): 40 and 4 presumably are runs per dataset and models per
    # (dataset, seed) pair - confirm against the benchmark configuration.
    total_dropped_points = drop_d_count*40+drop_r_count*4
    print('Missing by Model...\n', missing_df['MODEL'].value_counts())
    print('Total dropped datasets: ', drop_d_count)
    print('Other dropped points: ', drop_r_count)
    print('percentage {}/{}: {}'.format(total_dropped_points, total_run_count,
                                        total_dropped_points/total_run_count))
    print('Classification per model medians...\n', c_median.round(3))
    print('Classification per model iqrs...\n', c_iqr.round(3))
    print('Regression per model medians...\n', r_median.round(3))
    print('Regression per model iqrs...\n', r_iqr.round(3))
    print('Raw Classification per model means...\n', raw_c_mu.round(3))
    print('Raw Regression per model means...\n', raw_r_mu.round(3))

    print('Creating classification pairwise visualization...')
    pairwise_comp_viz(c_mu, target='F1_SCORE')
    print('Creating regression pairwise visualization...')
    pairwise_comp_viz(r_mu, target='MSE')
    print('Creating dataset visualization...')
    dataset_viz(scaled_df, targets={'classification': ['FEATURES', 'ROWS', 'CLASSES'],
                                    'regression': ['FEATURES', 'ROWS']})
    print('Creating metric correlation visualization...')
    correlation_viz(scaled_df, targets={'classification': ('F1_SCORE', ['DIMENSIONALITY', 'ROWS']),
                                        'regression': ('MSE', ['DIMENSIONALITY', 'ROWS'])})
    print('Creating classification boxplot visualization...')
    boxplot_viz(c_df.drop(columns=['ID', 'SEED', 'TYPE']).set_index(['MODEL']), target='F1_SCORE')
    print('Creating regression boxplot visualization...')
    boxplot_viz(r_df.drop(columns=['ID', 'SEED', 'TYPE']).set_index(['MODEL']), target='MSE')