# coding=utf-8
# Copyright 2019 The Authors of RL Reliability Metrics.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Class for making plots of robustness metric results and statistics."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import math
import os

from absl import logging
from matplotlib import pyplot as plt
import numpy as np

from rl_reliability_metrics.analysis import io_utils_oss as io_utils
from rl_reliability_metrics.analysis import plot_utils
from rl_reliability_metrics.analysis import stats
from rl_reliability_metrics.analysis import stats_utils

# Internal gfile dependencies

HATCH_PATTERNS = ('-', '/', '.', 'O', '+', 'o', 'x', '*', '\\')
ALGO_COLORS = ('r', 'y', 'g', 'b', 'm')
MARKERS = ('o', 's', 'v', '^', '<', '>')
TIMEFRAME_NAMES = ['Beginning', 'Middle', 'End']
UP_ARROW = r' $\uparrow$'
DOWN_ARROW = r' $\downarrow$'


class Plotter(object):
  """Class for making plots of metric results and statistics."""

  def __init__(self,
               data,
               pvals_dir,
               confidence_intervals_dir,
               n_timeframes,
               algorithms=None,
               out_dir=None,
               pthresh=0.01,
               multiple_comparisons_method='benjamini-yekutieli',
               subplot_axis_labels=True,
               make_legend=False):
    """Initialize Plotter object.

    Args:
      data: DataDef object containing all the metric results.
      pvals_dir: Path to directory containing p-values for comparisons between
        pairs of algorithms.
      confidence_intervals_dir: Path to directory containing bootstrap
        confidence intervals.
      n_timeframes: Total number of timeframes we are dividing each run into.
      algorithms: If specified, these algorithms will be plotted, in this
        order. If None, we plot all algorithms available in the data (order
        not guaranteed).
      out_dir: Path to directory where we save the plot images. If None, we
        simply display the images without saving.
      pthresh: p-value threshold for significance.
      multiple_comparisons_method: String indicating method to use for
        multiple comparisons correction. See
        stats_utils.multiple_comparisons_correction for options.
      subplot_axis_labels: Whether to add x- and y-axis labels for each
        subplot.
      make_legend: Whether to make a legend.
    """
    self.data_def = data
    self.pvals_dir = pvals_dir
    self.confidence_intervals_dir = confidence_intervals_dir
    self.n_timeframes = n_timeframes
    self.out_dir = out_dir
    self.pthresh = pthresh
    self.multiple_comparisons_method = multiple_comparisons_method
    self.subplot_axis_labels = subplot_axis_labels
    self.make_legend = make_legend

    # Parse information from data_def
    self.dataset = self.data_def.dataset
    self.algorithms = algorithms if algorithms else self.data_def.algorithms
    self.n_algo = len(self.algorithms)
    self.n_task = len(self.data_def.tasks)

    # p-value threshold(s), corrected for multiple comparisons
    self.pthresh_corrected = stats_utils.multiple_comparisons_correction(
        self.n_algo, self.pthresh, self.multiple_comparisons_method)

  def make_plots(self, metric):
    """Make all plots for a given metric.

    Args:
      metric: String name of the metric.
""" plot_utils.paper_figure_configs() # Create a metric-specific StatsRunner object stats_runner = stats.StatsRunner(self.data_def, metric, self.n_timeframes) result_dims = stats_runner.result_dims if result_dims == 'ATRP': # Within-runs metric with eval points. self._make_plots_with_eval_points(metric, stats_runner) elif result_dims == 'ATR': # Within-runs metrics without eval points (one value per run). self._make_plots_no_eval_points(metric, stats_runner) elif result_dims == 'ATP': # Across-runs metric with eval points self._make_plots_with_eval_points(metric, stats_runner) else: raise ValueError('plotting not implemented for result_dims: %s' % result_dims) def _save_fig(self, metric, plot_name): timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f') filepath = os.path.join(self.out_dir, '%s__%s__%s.png' % (metric, plot_name, timestamp)) io_utils.makedirs(os.path.dirname(filepath)) with open(filepath, 'wb') as f: plt.savefig(f) def _make_plots_with_eval_points(self, metric, stats_runner): """Make plots for a metric evaluated at multiple evaluation points per run. e.g. 'ATP' or 'ATRP' metrics. Plot 1: raw metric values per task. * One subplot per task. * Each subplot contains a plot showing the metric values across evaluation points. For ATRP metrics, we show the median metric values and fill plots indicating the IQR at each evaluation point. Plot 2: Mean rankings across tasks. * One subplot per timeframe. * One bar plot showing the mean ranking for each algorithm, and horizontal line segments indicating which pairs of algorithms are statistically different. Args: metric: String specifying the metric. stats_runner: StatsRunner object """ # Set up figure for per-task raw values. subplot_ncol_1 = 4 n_subplots_1 = self.n_task + 1 if self.make_legend else self.n_task subplot_nrow_1 = math.ceil(n_subplots_1 / subplot_ncol_1) fig1 = plt.figure(figsize=(4 * subplot_ncol_1, 4 * subplot_nrow_1)) # Set up figure for mean rankings. subplot_ncol_2 = self.n_timeframes if self.make_legend: subplot_ncol_2 += 1 subplot_nrow_2 = 1 fig2 = plt.figure(figsize=(4 * subplot_ncol_2, 4 * subplot_nrow_2)) ##=== Plot 1: Raw metric values per task ===## plt.figure(fig1.number) eval_point_idxs = stats_runner.get_timeframe_points(None) eval_point_values = self.data_def.metric_params[metric]['eval_points'] metric_results = stats_runner.load_metric_results( self.algorithms, eval_point_idxs, collapse_on_timepoints=False) result_dims = stats_runner.result_dims for i_task in range(self.n_task): plt.subplot(subplot_nrow_1, subplot_ncol_1, i_task + 1) task_results = np.squeeze(metric_results[:, i_task]) if len(eval_point_idxs) == 1: task_results = np.expand_dims(task_results, -1) if result_dims == 'ATP': # For across-run metrics, we plot a single curve. for i_algo in range(self.n_algo): plt.plot(eval_point_values, task_results[i_algo, :], marker=MARKERS[i_algo]) if self.subplot_axis_labels: plt.xlabel('evaluation points', fontsize=16) plt.ylabel('metric values', fontsize=16) elif result_dims == 'ATRP': # For per-run metrics, we plot the median and IQR across curves. 
        for i_algo in range(self.n_algo):
          algo_color = ALGO_COLORS[i_algo]
          task_algo_results = task_results[i_algo]  # n_runs x n_eval_points

          result_medians = np.median(task_algo_results, axis=0)
          result_quartile1 = np.percentile(task_algo_results, q=25, axis=0)
          result_quartile3 = np.percentile(task_algo_results, q=75, axis=0)
          plt.plot(eval_point_values, result_medians, algo_color,
                   marker=MARKERS[i_algo])
          plt.fill_between(
              eval_point_values,
              result_quartile1,
              result_quartile3,
              alpha=0.3,
              color=algo_color)
        if self.subplot_axis_labels:
          plt.xlabel('evaluation points', fontsize=16)
          plt.ylabel('metric values', fontsize=16)
      else:
        raise ValueError('result_dims must be ATP or ATRP, not %s' %
                         result_dims)

      plot_utils.simple_axis(plt.gca())
      plt.title(self.data_def.tasks[i_task])

    # plot the legend
    if self.make_legend:
      plt.subplot(subplot_nrow_1, subplot_ncol_1, n_subplots_1)
      self._lineplot_legend()

    ##=== Plot 2: Mean rankings (mean across tasks) ===##

    for timeframe in range(self.n_timeframes):
      # Load data for plotting.
      timeframe_points = stats_runner.get_timeframe_points(timeframe)
      pvals = self._load_pvals(metric, timeframe)
      confidence_intervals = self._load_confidence_intervals(
          metric, stats_runner, timeframe)

      plt.figure(fig2.number)
      metric_results = stats_runner.load_metric_results(
          self.algorithms, timeframe_points, collapse_on_timepoints=True)

      plt.subplot(subplot_nrow_2, subplot_ncol_2, timeframe + 1)
      self._plot_bars_and_significant_differences(metric_results, pvals,
                                                  confidence_intervals,
                                                  stats_runner)
      plt.title(TIMEFRAME_NAMES[timeframe], fontsize=14)

    # plot the legend
    if self.make_legend:
      plt.subplot(subplot_nrow_2, subplot_ncol_2, subplot_ncol_2)
      self._barplot_legend()

    ##=== Wrap up the figures ===##
    for fig, plot_name in [(fig1, 'per-task_raw'), (fig2, 'mean_rankings')]:
      if plot_name == 'per-task_raw':
        suptitle_suffix = (
            UP_ARROW if stats_runner.bigger_is_better else DOWN_ARROW)
      else:
        suptitle_suffix = ''
      plt.figure(fig.number)
      self._wrap_up_figure(metric, plot_name, suptitle_suffix)

  def _make_plots_no_eval_points(self, metric, stats_runner):
    """Make plots for a metric without evaluation points (one value per run).

    e.g. 'ATR' metrics.

    Plot 1: Raw metric values per task.
    * One subplot per task.
    * Each subplot contains a box-and-whisker plot showing the median metric
      values for each algorithm, a box indicating 1st and 3rd quartiles, and
      whiskers indicating the minimum and maximum values (excluding outliers,
      defined as being outside 1.5x the inter-quartile range from the 1st and
      3rd quartiles).

    Plot 2: Mean rankings across tasks.
    * One bar plot showing the mean ranking for each algorithm, and horizontal
      line segments indicating which pairs of algorithms are statistically
      different.

    Args:
      metric: String specifying the metric.
      stats_runner: StatsRunner object
    """
    # Load data for plotting.
    metric_results = stats_runner.load_metric_results(
        self.algorithms, timeframe_points=None)
    pvals = self._load_pvals(metric)
    confidence_intervals = self._load_confidence_intervals(metric,
                                                           stats_runner)

    ##=== Plot 1: Raw metric values per task ===##

    # Set up figure.
    subplot_ncol = 4
    n_subplot = self.n_task
    if self.make_legend:
      n_subplot += 1
    subplot_nrow = math.ceil(n_subplot / subplot_ncol)
    plt.figure(figsize=(4 * subplot_ncol, 4 * subplot_nrow))

    # Plot the raw metric values as box-and-whisker plots.
    for i_task in range(self.n_task):
      plt.subplot(subplot_nrow, subplot_ncol, i_task + 1)
      task_results = np.squeeze(metric_results[:, i_task, :])
      boxplot = plt.boxplot(task_results.T, patch_artist=True)
      for part in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
        plt.setp(boxplot[part], color='k')
      for i_patch, patch in enumerate(boxplot['boxes']):
        patch.set(facecolor=ALGO_COLORS[i_patch])
      plt.title(self.data_def.tasks[i_task], fontsize=16)
      self._configure_axes('Raw metric values')
      self._extend_ylims_past_zero(task_results)
      plot_utils.simple_axis(plt.gca())

    if self.make_legend:
      plt.subplot(subplot_nrow, subplot_ncol, n_subplot)
      self._barplot_legend()

    # Wrap up the figure.
    suptitle_suffix = (
        UP_ARROW if stats_runner.bigger_is_better else DOWN_ARROW)
    self._wrap_up_figure(
        metric, plot_name='per-task_raw', suptitle_suffix=suptitle_suffix)

    ##=== Plot 2: Mean rankings (mean across tasks) ===##

    # Set up figure.
    subplot_ncol = 2 if self.make_legend else 1
    subplot_nrow = 1
    plt.figure(figsize=(4 * subplot_ncol, 4 * subplot_nrow))

    # Plot mean rankings and show statistical differences
    plt.subplot(subplot_nrow, subplot_ncol, 1)
    self._plot_bars_and_significant_differences(metric_results, pvals,
                                                confidence_intervals,
                                                stats_runner)
    plot_utils.simple_axis(plt.gca())

    # plot the legend
    if self.make_legend:
      plt.subplot(subplot_nrow, subplot_ncol, subplot_ncol)
      self._barplot_legend()

    # Wrap up the figure.
    self._wrap_up_figure(metric, plot_name='mean_rankings')

  def _wrap_up_figure(self, metric, plot_name, suptitle_suffix=''):
    """Add suptitle, set tight layout, and save the figure."""
    plt.suptitle(
        plot_utils.METRICS_DISPLAY_NAMES[metric] + suptitle_suffix,
        fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    if self.out_dir:
      self._save_fig(metric, plot_name)

  def _load_pvals(self, metric, timeframe=None):
    """Load previously computed p-values.

    Args:
      metric: Which metric we are plotting.
      timeframe: Which timeframe we are plotting. Set None if irrelevant (for
        metrics that are not evaluated at specific eval points).

    Returns:
      Dictionary of p-values, with entries {'algo1.algo2': pval}
    """
    pvals = {}
    for algo1 in self.algorithms:
      for algo2 in self.algorithms:
        # Get path to p-value
        pvals_filepath = ('%s/%s_%s_%s' %
                          (self.pvals_dir, metric, algo1, algo2))
        if timeframe is not None:
          pvals_filepath += '_%d' % timeframe

        # Load the p-value
        with open(pvals_filepath, 'r') as f:
          pval = float(f.readline())
          pvals['%s.%s' % (algo1, algo2)] = pval

    logging.info('P-values loaded:')
    logging.info(pvals)

    return pvals

  def _load_confidence_intervals(self, metric, stats_runner, timeframe=None):
    """Load previously computed confidence intervals.

    Args:
      metric: Which metric we are plotting.
      stats_runner: StatsRunner object
      timeframe: Which timeframe we are plotting. Set None if irrelevant (for
        metrics that are not evaluated at specific eval points).
    Returns:
      Dictionary of confidence intervals, with entries
      {'algo': [ci_lower, ci_upper]}
    """
    cis = {}
    for algo in self.algorithms:
      # Get path to confidence intervals
      ci_filepath = '%s/%s_%s' % (self.confidence_intervals_dir, metric, algo)
      if timeframe is not None:
        ci_filepath += '_%d' % timeframe

      # Load the confidence interval
      with open(ci_filepath, 'r') as f:
        line = f.readline()
        ci = list(map(float, line.split(',')))

      # Normalize to range (1, n_algorithms)
      if 'R' in stats_runner.result_dims:
        ci[0] /= self.data_def.n_runs_per_experiment
        ci[1] /= self.data_def.n_runs_per_experiment

      cis[algo] = ci

    logging.info('Confidence intervals loaded:')
    logging.info(cis)

    return cis

  def _plot_bars_and_significant_differences(self, metric_results, pvals,
                                             confidence_intervals,
                                             stats_runner):
    """For a single timeframe, plot mean rank and show significant differences.

    Args:
      metric_results: Numpy array with metric values. First two dimensions
        should be (n_algorithm, n_task)
      pvals: p-values on comparison between each pair of algorithms. A dict
        with entries {'algo1.algo2': pvalue}.
      confidence_intervals: Confidence intervals on mean rank for each
        algorithm. A dict with entries {'algo': [ci_lower, ci_upper]}.
      stats_runner: StatsRunner object
    """
    ymax = 1.32 * len(self.algorithms)
    y_pval_lines = 0.83

    # First get the rankings across all algos
    metric_ranks = stats_runner.rank_per_task(metric_results)

    # Get mean ranks over tasks, for each algo
    # (collapse across all other dimensions)
    extra_dims = range(1, len(metric_ranks.shape))
    mean_ranks = np.mean(metric_ranks, tuple(extra_dims))

    # Normalize the ranks to range (1, n_algorithms)
    if 'R' in stats_runner.result_dims:
      mean_ranks /= self.data_def.n_runs_per_experiment

    # Plot the mean rankings and error bars for each algo
    for i_algo, algo in enumerate(self.algorithms):
      plot_utils.flipped_errorbar(
          x=i_algo,
          y=mean_ranks[i_algo],
          yerr=confidence_intervals[algo],
          ymax=self.n_algo,
          bar_color=ALGO_COLORS[i_algo],
          hatch_pattern=HATCH_PATTERNS[i_algo],
          x_offset=0.6,
      )

    # Rank order the p-values.
    if self.multiple_comparisons_method != 'bonferroni':
      # Get subset of the p-values: we don't need the reverse comparisons, and
      # we don't need the self comparisons.
      pvals_subset = {}
      for i_algo, algo1 in enumerate(self.algorithms):
        for j_algo in range(i_algo + 1, self.n_algo):
          algo2 = self.algorithms[j_algo]
          algo_str = '%s.%s' % (algo1, algo2)
          pvals_subset[algo_str] = pvals[algo_str]
      sorted_keys = sorted(pvals_subset, key=pvals_subset.get)
      pval_ranks = {key: rank for rank, key in enumerate(sorted_keys)}

    # Plot black bars indicating significant differences.
    n_lines_plotted = 0
    for i_algo, algo1 in enumerate(self.algorithms):
      for j_algo in range(i_algo + 1, self.n_algo):
        algo2 = self.algorithms[j_algo]
        algo_pair_str = '%s.%s' % (algo1, algo2)

        if self.multiple_comparisons_method != 'bonferroni':
          pval_rank = pval_ranks[algo_pair_str]
          pthresh_corrected = self.pthresh_corrected[pval_rank]
        else:
          pthresh_corrected = self.pthresh_corrected

        if pvals[algo_pair_str] < pthresh_corrected:
          x = [i_algo + 1, j_algo + 1]
          y = [(y_pval_lines + n_lines_plotted * 0.03) * ymax] * 2
          plt.plot(x, y, color='k')
          n_lines_plotted += 1

    self._configure_axes('normalized mean rank',
                         range(1, self.n_algo + 1),
                         range(self.n_algo, 0, -1))

  def _configure_axes(self, y_label, y_ticks=None, y_tick_labels=None):
    """Configure axis limits and labels."""
    algo_abbreviations = [
        plot_utils.ALGO_ABBREVIATIONS[algo] for algo in self.algorithms
    ]
    plt.xticks(range(1, self.n_algo + 1), algo_abbreviations)
    plt.xlim(0, len(self.algorithms) + 1)
    if y_ticks:
      plt.yticks(y_ticks)
    if y_tick_labels:
      plt.gca().set_yticklabels(y_tick_labels)
    if self.subplot_axis_labels:
      plt.xlabel('algorithm', fontsize=16)
      plt.ylabel(y_label, fontsize=16)
    plt.tick_params(top=False)

  @staticmethod
  def _extend_ylims_past_zero(data, tolerance=0.01, extension=0.1):
    """Extend y-axis to ensure that zero-values in the data are visible.

    Args:
      data: Data being plotted.
      tolerance: Determines what values are considered too close to zero.
      extension: Determines how far to extend the y-axis.
    """
    ylims_orig = plt.gca().get_ylim()
    abs_min = np.abs(np.min(data))
    abs_max = np.abs(np.max(data))

    # Extend below zero.
    if abs_min < tolerance * abs_max:
      ylim_lower = -ylims_orig[1] * extension
      plt.ylim([ylim_lower, ylims_orig[1]])
    # Extend above zero.
    elif abs_max < tolerance * abs_min:
      ylim_upper = -ylims_orig[0] * extension
      plt.ylim([ylims_orig[0], ylim_upper])

  def _barplot_legend(self):
    """Plot a legend showing the color/texture for each algorithm."""
    for ibox in range(self.n_algo):
      box_y = self.n_algo - ibox
      plt.scatter(
          0,
          box_y,
          s=300,
          marker='s',
          facecolor=ALGO_COLORS[ibox],
          edgecolor='k',
          hatch=HATCH_PATTERNS[ibox],
          label=HATCH_PATTERNS[ibox])
      plt.text(0.008, box_y - 0.15, self.algorithms[ibox], fontsize=14)
    plt.xlim(-0.01, 0.05)
    plot_utils.no_axis(plt.gca())

  def _lineplot_legend(self):
    """Plot a legend showing the color/marker for each algorithm."""
    for i_algo in range(self.n_algo):
      y = self.n_algo - i_algo
      color = ALGO_COLORS[i_algo]
      plt.plot([0, 2], [y, y], color=color)
      plt.plot(1, y, marker=MARKERS[i_algo], color=color)
      plt.text(2.5, y - 0.002, self.algorithms[i_algo], fontsize=14)

    ax = plt.gca()
    plot_utils.no_axis(ax)
    ax.set_facecolor('white')
    plt.xlim([0, 10])
    plt.ylim([0, self.n_algo + 1])
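
# Example usage (a minimal sketch, not part of the library API): the DataDef
# instance, directory paths, algorithm names, and metric name below are
# placeholders standing in for whatever the surrounding analysis pipeline
# produces, not values defined in this module.
#
#   my_plotter = Plotter(
#       data=my_data_def,                     # hypothetical DataDef with metric results
#       pvals_dir='/tmp/pvals',               # hypothetical dir of per-pair p-value files
#       confidence_intervals_dir='/tmp/cis',  # hypothetical dir of bootstrap CI files
#       n_timeframes=3,                       # Beginning / Middle / End
#       algorithms=['algoA', 'algoB'],        # hypothetical names, plotted in this order
#       out_dir='/tmp/plots',                 # save PNGs here; None to only display
#       make_legend=True)
#   my_plotter.make_plots('ExampleMetric')    # hypothetical metric name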