# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Core module for //ads/metrics/lib/meterstick."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
from typing import List, Union

import attr
from meterstick import comparisons
from meterstick import metrics as metrics_module
from meterstick import pdutils
import numpy as np
import pandas as pd
from six import string_types

try:
  # The container ABCs (Iterable, etc.) live in collections.abc on Python 3;
  # the top-level aliases (collections.Iterable) were removed in Python 3.10.
  import collections.abc as _collections_abc
except ImportError:
  # Python 2: the ABCs live directly on the collections module.
  _collections_abc = collections


def _merge_metrics(row):
  """Joins the non-null entries of a row into a single "::"-separated string.

  Args:
    row: An iterable of values, possibly containing nulls (NaN/None).

  Returns:
    None if every entry is null, the single entry (stringified) if exactly one
    is non-null, otherwise all non-null entries joined with "::".
  """
  non_empty = [str(r) for r in row if pd.notnull(r)]
  if not non_empty:
    return None
  elif len(non_empty) == 1:
    return non_empty[0]
  else:
    return "::".join(non_empty)


def _decode_object_columns(data, columns, encoding):
  """Decodes byte-string columns of a dataframe to text, in place.

  Columns that will become part of an index are explicitly decoded to avoid
  UnicodeDecodeErrors later. Since pandas has no way to distinguish between
  bytes and strings, we just try decoding and catch any failures.

  Args:
    data: A pandas DataFrame; modified in place.
    columns: Column names to (attempt to) decode.
    encoding: String encoding to use, e.g. "utf8".
  """
  for col in columns:
    # Only convert columns of dtype "object"; other dtypes cannot hold bytes.
    if data[col].dtype == np.object_:
      try:
        decoded = data[col].str.decode(encoding)
        # Non-bytes entries decode to NaN; keep their original values.
        data.loc[~decoded.isna(), col] = decoded[~decoded.isna()]
      except (UnicodeDecodeError, UnicodeEncodeError):
        # Best-effort: leave the column untouched if it cannot be decoded.
        pass


# TODO(dlsun): Remove AnalysisParameters and incorporate the
# attributes directly into the Analyze object.
@attr.s()
class AnalysisParameters(object):
  """Class for the parameters of a data analysis.

  Attributes:
    metrics: A Metric or a list of Metric objects specifying which metric(s) to
      compute.
    split_index: A Pandas index object.
    split_vars: A variable or a list of variables to split the data upon.
    comparison: A Comparison object which performs the comparison.
    se_method: A function which calculates standard errors.
    sort: If to sort on split_vars.
  """
  metrics = attr.ib(default=None)
  split_index = attr.ib(default=None, type=pd.Index)
  split_vars = attr.ib(factory=list, type=Union[str, List[str], None])
  comparison = attr.ib(default=None)
  se_method = attr.ib(default=None)
  sort = attr.ib(type=bool, default=False)


class Analyze(object):
  """Base analysis class; provides base functionality for data analysis.

  Attributes:
    data: A pandas dataframe.
    parameters: An AnalysisParameters object.
  """

  def __init__(self, data, parameters=None):
    """Initializes Analysis object.

    Args:
      data: A pandas dataframe.
      parameters: An AnalysisParameters object.
    """
    # Copy so later in-place mutations (decoding, filtering) don't touch the
    # caller's dataframe.
    self.data = data.copy()
    self.parameters = parameters or AnalysisParameters()

  def where(self, query):
    """Subsets the data according to a query.

    Args:
      query: A string which evaluates to an array of Booleans.

    Returns:
      An Analysis object.

    Raises:
      ValueError: Query does not evaluate to an array of Booleans.
    """
    query_results = self.data.eval(query).values
    # Note: np.bool was removed in NumPy 1.24; the builtin bool compares
    # equal to a boolean dtype and works on every NumPy version.
    if query_results.dtype == bool:
      self.data = self.data[query_results]
    else:
      raise ValueError("The query (%s) does not evaluate to "
                       "an array of Booleans." % query)
    return self

  def split_by(self, split_vars, expand=False, sort=True, encoding="utf8"):
    """Splits the analysis by categorical variables.

    Args:
      split_vars: A string or a list of strings representing variables to
        split upon.
      expand: A boolean indicating whether the index that is created should
        contain the full expanded product of all possible combinations of
        levels of the variables in split_vars. Otherwise, the index will only
        contain combinations that were actually observed in the data set.
      sort: A boolean indicating whether or not the levels in each slice
        should be sorted.
      encoding: String encoding used to decode byte-string split columns.

    Returns:
      An Analysis object with the split_vars attribute set to input.

    Raises:
      ValueError: Split variables are already defined.
      TypeError: Split variable is not a string or list of strings.
    """
    if self.parameters.split_vars:
      raise ValueError("Split variables are already defined.")

    if isinstance(split_vars, string_types):
      self.parameters.split_vars = [split_vars]
    else:
      try:
        all_strings = all(isinstance(var, string_types) for var in split_vars)
      except TypeError:
        # Iterating a non-iterable raises TypeError (not KeyError, which the
        # previous version caught and therefore never triggered this message).
        raise TypeError("Split variable is not a string or list of strings.")
      if not all_strings:
        raise TypeError("Split variable is not a string or list of strings.")
      self.parameters.split_vars = split_vars

    # Explicitly decode columns that will become part of the index,
    # to avoid UnicodeDecodeErrors.
    _decode_object_columns(self.data, self.parameters.split_vars, encoding)

    # Determine the index for split_vars
    self.parameters.split_index = pdutils.index_product_from_vars(
        self.data, self.parameters.split_vars, expand)
    self.parameters.sort = sort
    return self

  def relative_to(self, comparison, sort=True):
    """Specifies the comparison for the analysis.

    Args:
      comparison: A Comparison object.
      sort: Boolean indicating whether to sort the conditions by name.

    Returns:
      An Analysis object with the comparison attribute set to the input.

    Raises:
      ValueError: Comparison is already defined.
    """
    if self.parameters.comparison is not None:
      raise ValueError("Comparison is already defined.")
    comparison.sort = sort
    # Deep-copy so that mutating the caller's comparison later doesn't change
    # this analysis (and vice versa).
    self.parameters.comparison = copy.deepcopy(comparison)
    return self

  def with_standard_errors(self, method):
    """Specifies standard error method for analysis.

    Args:
      method: A standard error method.

    Returns:
      An Analysis object with the standard error method set.

    Raises:
      ValueError: Standard error method is already defined.
    """
    if self.parameters.se_method is not None:
      raise ValueError("Standard error method is already defined.")
    self.parameters.se_method = copy.deepcopy(method)
    return self

  def calculate(self, metrics):
    """Specifies metrics to calculate.

    Args:
      metrics: A Metric object or a list of Metric objects.

    Returns:
      An Analysis object with the metrics set.

    Raises:
      ValueError: Metrics are already defined.
      TypeError: .calculate() takes a Metric or a list of Metrics.
      ValueError: A metric appears more than once in the list of Metrics.
    """
    if self.parameters.metrics is not None:
      raise ValueError("Metrics are already defined.")
    # collections.Iterable was removed in Python 3.10; use the abc location
    # (with a py2 fallback established at import time).
    if isinstance(metrics, _collections_abc.Iterable):
      if all(isinstance(metric, metrics_module.Metric) for metric in metrics):
        # Check that no two metrics have the same name.
        metric_names = set()
        for metric in metrics:
          if metric.name in metric_names:
            raise ValueError(
                "'%s' appears more than once in the metrics." % metric.name
            )
          else:
            metric_names.add(metric.name)
        # Create a list of (copies of) each metric
        self.parameters.metrics = [copy.deepcopy(metric) for metric in metrics]
      else:
        raise TypeError(".calculate() takes a Metric or a list of Metrics")
    elif isinstance(metrics, metrics_module.Metric):
      self.parameters.metrics = [copy.deepcopy(metrics)]
    else:
      raise TypeError(".calculate() takes a Metric or a list of Metrics")
    return self

  def run(self, melted=False, var_name="Metric", value_name="Value",
          encoding="utf8"):
    """Runs the analysis, returning the output in the specified form.

    Args:
      melted: Boolean indicating whether to return the output in "melted" form
        (i.e., with a separate row for each metric, as opposed to a separate
        column for each metric).
      var_name: Column name to use for the metric when data is melted
        (equivalent to var_name in pd.melt).
      value_name: Column name to use for the values of the metric when data is
        melted (equivalent to value_name in pd.melt).
      encoding: String encoding to use for string columns.

    Returns:
      A pandas dataframe with the results.

    Raises:
      ValueError: No metrics to calculate.
      ValueError: The dataset is empty.
    """
    results = []
    split_index = self.parameters.split_index
    split_vars = self.parameters.split_vars
    comparison = self.parameters.comparison
    se_method = self.parameters.se_method

    if self.parameters.metrics is None:
      raise ValueError("No metrics to calculate.")
    if (self.data is None) or self.data.empty:
      raise ValueError("The dataset is empty.")

    # Do calculations that only need to be done once.
    index_vars = []
    if se_method is not None:
      se_method.precalculate(self.data)
    if comparison is not None:
      comparison.precalculate(self.data)
      index_vars.append(comparison.condition_column)
    for metric in self.parameters.metrics:
      metric.precalculate(self.data, split_index)
    index_vars.extend(split_vars)

    # Explicitly decode columns that will become part of the index,
    # to avoid UnicodeDecodeErrors.
    _decode_object_columns(self.data, index_vars, encoding)

    # Set the index of the dataframe.
    if index_vars:
      df = self.data.set_index(index_vars)
    else:
      df = self.data

    def _compute_metric(metric):
      """Computes the metric and process the output.

      1. Computes the metric.
      2. Transforms to pd.DataFrame.
      3. Melts the dataframe if melted=True.
      4. Gives columns reasonable names.
      5. Records the MetricIndex and OverColumns in metric if present.
      6. Orders the indexes and sorts if asked.

      Args:
        metric: A Metric() instance to compute.

      Returns:
        The computed metric in pd.DataFrame.
      """
      if se_method is not None:
        output = se_method(df, metric, comparison)
      elif comparison is not None:
        output = comparison(df, metric)
      else:
        output = metric(df)
      all_metric_indices.extend(metric.metric_idx)
      all_over_columns.extend(metric.over)
      # Convert output to dataframe.
      if np.isscalar(output):
        output = pd.DataFrame([output], columns=[""])
      elif isinstance(output, pd.Series):
        output = pd.DataFrame(output)
      # To melt data, add an index called var_name that stores the metric.
      if melted:
        output[var_name] = metric.name
        output.rename(columns={"": value_name}, inplace=True)
        output.set_index(var_name, append=True, inplace=True)
      # Otherwise, depending on if there's stderr calculated, we either append
      # metric name to the beginning of each of the columns, or make results a
      # MultiIndex DataFrame.
      else:
        if se_method:
          output.columns = pd.MultiIndex.from_product([[metric.name],
                                                       output.columns])
        else:
          output.columns = ["{} {}".format(metric.name, col).strip()
                            for col in output.columns]
      if comparison and len(output.index.names) > 1:
        # Comparison is by default the first index. Put it to the last.
        output.index = output.index.reorder_levels(
            np.roll(output.index.names, -1))
      if split_vars and self.parameters.sort:
        output.sort_index(level=split_vars, inplace=True, sort_remaining=False)
      return output

    # Calculate the metrics. For the meaning of MetricIndex and OverColumns,
    # see the doc of Metric().
    all_metric_indices = []
    all_over_columns = []
    for metric in self.parameters.metrics:
      results.append(_compute_metric(metric))
    all_metric_indices = list(pd.unique(all_metric_indices))
    all_over_columns = list(pd.unique(all_over_columns))

    def _fill_missing_indexes(results):
      """Fills missing index in results so all elements have same index.

      When MetricIndex or OverColumn exist, each metric might have different
      MetricIndex or OverColumn. As we will concat all metrics later, we need
      them to have exactly same indexes. The function fills the missing
      indexes.

      Args:
        results: The list of metrics computed, with possible missing indexes.

      Returns:
        The list of metrics computed, all having same indexes.
      """
      all_metric_indices_over_columns = all_metric_indices + all_over_columns
      all_idx = split_vars + [var_name] if melted else split_vars[:]
      all_idx += all_metric_indices
      all_idx += [comparison.condition_column] if comparison else []
      all_idx += all_over_columns
      # Metrics might have different MetricIndex and Over columns. To concat
      # metrics later, we need all metrics to have same indexes.
      for i, output in enumerate(results):
        for col in all_metric_indices_over_columns:
          if col not in output.index.names:
            output[col] = ""
            output.set_index(col, append=True, inplace=True)
        # To make the indexes of all metrics be in the same order.
        output = output.reset_index(all_idx).set_index(all_idx)
        if not melted:
          # df.unstack(df.index) is a pd.Series, not a pd.DataFrame anymore.
          # This will introduce trouble when operating with other df. So add a
          # placeholder index column in such case.
          if len(output.index.names) == len(all_metric_indices):
            output[""] = 0
            output.set_index("", append=True, inplace=True)
          output = output.unstack(all_metric_indices)
        results[i] = output
      return results

    if all_metric_indices or all_over_columns:
      results = _fill_missing_indexes(results)

    # concatenate results for each metric into a single dataframe
    results = pdutils.concat(results, axis=0 if melted else 1)

    if not melted and se_method:
      column_multiindex = results.rename(columns={"": value_name},
                                         level=1).columns
      if not se_method.flat_index:
        results.columns = column_multiindex
      else:
        # Merge the first two levels metric and value type.
        if all_metric_indices:
          new_idx = [
              [" ".join(c[:2]).strip()] + list(c[2:]) for c in results.columns
          ]
          names = [None] + list(results.columns.names[2:])
          new_idx = pd.MultiIndex.from_tuples(new_idx, names=names)
        else:
          new_idx = [" ".join(c).strip() for c in results.columns]
        results.columns = new_idx
    return results