"""Performance reporting for the quiz bowl buzzer.

For each fold this module loads the guessers' guesses and the buzzer's
per-position outputs, derives end-of-pipeline statistics, histogram
statistics at fixed fractions of the question and a comparison against
Protobowl opponents, renders the plots and hands everything to the report
generator.
"""
import os
import pickle
import argparse
import numpy as np
from itertools import cycle
from collections import defaultdict
from functools import partial
from typing import List, Dict, Tuple

from qanta.guesser.abstract import AbstractGuesser
from qanta.datasets.quiz_bowl import QuestionDatabase
from qanta.util import constants as c
from qanta.buzzer import constants as bc
# The module is used below under the name ``qlogging`` while the original
# import bound it as ``logging`` (a NameError at import time).  Which module
# name the package actually ships is not visible from this file, so accept
# either spelling.
try:
    from qanta import qlogging
except ImportError:
    from qanta import logging as qlogging
from qanta.buzzer.util import GUESSERS, load_protobowl
from qanta.reporting.report_generator import ReportGenerator
from qanta.util.multiprocess import _multiprocess

import matplotlib
matplotlib.use('Agg')  # select the backend before pyplot is imported
import matplotlib.pyplot as plt

log = qlogging.get(__name__)

N_GUESSERS = len(GUESSERS)
# sentinel position meaning "this guesser is never correct"
MAXINT = 99999
# fractions of the question at which histogram statistics are sampled
HISTO_RATIOS = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

# continuous valued statistics
EOP_STAT_KEYS_0 = [
    'buzz',  # did the buzzer buzz
    'choose_best',  # did the buzzer choose the best guesser (earliest correct)
    'choose_hopeful',  # did the buzzer choose a hopeful guesser
    'rush',  # did the buzzer rush (w.r.t to all guessers)
    'late',  # did the buzzer buzz too late (w.r.t to all guessers)
    'not_buzzing_when_shouldnt',
    'reward',
    'hopeful',  # is the question hopeful (w.r.t to all guessers)
    'correct',  # how many correct buzzers
]

# discrete valued statistics
EOP_STAT_KEYS_1 = [
    'choose_guesser',  # the guesser chosen by the buzzer
    'best_guesser',  # the best guesser
]

# overall guesser accuracy and buzzing frequency
# (per-guesser 'acc_<guesser>' / 'buzz_<guesser>' keys are currently disabled)
HISTO_KEYS_0 = ['acc', 'buzz']

# mutually exclusive outcome categories; exactly one is 1 per question/ratio
HISTO_KEYS_1 = [
    'buzz_correct',
    'wait_correct',
    'wait_wrong',
    'buzz_wrong',
    'wait_impossible',
    'buzz_impossible',
    'buzz_miss',
]

LINE_STYLES = {'acc': '-', 'buzz': '-'}
_STYLES = [':', '--', '-.']
for guesser, style in zip(GUESSERS, cycle(_STYLES)):
    LINE_STYLES['acc_{}'.format(guesser)] = style
    LINE_STYLES['buzz_{}'.format(guesser)] = style


def _check_buzz_length(buzz, length):
    """Raise ValueError unless `buzz` has exactly `length` positions."""
    if len(buzz) != length:
        raise ValueError(
            "Length of buzzes {0} does not match with guesses {1}".format(
                len(buzz), length))


def _get_top_guesses(qnum, question):
    """Collect the top-scoring guess of every guesser at every position.

    Args:
        qnum: question number, passed through unchanged.
        question: guesses of one question; grouped here by
            ('sentence', 'token') and then by 'guesser', and sorted by
            'score'.

    Returns:
        (qnum, top_guesses) where top_guesses is n_guessers * length and an
        entry is None when that guesser has no guess at that position.
    """
    by_position = []  # length * n_guessers
    # FIXME because there can be missing guessers, must iterate position first
    for _, position in question.groupby(['sentence', 'token']):
        per_guesser = position.groupby('guesser')
        row = []
        for guesser in GUESSERS:
            if guesser not in per_guesser.groups:
                row.append(None)
                continue
            ranked = per_guesser.get_group(guesser).sort_values(
                'score', ascending=False)
            row.append(ranked.iloc[0].guess)
        by_position.append(row)
    # transpose length * n_guessers -> n_guessers * length
    return qnum, list(map(list, zip(*by_position)))


def _get_eop_stats(buzzes: Dict[int, List[List[float]]],
                   answers: Dict[int, str], qnum, top_guesses) \
        -> Tuple[int, Dict[str, int]]:
    """Compute the end-of-pipeline statistics of a single question.

    Args:
        buzzes: qnum -> per-position buzzer outputs; the argmax of a
            position selects a guesser when it is below len(GUESSERS),
            anything else means "do not buzz here".
        answers: qnum -> correct answer.
        qnum: the question to evaluate.
        top_guesses: n_guessers * length top guesses of that question.

    Returns:
        (qnum, stats) with one entry per key of EOP_STAT_KEYS_0 and
        EOP_STAT_KEYS_1; -1 marks a statistic that does not apply.

    Raises:
        ValueError: if buzzes and guesses cover different numbers of
            positions.
    """
    buzz = buzzes[qnum]
    answer = answers[qnum]

    # top_guesses: n_guessers * length
    length = len(top_guesses[0])
    _check_buzz_length(buzz, length)

    stats = {k: -1 for k in EOP_STAT_KEYS_0 + EOP_STAT_KEYS_1}

    # the first correct position of each guesser
    correct = [g.index(answer) if answer in g else MAXINT
               for g in top_guesses]
    # `correct` is a plain list, so `correct == MAXINT` would be a single
    # False rather than an element-wise test; compare element by element.
    if all(x == MAXINT for x in correct):
        best_guesser = -1
    else:
        best_guesser = int(np.argmin(correct))
    stats['best_guesser'] = best_guesser
    stats['correct'] = sum(x != MAXINT for x in correct)
    stats['hopeful'] = stats['correct'] > 0
    hopeful = stats['hopeful']

    # the buzzing position and chosen guesser
    pos, chosen = -1, -1
    for i in range(length):
        action = int(np.argmax(buzz[i]))
        if action < len(GUESSERS):
            pos, chosen = i, action
            break

    if pos == -1:
        # not buzzing
        stats['buzz'] = 0
        stats['reward'] = 0
        stats['not_buzzing_when_shouldnt'] = int(not hopeful)
        return qnum, stats

    stats['buzz'] = 1
    stats['choose_guesser'] = chosen
    stats['choose_hopeful'] = int(correct[chosen] != MAXINT)
    # NOTE(review): this rewards any buzz at or after the chosen guesser's
    # first correct position, not only buzzes whose guess at `pos` is
    # itself correct -- confirm that this is intended.
    stats['reward'] = 10 if pos >= correct[chosen] else -5
    if hopeful:
        stats['choose_best'] = int(chosen == best_guesser)
        stats['late'] = int(pos > correct[best_guesser])
        stats['rush'] = int(correct[best_guesser] > pos)
    return qnum, stats


def _get_his_stats(buzzes: Dict[int, List[List[float]]],
                   answers: Dict[int, str], qnum, top_guesses) \
        -> Tuple[int, Dict[str, List[int]]]:
    """Compute the histogram statistics of a single question.

    For every ratio in HISTO_RATIOS the question is cut at
    int(length * ratio) and classified by what happened before the cut
    (did any guesser have the answer, did the buzzer buzz, did it buzz on a
    correct guess) and whether any guesser has the answer after it.

    Args:
        buzzes: qnum -> per-position buzzer outputs.
        answers: qnum -> correct answer.
        qnum: the question to evaluate.
        top_guesses: n_guessers * length top guesses of that question.

    Returns:
        (qnum, stats) where stats maps every key of HISTO_KEYS_0 and
        HISTO_KEYS_1 to a list of 0/1 flags, one per ratio.

    Raises:
        ValueError: if buzzes and guesses cover different numbers of
            positions.
    """
    buzz = buzzes[qnum]
    answer = answers[qnum]

    # top_guesses: n_guessers * length
    length = len(top_guesses[0])
    _check_buzz_length(buzz, length)

    # n_guessers * length -> length * n_guessers
    by_position = list(map(list, zip(*top_guesses)))
    guesser_correct = [[int(x == answer) for x in row] for row in by_position]
    # number of correct guessers at each position
    n_correct = [sum(row) for row in guesser_correct]

    # the buzzer's action at each position, computed once for all ratios
    actions = [int(np.argmax(x)) for x in buzz]
    buzzed = [int(a < N_GUESSERS) for a in actions]
    buzzer_correct = [
        1 if a < N_GUESSERS and guesser_correct[i][a] else 0
        for i, a in enumerate(actions)
    ]

    stats = {k: [-1 for _ in HISTO_RATIOS]
             for k in HISTO_KEYS_0 + HISTO_KEYS_1}

    for i, r in enumerate(HISTO_RATIOS):
        pos = int(length * r)
        cor_before = sum(n_correct[:pos])
        cor_after = sum(n_correct[pos:])
        buz = sum(buzzed[:pos])
        buz_cor = sum(buzzer_correct[:pos])

        stats['acc'][i] = int(cor_before > 0)
        stats['buzz'][i] = int(buz > 0)

        stats['buzz_correct'][i] = int(buz_cor > 0)
        stats['wait_correct'][i] = int(
            buz == 0 and cor_before == 0 and cor_after > 0)
        stats['wait_impossible'][i] = int(
            buz == 0 and cor_before == 0 and cor_after == 0)
        stats['wait_wrong'][i] = int(buz == 0 and cor_before > 0)
        stats['buzz_wrong'][i] = int(
            buz > 0 and cor_before == 0 and cor_after > 0)
        stats['buzz_miss'][i] = int(
            buz > 0 and cor_before > 0 and buz_cor == 0)
        stats['buzz_impossible'][i] = int(
            buz > 0 and cor_before == 0 and cor_after == 0)

        # the seven categories partition the outcomes: exactly one is set
        ssum = sum(stats[k][i] for k in HISTO_KEYS_1)
        assert ssum == 1

    return qnum, stats


def get_eop_stats(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Aggregate the end-of-pipeline statistics over a fold.

    Args:
        top_guesses: qnum -> n_guessers * length top guesses.
        buzzes: qnum -> per-position buzzer outputs.
        answers: qnum -> correct answer.
        variables: report variables, updated in place under 'eop_stats';
            pass None to skip.
        fold: name of the fold, used for logging and as the report key.
        save_dir: unused here; kept so all reporters share one signature.

    Returns:
        A mapping holding, for every key of EOP_STAT_KEYS_0, the mean over
        the questions where the statistic applies (0 if there are none) and,
        for every key of EOP_STAT_KEYS_1, a dict guesser -> count.
    """
    log.info('[{}] End-of-pipelin reporting'.format(fold))

    worker = partial(_get_eop_stats, buzzes, answers)
    # qnum -> key -> int
    eop_stats = dict(_multiprocess(worker, top_guesses.items(),
                                   info='End-of-pipeline stats', multi=True))

    # key -> values of the questions where the statistic applies
    _eop_stats = defaultdict(list)
    for stat in eop_stats.values():
        for key in EOP_STAT_KEYS_0 + EOP_STAT_KEYS_1:
            if stat[key] != -1:
                _eop_stats[key].append(stat[key])

    for key in EOP_STAT_KEYS_0:
        values = _eop_stats[key]
        _eop_stats[key] = sum(values) / len(values) if values else 0

    for key in EOP_STAT_KEYS_1:
        values = _eop_stats[key]
        _eop_stats[key] = {guesser: values.count(i)
                           for i, guesser in enumerate(GUESSERS)}

    if variables is not None:
        variables['eop_stats'][fold] = _eop_stats

    return _eop_stats


def get_his_stats(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Aggregate and plot the histogram statistics of a fold.

    Draws a line chart of the HISTO_KEYS_0 statistics and a stacked area
    chart of the HISTO_KEYS_1 categories; both are saved as PDF in
    `save_dir`, or shown when `save_dir` is None.

    Args:
        top_guesses: qnum -> n_guessers * length top guesses.
        buzzes: qnum -> per-position buzzer outputs.
        answers: qnum -> correct answer.
        variables: report variables, updated in place under 'his_stats',
            'his_lines' and 'his_stacked'; pass None to skip.
        fold: name of the fold, used in titles, file names and as the
            report key.
        save_dir: directory for the plots, or None to show them instead.

    Returns:
        dict key -> list with the mean of the statistic at every ratio of
        HISTO_RATIOS.
    """
    log.info('[{}] Histogram reporting'.format(fold))

    worker = partial(_get_his_stats, buzzes, answers)
    # qnum -> key -> list(int)
    his_stats = dict(_multiprocess(worker, top_guesses.items(),
                                   info='Histogram stats', multi=True))

    keys = HISTO_KEYS_0 + HISTO_KEYS_1
    n_ratios = len(HISTO_RATIOS)

    # key -> ratio index -> values of the questions where it applies
    collected = {key: [[] for _ in range(n_ratios)] for key in keys}
    for stats in his_stats.values():
        for key in keys:
            for i in range(n_ratios):
                if stats[key][i] != -1:
                    collected[key][i].append(stats[key][i])

    # key -> list(float)
    _his_stats = {
        key: [sum(s) / len(s) if s else 0 for s in collected[key]]
        for key in keys
    }

    # Paths stay None when the plots are shown instead of saved, so the
    # report variables below can always be filled in.
    his_lines_dir = None
    his_stacked_dir = None

    ##### plot lines #####
    fig, ax = plt.subplots()
    lines = [
        plt.plot(HISTO_RATIOS, _his_stats[k], LINE_STYLES[k], label=k)[0]
        for k in HISTO_KEYS_0
    ]
    ax.set_xticks(HISTO_RATIOS)
    plt.legend(handles=lines)
    plt.title('{} histogram lines chart'.format(fold))
    if save_dir is not None:
        his_lines_dir = os.path.join(save_dir,
                                     'his_{}_lines.pdf'.format(fold))
        plt.savefig(his_lines_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    ##### plot stacked area chart #####
    layers = [
        ('buzz_correct', 'c'),
        ('buzz_miss', 'y'),
        ('buzz_wrong', 'r'),
        ('buzz_impossible', 'k'),
        ('wait_wrong', 'm'),
        ('wait_correct', 'g'),
        ('wait_impossible', 'w'),
    ]
    # empty proxy lines so that the legend shows one entry per layer
    for key, color in layers:
        plt.plot([], [], color=color, alpha=0.5, label=key)
    plt.stackplot(list(range(n_ratios)),
                  *[_his_stats[key] for key, _ in layers],
                  colors=[color for _, color in layers],
                  alpha=0.5)
    plt.legend()
    plt.title('{} stacked area chart'.format(fold))
    if save_dir is not None:
        his_stacked_dir = os.path.join(save_dir,
                                       'his_{}_stacked.pdf'.format(fold))
        plt.savefig(his_stacked_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    if variables is not None:
        variables['his_stats'][fold] = _his_stats
        variables['his_lines'][fold] = his_lines_dir
        variables['his_stacked'][fold] = his_stacked_dir

    return _his_stats


def get_hyper_search(top_guesses, buzzes, answers, variables, fold, save_dir):
    """Plot end-of-pipeline statistics for every hyperparameter config.

    Reads 'output/buzzer/cfg_buzzes_<fold>.pkl', a sequence of
    (config, buzzes) pairs, and returns without doing anything when that
    file does not exist.  The `buzzes` argument is not used: every config
    brings its own buzzes.

    Args:
        top_guesses: qnum -> n_guessers * length top guesses.
        buzzes: unused; kept so all reporters share one signature.
        answers: qnum -> correct answer.
        variables: report variables, updated in place under
            'rush_late_plot', 'choice_plot' and 'hype_configs'; pass None
            to skip.
        fold: name of the fold, used in titles, file names and as the
            report key.
        save_dir: directory for the plots, or None to show them instead.
    """
    log.info('[{}] Hyperparameter search reporting'.format(fold))

    cfg_buzzes_dir = 'output/buzzer/cfg_buzzes_{}.pkl'.format(fold)
    if not os.path.exists(cfg_buzzes_dir):
        return
    # NOTE: pickle must only ever be used on files this pipeline produced.
    with open(cfg_buzzes_dir, 'rb') as infile:
        cfg_buzzes = pickle.load(infile)

    n_configs = len(cfg_buzzes)
    configs, rushs, lates = [], [], []
    choose_best, choose_hopeful = [], []

    for config, config_buzzes in cfg_buzzes:
        s = get_eop_stats(top_guesses, config_buzzes, answers, None, fold,
                          save_dir)
        configs.append(config)
        rushs.append(s['rush'])
        lates.append(s['late'])
        choose_best.append(s['choose_best'])
        choose_hopeful.append(s['choose_hopeful'])

    config_names = list(range(n_configs))
    pos = list(range(n_configs))
    width = 0.5

    # Paths stay None when the plots are shown instead of saved.
    rush_late_dir = None
    choice_dir = None

    ##### plot rush and late #####
    fig, ax = plt.subplots()
    bars = [
        plt.bar(pos, rushs, width, alpha=0.5, color='#EE3224')[0],
        plt.bar(pos, lates, width, bottom=rushs, alpha=0.5,
                color='#F78F1E')[0],
    ]
    plt.legend(bars, ('rush', 'late'))
    ax.set_ylabel('%')
    ax.set_title('Rush and Late')
    ax.set_xticks([p + 1.42 * width for p in pos])
    ax.set_xticklabels(config_names)
    plt.grid()
    plt.title('{} rush & late chart'.format(fold))
    if save_dir is not None:
        rush_late_dir = os.path.join(save_dir,
                                     'rush_late_{}.pdf'.format(fold))
        plt.savefig(rush_late_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    ##### plot choose best and choose hopeful #####
    fig, ax = plt.subplots()
    bars1 = [
        plt.bar(pos, choose_best, width, alpha=0.5, color='#EE3224')[0],
        plt.bar(pos, choose_hopeful, width, alpha=0.5, color='#F78F1E')[0],
    ]
    plt.legend(bars1, ('choose_best', 'choose_hopeful'))
    ax.set_ylabel('%')
    ax.set_title('Choose hopeful and best')
    ax.set_xticks([p + 1.42 * width for p in pos])
    ax.set_xticklabels(config_names)
    plt.grid()
    plt.title('{} choices chart'.format(fold))
    if save_dir is not None:
        choice_dir = os.path.join(save_dir, 'choose_{}.pdf'.format(fold))
        plt.savefig(choice_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.close()

    if variables is not None:
        variables['rush_late_plot'][fold] = rush_late_dir
        variables['choice_plot'][fold] = choice_dir
        # NOTE(review): stored under the fixed key 'dev' rather than `fold`;
        # presumably the report template reads that key -- confirm.
        variables['hype_configs']['dev'] = list(zip(config_names, configs))


def get_protobowl(inputs):
    """Compare the buzzer against recorded Protobowl opponents.

    Args:
        inputs: sequence of (question_texts, protobowl_ids, protobowl_df,
            questions, top_guesses, buzzes, answers, variables, fold,
            save_dir).  `protobowl_df` and `questions` are grouped frames
            accessed through `.groups` / `.get_group`; `variables` is
            updated in place under 'protobowl_plot' and 'protobowl_stats'
            unless it is None; `save_dir` is the plot directory or None to
            show the plot instead.

    Returns:
        dict statistic -> mean over the scored questions of the
        per-opponent average (0 when no question could be scored).
    """
    question_texts, protobowl_ids, protobowl_df, questions, \
        top_guesses, buzzes, answers, variables, fold, save_dir = inputs

    protobowl_keys = ['correct_before', 'correct_after',
                      'rush_possible', 'rush_impossible',
                      'late_possible', 'late_impossible',
                      'buzz_before_op', 'buzz_after_op', 'reward']
    avg_stats = {k: [] for k in protobowl_keys}

    for qnum, guess_list in top_guesses.items():
        if qnum not in protobowl_ids:
            continue
        protobowl_id = protobowl_ids[qnum]
        if protobowl_id not in protobowl_df.groups:
            continue
        buzz = buzzes[qnum]
        answer = answers[qnum]

        # position in guesses -> real position
        position_mapping = []
        g_group = questions.get_group(qnum)
        text = question_texts[qnum]
        g_group = g_group.groupby(['sentence', 'token']).groups
        _count = 0
        for sent in text:
            for word, _ in enumerate(text[sent].split()):
                if (sent, word) in g_group:
                    position_mapping.append(_count)
                _count += 1
        # one extra entry at the total word count, after the last word
        position_mapping.append(_count)
        if len(position_mapping) != len(buzz):
            print(len(position_mapping), len(buzz))
            continue

        # earliest position at which any guesser has the answer
        # NOTE(review): this is an index into the guesses while
        # `opponent.position` and `buzzing_position` look like word
        # positions -- confirm whether it should go through
        # `position_mapping` as well.
        correct_position = len(buzz)
        for i in range(N_GUESSERS):
            for j in range(len(buzz)):
                if guess_list[i][j] == answer:
                    if j < correct_position:
                        correct_position = j
                    break

        # where the buzzer buzzes and whether its chosen guess is right
        buzzing_position = len(buzz)
        buzzing_result = False
        for i in range(len(buzz)):
            choice = np.argmax(buzz[i])
            if choice < N_GUESSERS:
                buzzing_position = position_mapping[i]
                buzzing_result = (guess_list[choice][i] == answer)
                break

        # what the buzzer would answer if forced at the end of the question
        final_choice = np.argmax(buzz[-1][:N_GUESSERS])
        final_result = guess_list[final_choice][-1] == answer

        stats = {k: 0 for k in protobowl_keys}
        n_opponents = 0
        for opponent in protobowl_df.get_group(protobowl_id).itertuples():
            n_opponents += 1
            if opponent.position > buzzing_position:
                stats['buzz_before_op'] += 1
                if buzzing_result:
                    stats['reward'] += 10
                    stats['correct_before'] += 1
                else:
                    stats['reward'] -= 5
                    if correct_position >= opponent.position \
                            and opponent.result == True:
                        stats['rush_impossible'] += 1
                    else:
                        stats['rush_possible'] += 1
                    if opponent.result == True:
                        stats['reward'] -= 10
            else:
                stats['buzz_after_op'] += 1
                if opponent.result == True:
                    stats['reward'] -= 10
                    if correct_position <= opponent.position:
                        stats['late_possible'] += 1
                    else:
                        stats['late_impossible'] += 1
                else:
                    stats['reward'] += 5
                    if final_result:
                        stats['correct_after'] += 1
                        stats['reward'] += 10

        for k, v in stats.items():
            avg_stats[k].append(v / n_opponents)

    # Average over the questions that were actually scored: questions
    # skipped for a length mismatch must not dilute the mean, and a fold
    # without any usable question must not divide by zero.
    for k, v in avg_stats.items():
        avg_stats[k] = sum(v) / len(v) if v else 0

    # plotting; the reward is reported but not drawn
    plot_keys = protobowl_keys[:-1]
    plot_dir = None  # stays None when the plot is shown instead of saved
    plt.clf()
    ind = 0
    width = 0.5
    labels = []
    for k in plot_keys:
        if k not in avg_stats:
            continue
        plt.bar(ind, avg_stats[k], width)
        labels.append(k)
        ind += width * 2
    plt.xticks(list(range(len(labels))), labels, rotation=30)
    plt.subplots_adjust(bottom=0.3)
    plt.title('{} stats against Protobowl'.format(fold))
    if save_dir is not None:
        plot_dir = os.path.join(save_dir, '{}_protobowl.pdf'.format(fold))
        plt.savefig(plot_dir, bbox_inches='tight')
    else:
        plt.show()
    plt.clf()

    if variables is not None:
        variables['protobowl_plot'][fold] = plot_dir
        variables['protobowl_stats'][fold] = avg_stats

    return avg_stats


def main(folds, model_name):
    """Generate the performance report for the given folds.

    Args:
        folds: names of the folds to evaluate.
        model_name: buzzer model name, used to locate the buzzes file of
            each fold through bc.BUZZES_DIR.
    """
    all_questions = QuestionDatabase().all_questions()
    answers = {k: v.page for k, v in all_questions.items()}
    question_texts = {k: v.text for k, v in all_questions.items()}
    protobowl_ids = {k: v.protobowl for k, v in all_questions.items()
                     if v.protobowl != ''}
    protobowl_df = load_protobowl().groupby('qid')

    save_dir = 'output/summary/new_performance/'
    os.makedirs(save_dir, exist_ok=True)

    # feature -> fold -> value
    variables = defaultdict(dict)
    for fold in folds:
        guesses_df = AbstractGuesser.load_guesses(
            bc.GUESSES_DIR, folds=[fold])
        questions = guesses_df.groupby('qnum')

        buzzes_dir = bc.BUZZES_DIR.format(fold, model_name)
        with open(buzzes_dir, 'rb') as infile:
            buzzes = pickle.load(infile)
        log.info('Buzzes loaded from {}.'.format(buzzes_dir))

        # qnum -> n_guessers * length
        top_guesses = dict(_multiprocess(_get_top_guesses, questions,
                                         info='Top guesses', multi=True))

        inputs = [top_guesses, buzzes, answers, variables, fold, save_dir]

        # get_eop_stats(*inputs)
        get_his_stats(*inputs)
        # get_hyper_search(*inputs)

        p_inputs = [question_texts, protobowl_ids, protobowl_df,
                    questions] + inputs
        get_protobowl(p_inputs)

    # hand plain dicts to the report generator
    variables = {key: dict(value) for key, value in variables.items()}

    report(variables, save_dir, folds)


def report(variables, save_dir, folds):
    """Render the collected variables into a PDF report.

    Args:
        variables: feature -> fold -> value, as collected by the reporters.
        save_dir: directory in which the report is written.
        folds: evaluated folds; a single fold gives 'report_<fold>.pdf',
            several give 'report_all.pdf'.
    """
    # use this to have jinja skip non-existent features
    jinja_keys = ['his_lines', 'his_stacked', 'rush_late_plot',
                  'choice_plot', 'hype_configs', 'protobowl_plot',
                  'protobowl_stats']
    _variables = {k: dict() for k in jinja_keys}
    _variables.update(variables)

    if len(folds) == 1:
        output = os.path.join(save_dir, 'report_{}.pdf'.format(folds[0]))
    else:
        output = os.path.join(save_dir, 'report_all.pdf')
    report_generator = ReportGenerator('new_performance.md')
    report_generator.create(_variables, output)


def parse_args():
    """Parse the command line: optional -f/--fold, required -m/--model."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--fold', default=None)
    parser.add_argument('-m', '--model', required=True)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    if args.fold is not None:
        folds = [args.fold]
    else:
        folds = c.BUZZER_GENERATION_FOLDS[:-1]
    main(folds, args.model)