# -*- coding: utf-8 -*-
# @Time   : 5/3/18 10:24
# @Author : Shun Zheng

from __future__ import print_function

import csv
import sys
import os
import shutil
import time
import random
from collections import Counter, OrderedDict
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from mpl_toolkits.axes_grid1 import make_axes_locatable
from torchtext.vocab import pretrained_aliases, Vectors
from sklearn.metrics import average_precision_score, precision_recall_curve, precision_recall_fscore_support, auc


# a temporary hack for using Chinese word vectors in the financial domain
class MyVec(Vectors):
    def __init__(self, name='wind_vec.50d.txt', **kwargs):
        super(MyVec, self).__init__(name, **kwargs)


def build_field_vocab_from_dict(field, vocab_freq_dict, **kwargs):
    print('Build vocabulary from dict')
    vocab_counter = Counter(vocab_freq_dict)
    # taken from torchtext.data.Field.build_vocab()
    specials = list(OrderedDict.fromkeys(
        tok for tok in [field.unk_token, field.pad_token, field.init_token, field.eos_token]
        if tok is not None))
    field.vocab = field.vocab_cls(vocab_counter, specials=specials, **kwargs)
    print('Total vocabulary size:', len(field.vocab.freqs), 'effective size:', len(field.vocab))


def build_field_vocab_from_file(field, vocab_freq_file, **kwargs):
    print('Build Field vocabulary from existing vocabulary files', vocab_freq_file)
    # read vocabulary frequency file
    with open(vocab_freq_file, 'r') as fin:
        csv_reader = csv.reader(fin)
        vocab_freq = []
        for row in csv_reader:
            word, freq = row
            # word = word.decode('utf-8')  # for python3
            freq = int(freq)
            vocab_freq.append((word, freq))
    vocab_counter = Counter(dict(vocab_freq))
    # taken from torchtext.data.Field.build_vocab()
    specials = list(OrderedDict.fromkeys(
        tok for tok in [field.unk_token, field.pad_token, field.init_token, field.eos_token]
        if tok is not None))
    field.vocab = field.vocab_cls(vocab_counter, specials=specials, **kwargs)
    print('Total vocabulary size:', len(field.vocab.freqs), 'effective size:', len(field.vocab))


def build_field_vocab_from_dataset(field, data_set, vocab_freq_file=None, **kwargs):
    # build field vocabulary from data_set
    print('Build Field vocabulary from dataset')
    field.build_vocab(data_set, **kwargs)
    print('Total vocabulary size:', len(field.vocab.freqs), 'effective size:', len(field.vocab))

    if isinstance(vocab_freq_file, str):
        print('Dump vocabulary frequencies into', vocab_freq_file)
        # dump vocabulary frequency
        sorted_vocab_freq = sorted(field.vocab.freqs.items(), key=lambda x: x[1])
        with open(vocab_freq_file, 'w') as fout:
            csv_writer = csv.writer(fout)
            for row in sorted_vocab_freq:
                word, freq = row
                # word = word.encode('utf-8')  # for python3
                csv_writer.writerow([word, freq])


def build_field_vocabulary(field, from_vocab=True, vocab_freq_file=None, vocab_freq_dict=None, data_set=None,
                           **kwargs):
    """Build field vocabulary with three options:

    1. from a vocabulary frequency file
    2. from a vocabulary frequency dict
    3. by counting tokens in the dataset, dumping the counts into vocab_freq_file accordingly

    Args:
        field: torchtext.data.Field object
        from_vocab: flag of whether to recover from the vocabulary file directly
        vocab_freq_file: the absolute path of the vocabulary file
        vocab_freq_dict: the vocabulary frequency dictionary
        data_set: torchtext.data.Dataset object
        **kwargs: keyword arguments to be passed to the torchtext.Vocab class
    """
    if 'vectors' in kwargs and isinstance(kwargs['vectors'], str) and kwargs['vectors'] not in pretrained_aliases:
        print('Read from self-pretrained vectors', kwargs['vectors'])
        kwargs['vectors'] = MyVec(name=kwargs['vectors'])

    if from_vocab and isinstance(vocab_freq_file, str) and os.path.exists(vocab_freq_file):
        build_field_vocab_from_file(field, vocab_freq_file, **kwargs)
    elif from_vocab and isinstance(vocab_freq_dict, dict):
        build_field_vocab_from_dict(field, vocab_freq_dict, **kwargs)
    elif data_set is not None:
        build_field_vocab_from_dataset(field, data_set, vocab_freq_file, **kwargs)
    else:
        raise Exception('Build field vocabulary failed, please check input arguments!')
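
# Usage sketch (illustrative only; TEXT, train_set and the file path below are
# hypothetical and not defined in this module). Assuming a torchtext Field TEXT and a
# Dataset train_set built elsewhere, the vocabulary is counted once and then cached:
#
#   build_field_vocabulary(TEXT, from_vocab=True,
#                          vocab_freq_file='/path/to/vocab_freq.csv',
#                          data_set=train_set)
#
# On the first run (no cached file) the vocabulary is built from train_set and dumped
# to vocab_freq_file; on later runs it is rebuilt directly from that file.
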
def random_init_certain_vector(vocab, token='<unk>', mean=0, std=0.5):
    """Randomly initialize a certain vector of the vocabulary object

    Args:
        vocab: the object of torchtext.vocab.Vocab class
        token: token string
        mean: mean of the normal distribution
        std: std of the normal distribution
    """
    idx = vocab.stoi[token]
    nn.init.normal_(vocab.vectors[idx], mean=mean, std=std)


def save_checkpoint(state_dict, is_best, file_path_prefix, file_name_suffix=''):
    file_path = file_path_prefix + file_name_suffix
    torch.save(state_dict, file_path)
    if is_best:
        shutil.copyfile(file_path, file_path_prefix + '.best')


def resume_checkpoint(net, model_file_path, strict=False, resume_key='model',
                      print_keys=('epoch', 'dev_f1', 'dev_avg_prec')):
    if os.path.exists(model_file_path):
        resume_dict = torch.load(model_file_path)
        print('Resume from previous model checkpoint {}'.format(model_file_path))
        for key in print_keys:
            if key in resume_dict:
                print('{}: {}'.format(key, resume_dict[key]))
        net.load_state_dict(resume_dict[resume_key], strict=strict)
        print('Resume successfully')
        return True
    else:
        print('Warning: model resume failed because {} not found'.format(model_file_path))
        return False
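
# Usage sketch (illustrative; the variable names are hypothetical). A training loop
# would typically checkpoint after every dev evaluation and keep a '.best' copy that
# resume_checkpoint can later restore:
#
#   state = {'epoch': epoch, 'dev_f1': dev_f1, 'model': net.state_dict()}
#   save_checkpoint(state, is_best=dev_f1 > best_dev_f1,
#                   file_path_prefix='model_dir/rel_model', file_name_suffix='.cpt')
#   ...
#   resume_checkpoint(net, 'model_dir/rel_model.best', strict=True)
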
def id_to_word(word_ids, itos):
    words = []
    for wid in word_ids:
        words.append(itos[wid])
    return words


def show_word_score_heatmap(score_tensor, x_ticks, y_ticks, figsize=(3, 8)):
    # to make colorbar a proper size w.r.t the image
    def colorbar(mappable):
        ax = mappable.axes
        fig = ax.figure
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="10%", pad=0.1)
        return fig.colorbar(mappable, cax=cax)

    mpl.rcParams['font.sans-serif'] = ['simhei']
    mpl.rcParams['axes.unicode_minus'] = False

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    img = ax.matshow(score_tensor.numpy())
    plt.xticks(range(score_tensor.size(1)), x_ticks, fontsize=14)
    plt.yticks(range(score_tensor.size(0)), y_ticks, fontsize=14)
    colorbar(img)
    ax.set_aspect('auto')
    plt.show()


def show_word_scores_heatmap(score_tensor_tup, x_ticks, y_ticks, nrows=1, ncols=1, titles=None,
                             figsize=(8, 8), fontsize=14):
    def colorbar(mappable):
        ax = mappable.axes
        fig = ax.figure
        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="1%", pad=0.1)
        return fig.colorbar(mappable, cax=cax)

    if not isinstance(score_tensor_tup, tuple):
        score_tensor_tup = (score_tensor_tup, )

    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
    # plt.subplots returns a bare Axes when nrows == ncols == 1 and a 2-d array when
    # both are larger than 1, so normalize to a flat iterable of Axes first
    axs = np.atleast_1d(axs).ravel()
    for idx, ax in enumerate(axs):
        score_tensor = score_tensor_tup[idx]
        img = ax.matshow(score_tensor.numpy())
        plt.sca(ax)
        plt.xticks(range(score_tensor.size(1)), x_ticks, fontsize=fontsize)
        plt.yticks(range(score_tensor.size(0)), y_ticks, fontsize=fontsize)
        if titles is not None:
            plt.title(titles[idx], fontsize=fontsize + 2)
        colorbar(img)

    for ax in axs:
        ax.set_aspect('auto')
    plt.tight_layout(h_pad=1)
    plt.show()


def build_salient_phrase_candidates(sample_decomp_info, num_classes=2, score_threshold=1.1):
    # logic to filter unimportant phrases
    def is_salient_phrase(phrase_scores, num_classes, score_threshold):
        for cid in range(num_classes):
            x = phrase_scores[:, cid] > score_threshold
            if x.sum() == len(x):
                # all values for cid are greater than the threshold
                return True
        return False

    phrase_candidate_dict = defaultdict(lambda: {'sample_ids': [],
                                                 'decompose_scores': [],
                                                 'count': None,
                                                 'average_score': None,
                                                 'phrase_score': None,
                                                 'class_id': None})

    # iterate through all samples to construct phrase candidates
    for decomp_sample in sample_decomp_info:
        sample_id = decomp_sample[0]
        word_ids = decomp_sample[1]
        decomp_scores = np.array(decomp_sample[2])

        for idx in range(len(word_ids)):
            ngram_max = len(word_ids) - idx
            for ngram_len in range(1, ngram_max + 1):
                sid = idx
                eid = idx + ngram_len
                tmp_phr_scores = decomp_scores[sid:eid, :]
                if is_salient_phrase(tmp_phr_scores, num_classes, score_threshold):
                    tmp_phr_ids = tuple(word_ids[sid:eid])
                    tmp_phr_total_score = np.prod(tmp_phr_scores, axis=0, keepdims=True)
                    # record salient phrase candidate
                    phrase_candidate_dict[tmp_phr_ids]['sample_ids'].append(sample_id)
                    phrase_candidate_dict[tmp_phr_ids]['decompose_scores'].append(tmp_phr_total_score)
                else:
                    # because later ngrams cannot be salient phrases
                    break

    # calculate average scores and associated information for each phrase
    for key in phrase_candidate_dict:
        phr_dict = phrase_candidate_dict[key]
        # get expected decomposition score
        avg_score = np.mean(np.concatenate(phr_dict['decompose_scores'], axis=0), axis=0)
        # normalize
        avg_score = avg_score / avg_score.sum()
        max_score = np.max(avg_score)
        class_id = np.argmax(avg_score)

        phr_dict['count'] = len(phr_dict['sample_ids'])
        phr_dict['average_score'] = avg_score
        phr_dict['phrase_score'] = max_score
        phr_dict['class_id'] = class_id

    return phrase_candidate_dict


def get_salient_phrases(phrase_candidate_dict, word_id2str=None, num_classes=2, min_count=0):
    salient_phrases = [[] for _ in range(num_classes)]
    for phrase_ids_key in phrase_candidate_dict:
        phr_dict = phrase_candidate_dict[phrase_ids_key]
        if phr_dict['count'] < min_count:
            continue
        if word_id2str is None:
            words = None
        else:
            words = id_to_word(phrase_ids_key, word_id2str)
        # Note: this is not a deep copy, it just creates a new dict() object,
        # but values in the dictionary refer to previous memory spaces.
        new_phr_dict = {'phrase_ids': phrase_ids_key, 'words': words}
        for key in ['sample_ids', 'phrase_score']:
            new_phr_dict[key] = phr_dict[key]  # note: this is a shallow copy
        cid = phr_dict['class_id']
        salient_phrases[cid].append(new_phr_dict)

    for phr_dicts in salient_phrases:
        phr_dicts.sort(key=lambda x: x['phrase_score'], reverse=True)

    return salient_phrases
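
# Usage sketch (illustrative; the decomposition scores normally come from a trained
# model and the numbers below are made up). Each element of sample_decomp_info is a
# (sample_id, word_ids, decomp_scores) triple with one score row per token and one
# column per class:
#
#   sample_decomp_info = [
#       (0, [12, 7, 33], [[1.3, 0.8], [1.2, 0.9], [0.4, 1.5]]),
#   ]
#   cand_dict = build_salient_phrase_candidates(sample_decomp_info, num_classes=2,
#                                               score_threshold=1.1)
#   salient = get_salient_phrases(cand_dict, word_id2str=field.vocab.itos, min_count=1)
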
def get_confusion_matrix(raw_ids, pred_probs, true_labels, threshold=0.5):
    tps = []
    fps = []
    fns = []
    tns = []
    for idx, rid in enumerate(raw_ids):
        p = pred_probs[idx]
        if p >= threshold:
            if true_labels[idx] == 1:
                tps.append((rid, p))
            elif true_labels[idx] == 0:
                fps.append((rid, p))
            else:
                raise ValueError('Value for the label must be 1 or 0')
        else:
            if true_labels[idx] == 1:
                fns.append((rid, p))
            elif true_labels[idx] == 0:
                tns.append((rid, p))
            else:
                raise ValueError('Value for the label must be 1 or 0')
    return tps, fps, fns, tns


def resume_and_evaluate(rel_task, cpt_file_path, rel_dataset_iter):
    print('{} Resume and evaluate {}'.format(time.asctime(), cpt_file_path))
    if cpt_file_path is not None:
        rel_task.resume_model_from(cpt_file_path, strict=True)
    eids, pred_probs, true_labels = rel_task.get_prediction_probs_info(rel_dataset_iter)
    example_ids = eids.tolist()
    pred_probs = pred_probs[:, 1].numpy()
    true_labels = true_labels.numpy()
    pred_labels = (pred_probs > 0.5).astype(int)
    pred_info = {
        'example_ids': example_ids,
        'true_labels': true_labels,
        'pred_probs': pred_probs,
        'pred_labels': pred_labels
    }

    precs, recalls, threshes = precision_recall_curve(true_labels, pred_probs)
    pr_auc = auc(recalls, precs)
    avg_prec = average_precision_score(true_labels, pred_probs)
    dec_prec, dec_recall, dec_f1_score, _ = precision_recall_fscore_support(true_labels, pred_labels,
                                                                            average='binary')
    print('[Evaluate Results]: prec {:.3f}, recall {:.3f}, f1 {:.3f}, avg prec {:.3f}, pr auc {:.3f}'.format(
        dec_prec, dec_recall, dec_f1_score, avg_prec, pr_auc))

    return pred_info, precs, recalls, pr_auc, avg_prec, dec_f1_score, dec_prec, dec_recall


def retrain_and_evaluate(rel_task, new_train_file, model_store_prefix, rel_dataset_iter, print_loss_freq=1000):
    print('{} Re-train relation task with {}'.format(time.asctime(), new_train_file))
    rel_task.config['train_file'] = new_train_file
    rel_task.config['model_store_name_prefix'] = model_store_prefix
    rel_task.init_train_set()  # read new training data
    rel_task.init_neural_network()  # initialize neural network, optimizer, loss, dev state
    rel_task.train(print_loss_freq=print_loss_freq)

    best_cpt_path = os.path.join(rel_task.config['model_dir'], '{}.best'.format(model_store_prefix))
    dev_eval_results = resume_and_evaluate(rel_task, best_cpt_path, rel_dataset_iter)
    dev_f1_score = dev_eval_results[5]
    print('{} Re-train procedure completes, dev f1 score is {}'.format(time.asctime(), dev_f1_score))

    return dev_f1_score
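
# Usage sketch (illustrative; rel_task and dev_iter are hypothetical objects created
# elsewhere). The first element returned by resume_and_evaluate feeds directly into
# get_confusion_matrix for error analysis:
#
#   eval_results = resume_and_evaluate(rel_task, 'model_dir/rel_model.best', dev_iter)
#   pred_info = eval_results[0]
#   tps, fps, fns, tns = get_confusion_matrix(pred_info['example_ids'],
#                                             pred_info['pred_probs'],
#                                             pred_info['true_labels'],
#                                             threshold=0.5)
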
def plot_multi_pr_curves(plot_tuples, plot_title='Precision Recall Curves', figsize=(12, 8),
                         xlim=(0, 1), ylim=(0, 1), basic_font_size=14):
    plt.figure(figsize=figsize)
    for eval_infos, line_name, line_color in plot_tuples:
        precs = eval_infos[0]
        recalls = eval_infos[1]
        avg_prec = eval_infos[3]
        f1_score = eval_infos[6]
        plt.step(recalls, precs,
                 label=line_name + ' (AUC {0:.3f}, F1 {1:.3f})'.format(avg_prec, f1_score),
                 color=line_color)

        dec_prec = eval_infos[4]
        dec_recall = eval_infos[5]
        plt.plot(dec_recall, dec_prec, 'o', color=line_color, markersize=8)
        plt.vlines(dec_recall, 0, dec_prec, linestyles='dashed', colors=line_color)
        plt.hlines(dec_prec, 0, dec_recall, linestyles='dashed', colors=line_color)

    plt.legend(fontsize=basic_font_size)
    plt.title(plot_title, fontsize=basic_font_size + 2)
    plt.xlabel('Recall', fontsize=basic_font_size)
    plt.ylabel('Precision', fontsize=basic_font_size)
    plt.xticks(fontsize=basic_font_size)
    plt.yticks(fontsize=basic_font_size)
    plt.xlim(xlim)
    plt.ylim(ylim)


def plot_multi_agg_pr_curves(line_name2pr_list, plot_title='Aggregated Precision-Recall Curve', figsize=(12, 8),
                             xlim=(0, 1), ylim=(0, 1), basic_font_size=14):
    plt.figure(figsize=figsize)
    for line_name, (prec_list, recall_list) in line_name2pr_list.items():
        plt.step(recall_list, prec_list, label=line_name)

    plt.legend(fontsize=basic_font_size)
    plt.title(plot_title, fontsize=basic_font_size + 2)
    plt.xlabel('Recall', fontsize=basic_font_size)
    plt.ylabel('Precision', fontsize=basic_font_size)
    plt.xticks(fontsize=basic_font_size)
    plt.yticks(fontsize=basic_font_size)
    plt.grid(True)
    plt.xlim(xlim)
    plt.ylim(ylim)


def get_gpu_mem_usage(gpu_id):
    gpu_qargs = ['index', 'gpu_name', 'memory.used', 'memory.total']
    query_cmd = 'nvidia-smi -i {} --query-gpu={} --format=csv,noheader'.format(gpu_id, ','.join(gpu_qargs))
    pipe = os.popen(query_cmd)
    query_res = pipe.readlines()[0].strip('\n')
    items = query_res.split(',')
    mem_used = float(items[-2].strip(' MiB'))
    mem_total = float(items[-1].strip(' MiB'))

    return mem_used / mem_total


def wait_idle_gpu(gpu_id=None, mem_usage_ratio=0.01, sleep_second=2):
    if gpu_id is None:
        gpu_id = os.environ['CUDA_VISIBLE_DEVICES']
    print('{} Choose GPU {}, wait for memory usage ratio <= {}'.format(
        time.asctime(), gpu_id, mem_usage_ratio))
    sys.stdout.flush()

    while True:
        cur_mem_usage = get_gpu_mem_usage(gpu_id)
        if cur_mem_usage <= mem_usage_ratio:
            print('{} Current memory usage {:.5f}, start to bind gpu {}'.format(
                time.asctime(), cur_mem_usage, gpu_id))
            # with CUDA_VISIBLE_DEVICES set, the chosen device is seen as cuda:0 inside this process
            apply_gpu_memory(gpu_id=0)
            break
        ss = random.randint(sleep_second, sleep_second + 20)
        time.sleep(ss)


def apply_gpu_memory(gpu_id=0):
    print('{} Choose gpu {}'.format(time.asctime(), os.environ['CUDA_VISIBLE_DEVICES']))
    # quickly apply a small part of gpu memory
    tmp_tensor = torch.FloatTensor(100, 100)
    cuda_device = 'cuda:{}'.format(gpu_id)
    tmp_tensor = tmp_tensor.to(cuda_device)  # .to() is not in-place, keep the returned tensor


def set_all_random_seed(seed):
    print('Set random seed {}'.format(seed))
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
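
# Usage sketch (illustrative): a training script would typically fix all random seeds
# and wait for the target GPU (taken from CUDA_VISIBLE_DEVICES) to become idle before
# building the model:
#
#   set_all_random_seed(1234)
#   wait_idle_gpu(mem_usage_ratio=0.05)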