#!/usr/bin/env python
# encoding: utf-8
import numpy as np
from sklearn.metrics import roc_auc_score

# Raw formats:
# 1. true: all indices and values; predicted: all indices and values
# Should change to:
#    true: query-indexed lists (sequentially ordered);
#    predicted: query-indexed lists (sequentially ordered)


def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain (DCG) at rank k

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    DCG @k : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    # Ranks start at 1, so the discount for position i is log2(i + 2).
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    """Normalized discounted cumulative gain (NDCG) at rank k

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    NDCG @k : float
    """
    # Callers must ensure y_true has at least one nonzero label, otherwise
    # the ideal DCG is zero and this divides by zero.
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best
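
# A minimal, hand-checkable sanity test for the two functions above. This
# helper is an illustrative addition, not part of the original script.
def _ndcg_demo():
    # A perfect ranking of relevance labels [3, 2, 0] scores NDCG = 1.0:
    # ideal DCG@3 = (2**3 - 1)/log2(2) + (2**2 - 1)/log2(3) + 0 ~= 8.893.
    assert np.isclose(ndcg_score([3, 2, 0], [0.9, 0.5, 0.1], k=3), 1.0)
    # Reversing the ranking gives DCG@3 = 0 + 3/log2(3) + 7/log2(4) ~= 5.393,
    # so NDCG@3 ~= 5.393 / 8.893 ~= 0.606.
    assert np.isclose(ndcg_score([3, 2, 0], [0.1, 0.5, 0.9], k=3), 0.6064,
                      atol=1e-3)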

def average_precision(y_true, y_score, k=None):
    """Average precision at rank k

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    average precision @k : float
    """
    assert len(y_true) == len(y_score)
    unique_y = np.unique(y_true)
    if len(unique_y) > 2:
        raise ValueError("Only supported for two relevance levels.")
    elif len(unique_y) <= 1:
        return 0.0
    if k is None:
        k = len(y_true)
    else:
        k = min(k, len(y_true))
    pos_label = unique_y[1]
    order = np.argsort(y_score)[::-1][:k]
    y_true = np.asarray(y_true)[order]
    score = 0
    cur_count = 0
    for i in range(len(y_true)):
        if y_true[i] == pos_label:
            # Precision up to document i, i.e. the fraction of relevant
            # documents among the top i + 1.
            cur_count += 1
            score += cur_count / (i + 1.0)
    if cur_count == 0:
        return 0
    return score / cur_count


# Preprocessing for reading data: return a dict mapping each query to its
# list of scores, ordered by document id.
def read_score_file(filename):
    score_lists = {}
    with open(filename) as f:
        for line in f:
            pair = line.split()
            q = pair[0]
            doc = pair[1]
            score = float(pair[2])
            if q not in score_lists:
                score_lists[q] = [[], []]
            score_lists[q][0].append(doc)
            score_lists[q][1].append(score)
    index_ranked_lists = {}
    for q in score_lists:
        docs, scores = zip(*sorted(zip(score_lists[q][0], score_lists[q][1])))
        index_ranked_lists[q] = list(scores)
    return index_ranked_lists


# Remove queries that have no relevant documents.
def strip_all_0_queries(y_dict):
    y_stripped_dict = {}
    for q in y_dict:
        val = y_dict[q]
        if np.sum(val) != 0:
            y_stripped_dict[q] = val
    return y_stripped_dict
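
# A hedged, self-contained example of the score-file format read_score_file
# expects: one whitespace-separated "query doc score" triple per line. The
# helper below is illustrative only and not part of the original script.
def _read_score_file_demo():
    import os
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
        f.write("q1 d2 0.3\nq1 d1 0.9\nq2 d1 0.5\n")
        path = f.name
    try:
        # Scores come back ordered by document id within each query, so the
        # true and predicted files align positionally as long as they cover
        # the same (query, doc) pairs.
        assert read_score_file(path) == {'q1': [0.9, 0.3], 'q2': [0.5]}
    finally:
        os.remove(path)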

def eval_RMSE(Y_true, Y_pred, x_index, y_index):
    return np.sqrt(np.mean((np.asarray(Y_true[x_index, y_index]).flatten()
                            - Y_pred[x_index, y_index]) ** 2))


def eval_MAP(relevanceLists, predicted):
    # Both arguments map each query to a list of document ids; the predicted
    # lists are assumed to be in ranked order.
    aps = []
    for q in relevanceLists:
        lhat = predicted[q]
        ltruth = relevanceLists[q]
        correct = 0.0
        sum_precision = 0.0
        for pos in range(len(lhat)):
            if lhat[pos] in ltruth:
                correct = correct + 1
                sum_precision = sum_precision + correct / (pos + 1)
        aps.append(sum_precision / len(ltruth))
    return sum(aps) / len(aps)


# Not including MAP and AUC.
def eval_cf_scores(y_true_dict, y_score_dict):
    # When calculating RMSE, make sure y_true_dict is the full dict of lists.
    ndcgs = [[], [], []]  # NDCG at 1, 3, 5
    for q in y_true_dict:
        if q not in y_score_dict:
            raise ValueError("Prediction has missing items.")
        if np.sum(y_true_dict[q]) != 0:
            ndcgs[0].append(ndcg_score(y_true_dict[q], y_score_dict[q], k=1))
            ndcgs[1].append(ndcg_score(y_true_dict[q], y_score_dict[q], k=3))
            ndcgs[2].append(ndcg_score(y_true_dict[q], y_score_dict[q], k=5))
    ndcgs = np.asarray(ndcgs)
    y_true_list = trans_dict_to_list(y_true_dict, y_true_dict)
    y_score_list = trans_dict_to_list(y_true_dict, y_score_dict)
    rmse = np.mean((y_true_list - y_score_list) ** 2)
    # ndcg@1, ndcg@3, ndcg@5, rmse
    return (np.mean(ndcgs[0, :]), np.mean(ndcgs[1, :]),
            np.mean(ndcgs[2, :]), np.sqrt(rmse))


# Including MAP and AUC.
def eval_all_scores(y_true_dict, y_score_dict):
    # When calculating RMSE, make sure y_true_dict is the full dict of lists.
    aps = []  # average precisions
    ndcgs = [[], [], []]  # NDCG at 1, 3, 5
    for q in y_true_dict:
        if q not in y_score_dict:
            raise ValueError("Prediction has missing items.")
        if np.sum(y_true_dict[q]) != 0:
            aps.append(average_precision(y_true_dict[q], y_score_dict[q]))
            ndcgs[0].append(ndcg_score(y_true_dict[q], y_score_dict[q], k=1))
            ndcgs[1].append(ndcg_score(y_true_dict[q], y_score_dict[q], k=3))
            ndcgs[2].append(ndcg_score(y_true_dict[q], y_score_dict[q], k=5))
    ndcgs = np.asarray(ndcgs)
    y_true_list = trans_dict_to_list(y_true_dict, y_true_dict)
    y_score_list = trans_dict_to_list(y_true_dict, y_score_dict)
    auc = roc_auc_score(y_true_list, y_score_list)
    rmse = np.mean((y_true_list - y_score_list) ** 2)
    # map, ndcg@1, ndcg@3, ndcg@5, auc, rmse
    return (sum(aps) / len(aps), np.mean(ndcgs[0, :]), np.mean(ndcgs[1, :]),
            np.mean(ndcgs[2, :]), auc, np.sqrt(rmse))


# Including MAP and AUC.
def eval_all_scores_from_array(y_true_array, y_score_array, mask):
    y_true_dict = trans_array_to_dict(y_true_array, mask)
    y_score_dict = trans_array_to_dict(y_score_array, mask)
    return eval_all_scores(y_true_dict, y_score_dict)


# Excluding MAP and AUC.
def eval_cf_scores_from_array(y_true_array, y_score_array, mask):
    y_true_dict = trans_array_to_dict(y_true_array, mask)
    y_score_dict = trans_array_to_dict(y_score_array, mask)
    return eval_cf_scores(y_true_dict, y_score_dict)


# Return the dict; note 'mask' has to be of dtype boolean.
def trans_array_to_dict(array, mask):
    ret_dict = {}
    for i in range(mask.shape[0]):
        ret_dict[i] = array[i][mask[i]]
    return ret_dict


# Return the stacked numpy array.
def trans_dict_to_list(keys, y_dict):
    stack_list = [y_dict[key] for key in keys]
    return np.hstack(stack_list)


def eval_all_scores_from_file(y_true_file, y_score_file):
    y_true_dict = read_score_file(y_true_file)
    y_score_dict = read_score_file(y_score_file)
    return eval_all_scores(y_true_dict, y_score_dict)


def eval_cf_scores_from_file(y_true_file, y_score_file):
    y_true_dict = read_score_file(y_true_file)
    y_score_dict = read_score_file(y_score_file)
    return eval_cf_scores(y_true_dict, y_score_dict)


if __name__ == '__main__':
    y_true_dict = read_score_file('/clair/yuexinw/research/conv_cf/relational/split_data/cmu/link.tes.5.txt')
    y_pred_dict = read_score_file('/clair/yuexinw/research/conv_cf/TOP-plus-plus/split_cmu_pred_dir/prediction')
    # y_true_dict = strip_all_0_queries(y_true_dict)
    print(eval_all_scores(y_true_dict, y_pred_dict))
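    # Hypothetical smoke test on synthetic data (an illustrative addition,
    # not part of the original script); _ndcg_demo and _read_score_file_demo
    # above can be run the same way. With every relevant item outscoring
    # every irrelevant one, expect MAP = NDCG@k = AUC = 1.0 and
    # RMSE = sqrt(0.28 / 6) ~= 0.216.
    toy_true = np.array([[1, 0, 0], [0, 1, 1]])
    toy_pred = np.array([[0.9, 0.2, 0.1], [0.3, 0.8, 0.7]])
    toy_mask = np.ones_like(toy_true, dtype=bool)
    print(eval_all_scores_from_array(toy_true, toy_pred, toy_mask))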