import argparse
import importlib
import os

import numpy as np
import pandas as pd
from keras import backend as K
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer

from experiments.settings import *


class Evaluator(object):
    def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
        """
        Test metadata format
        --------------------
        filename : string
        class_ids : string of ints with a space as the delimiter
        """
        test_dataset = pd.read_csv(IRMAS_TESTING_META_PATH, names=["filename", "class_ids"])
        self.X = list(test_dataset.filename)
        targets = [[int(category) for category in target.split()] for target in test_dataset.class_ids]
        self.ml_binarizer = MultiLabelBinarizer().fit(targets)
        self.y_true = self.ml_binarizer.transform(targets)
        self.y_pred = np.zeros(shape=self.y_true.shape)
        self.y_pred_raw = np.zeros(shape=self.y_true.shape)
        self.y_pred_raw_average = np.zeros(shape=self.y_true.shape)
        self.model_module = model_module
        self.weights_path = weights_path
        self.feature_filenames = os.listdir(os.path.join(IRMAS_TEST_FEATURE_BASEPATH, model_module.BASE_NAME))
        self.dataset_mean = np.load(os.path.join(MODEL_MEANS_BASEPATH, "{}_mean.npy".format(model_module.BASE_NAME)))
        self.evaluation_strategy = evaluation_strategy
        self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
        self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]

    def compute_prediction_scores_raw(self, model):
        # Accumulate per-excerpt class scores: summed for s2, averaged for s1.
        for i, (data_for_excerpt, filename) in enumerate(self._batch_generator(self.X)):
            one_excerpt_prediction = model.predict_on_batch(data_for_excerpt)
            if self.evaluation_strategy == "s2":
                self.y_pred_raw[i, :] = self._compute_prediction_sum(one_excerpt_prediction)
            else:
                self.y_pred_raw_average[i, :] = self._compute_prediction_sum(one_excerpt_prediction)

    def report_metrics(self, threshold):
        for average_strategy in ["micro", "macro"]:
            print("{} average strategy, threshold {}".format(average_strategy, threshold))
            print("precision:\t{}".format(precision_score(self.y_true, self.y_pred, average=average_strategy)))
            print("recall:\t{}".format(recall_score(self.y_true, self.y_pred, average=average_strategy)))
            print("f1:\t{}".format(f1_score(self.y_true, self.y_pred, average=average_strategy)))

    def evaluate(self):
        model = self.model_module.build_model(IRMAS_N_CLASSES)
        model.load_weights(self.weights_path)
        model.compile(optimizer="sgd", loss="categorical_crossentropy")
        self.compute_prediction_scores_raw(model)

        if self.evaluation_strategy == "s2":
            # s2: normalise each excerpt's summed scores by their maximum, then threshold.
            for threshold in self.thresholds_s2:
                self.y_pred = np.copy(self.y_pred_raw)
                for i in range(self.y_pred.shape[0]):
                    self.y_pred[i, :] /= self.y_pred[i, :].max()
                self.y_pred[self.y_pred >= threshold] = 1
                self.y_pred[self.y_pred < threshold] = 0
                self.report_metrics(threshold)
        else:
            # s1: threshold the averaged scores directly.
            for threshold in self.thresholds_s1:
                self.y_pred = np.copy(self.y_pred_raw_average)
                self.y_pred[self.y_pred < threshold] = 0
                self.y_pred[self.y_pred >= threshold] = 1
                self.report_metrics(threshold)

    def _load_features(self, audio_filename):
        features = list()
        for feature_filename in self.feature_filenames:
            if audio_filename in feature_filename:
                filename_full_path = os.path.join(IRMAS_TEST_FEATURE_BASEPATH,
                                                  self.model_module.BASE_NAME,
                                                  feature_filename)
                feature = np.load(filename_full_path)
                feature -= self.dataset_mean
                features.append(feature)

        if K.image_dim_ordering() == "th":
            features = np.array(features).reshape(-1, 1, self.model_module.N_MEL_BANDS, self.model_module.SEGMENT_DUR)
        else:
            features = np.array(features).reshape(-1, self.model_module.N_MEL_BANDS, self.model_module.SEGMENT_DUR, 1)
        return features

    def _batch_generator(self, inputs):
        for audio_filename in inputs:
            yield self._load_features(audio_filename), audio_filename

    def _compute_prediction_sum(self, predictions):
        prediction_sum = np.zeros(IRMAS_N_CLASSES)
        for prediction in predictions:
            prediction_sum += prediction
        if self.evaluation_strategy == "s1":  # simple averaging strategy
            prediction_sum /= predictions.shape[0]
        return prediction_sum


def main():
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-m",
                         action="store",
                         dest="model",
                         help="-m model to evaluate")
    aparser.add_argument("-w",
                         action="store",
                         dest="weights_path",
                         help="-w path to file with weights for selected model")
    aparser.add_argument("-s",
                         action="store",
                         dest="evaluation_strategy",
                         default="s2",
                         help="-s evaluation strategy: `s1` (simple averaging and thresholding) or "
                              "`s2` (summarization, normalization by max probability and thresholding)")
    args = aparser.parse_args()

    if not (args.model and args.weights_path):
        aparser.error("Please, specify the model and the weights path to evaluate!")

    try:
        if args.model in ALLOWED_MODELS:
            model_module = importlib.import_module(".{}".format(args.model), "experiments.models")
            print("{} imported as 'model'".format(args.model))
        else:
            print("The specified model is not allowed")
            return
        if not os.path.exists(args.weights_path):
            print("The specified weights path doesn't exist")
            return
    except ImportError as e:
        print(e)
        return

    evaluator = Evaluator(model_module, args.weights_path, args.evaluation_strategy)
    evaluator.evaluate()


if __name__ == "__main__":
    main()
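
# ---------------------------------------------------------------------------
# Usage sketch (not authoritative): the script name, model identifier and
# weights path below are placeholders -- substitute an actual entry from
# ALLOWED_MODELS and the weights file you trained:
#
#     python evaluate.py -m han16 -w /path/to/weights.hdf5 -s s2
#
# The s2 post-processing in Evaluator.evaluate() reduces to the toy
# transformation below (per-excerpt max-normalisation, then thresholding):
#
#     >>> import numpy as np
#     >>> scores = np.array([[0.2, 0.8, 0.4]])         # summed per-class scores
#     >>> scores /= scores.max(axis=1, keepdims=True)  # -> [[0.25, 1.0, 0.5]]
#     >>> (scores >= 0.5).astype(int)                  # threshold 0.5
#     array([[0, 1, 1]])
# ---------------------------------------------------------------------------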