python source code of exp4p

""" EXP4.P: An extention to exponential-weight algorithm for exploration and
exploitation. This module contains a class that implements EXP4.P, a contextual
bandit algorithm with expert advice.
"""

import logging

import six
from six.moves import zip
import numpy as np

from striatum.bandit.bandit import BaseBandit

LOGGER = logging.getLogger(__name__)


class Exp4P(BaseBandit):
    r"""Exp4.P with pre-trained supervised learning algorithm.

    Parameters
    ----------
    actions : list of Action objects
        List of actions to be chosen from.

    historystorage: a HistoryStorage object
        The place where we store the histories of contexts and rewards.

    modelstorage: a ModelStorage object
        The place where we store the model parameters.

    delta: float, 0 < delta <= 1
        With probability 1 - delta, LinThompSamp satisfies the theoretical
        regret bound.

    p_min: float, 0 < p_min < 1/k
        The minimum probability to choose each action.

    References
    ----------
    .. [1]  Beygelzimer, Alina, et al. "Contextual bandit algorithms with
            supervised learning guarantees." International Conference on
            Artificial Intelligence and Statistics (AISTATS). 2011u.
    """

    def __init__(self, actions, historystorage, modelstorage, delta=0.1,
                 p_min=None, max_rounds=10000):
        super(Exp4P, self).__init__(historystorage, modelstorage, actions)
        self.n_total = 0
        # number of actions (i.e. K in the paper)
        self.n_actions = len(self._actions)
        self.max_rounds = max_rounds

        # delta > 0
        if not isinstance(delta, float):
            raise ValueError("delta should be float, the one"
                             "given is: %f" % p_min)
        self.delta = delta

        # p_min in [0, 1/k]
        if p_min is None:
            self.p_min = np.sqrt(np.log(10) / self.n_actions / self.max_rounds)
        elif not isinstance(p_min, float):
            raise ValueError("p_min should be float, the one"
                             "given is: %f" % p_min)
        elif (p_min < 0) or (p_min > (1. / self.n_actions)):
            raise ValueError("p_min should be in [0, 1/k], the one"
                             "given is: %f" % p_min)
        else:
            self.p_min = p_min

        # Initialize the model storage

        model = {
            # probability distribution for action recommendation
            'action_probs': {},
            # weight vector for each expert
            'w': {},
        }
        self._modelstorage.save_model(model)

    def _exp4p_score(self, context):
        """The main part of Exp4.P.
        """
        advisor_ids = list(six.viewkeys(context))

        w = self._modelstorage.get_model()['w']
        if len(w) == 0:
            for i in advisor_ids:
                w[i] = 1
        w_sum = sum(six.viewvalues(w))

        action_probs_list = []
        for action_id in self.action_ids:
            weighted_exp = [w[advisor_id] * context[advisor_id][action_id]
                            for advisor_id in advisor_ids]
            prob_vector = np.sum(weighted_exp) / w_sum
            action_probs_list.append((1 - self.n_actions * self.p_min)
                                     * prob_vector
                                     + self.p_min)
        action_probs_list = np.asarray(action_probs_list)
        action_probs_list /= action_probs_list.sum()

        estimated_reward = {}
        uncertainty = {}
        score = {}
        for action_id, action_prob in zip(self.action_ids, action_probs_list):
            estimated_reward[action_id] = action_prob
            uncertainty[action_id] = 0
            score[action_id] = action_prob
        self._modelstorage.save_model(
            {'action_probs': estimated_reward, 'w': w})

        return estimated_reward, uncertainty, score

    def get_action(self, context=None, n_actions=1):
        """Return the action to perform

        Parameters
        ----------
        context : dictionary
            Contexts {expert_id: {action_id: expert_prediction}} of
            different actions.

        n_actions: int
            Number of actions wanted to recommend users.

        Returns
        -------
        history_id : int
            The history id of the action.

        action_recommendation : list of dictionaries
            In each dictionary, it will contains {Action object,
            estimated_reward, uncertainty}.
        """
        estimated_reward, uncertainty, score = self._exp4p_score(context)

        action_recommendation = []
        action_recommendation_ids = sorted(score, key=score.get,
                                           reverse=True)[:n_actions]

        for action_id in action_recommendation_ids:
            action = self.get_action_with_id(action_id)
            action_recommendation.append({
                'action': action,
                'estimated_reward': estimated_reward[action_id],
                'uncertainty': uncertainty[action_id],
                'score': score[action_id],
            })

        self.n_total += 1
        history_id = self._historystorage.add_history(
            context, action_recommendation, reward=None)
        return history_id, action_recommendation

    def reward(self, history_id, rewards):
        """Reward the previous action with reward.

        Parameters
        ----------
        history_id : int
            The history id of the action to reward.

        rewards : dictionary
            The dictionary {action_id, reward}, where reward is a float.
        """
        context = (self._historystorage
                   .get_unrewarded_history(history_id)
                   .context)

        model = self._modelstorage.get_model()
        w = model['w']
        action_probs = model['action_probs']
        action_ids = list(six.viewkeys(six.next(six.itervalues(context))))

        # Update the model
        for action_id, reward in six.viewitems(rewards):
            y_hat = {}
            v_hat = {}
            for i in six.viewkeys(context):
                y_hat[i] = (context[i][action_id] * reward
                            / action_probs[action_id])
                v_hat[i] = sum(
                    [context[i][k] / action_probs[k] for k in action_ids])
                w[i] = w[i] * np.exp(
                    self.p_min / 2
                    * (y_hat[i] + v_hat[i]
                       * np.sqrt(np.log(len(context) / self.delta)
                                 / (len(action_ids) * self.max_rounds))))

        self._modelstorage.save_model({
            'action_probs': action_probs, 'w': w})

        # Update the history
        self._historystorage.add_reward(history_id, rewards)