# -*- coding: utf-8 -*-

import numpy as np, warnings, ctypes
from .utils import _check_constructor_input, _check_beta_prior, \
            _check_smoothing, _check_fit_input, _check_X_input, _check_1d_inp, \
            _ZeroPredictor, _OnePredictor, _OneVsRest,\
            _BootstrappedClassifier_w_predict, _BootstrappedClassifier_w_predict_proba, \
            _BootstrappedClassifier_w_decision_function, _check_njobs, \
            _check_bools, _check_refit_buffer, _check_refit_inp, _check_random_state, \
            _check_autograd_supported, _get_logistic_grads_norms, \
            _gen_random_grad_norms, _gen_zero_norms, \
            _apply_softmax, _apply_inverse_sigmoid, \
            _LinUCB_n_TS_single, _LogisticUCB_n_TS_single, \
            _TreeUCB_n_TS_single
from ._cy_utils import _choice_over_rows, topN_byrow, topN_byrow_softmax

# Public API of this module: the policy classes exported via
# `from <module> import *`. Keep in sync with the classes defined below.
__all__ = ["BootstrappedUCB", "BootstrappedTS",
           "LogisticUCB", "LogisticTS",
           "SeparateClassifiers", "EpsilonGreedy", "AdaptiveGreedy",
           "ExploreFirst", "ActiveExplorer", "SoftmaxExplorer",
           "LinUCB", "LinTS", "ParametricTS",
           "PartitionedUCB", "PartitionedTS"]

class _BasePolicy:
    def _add_common_params(self, base_algorithm, beta_prior, smoothing, noise_to_smooth,
            njobs, nchoices, batch_train, refit_buffer, deep_copy_buffer, assume_unique_reward,
            random_state, assign_algo = True, prior_def_ucb = False,
            force_unfit_predict = False):
        
        if isinstance(base_algorithm, np.ndarray) or base_algorithm.__class__.__name__ == "Series":
            base_algorithm = list(base_algorithm)

        self._add_choices(nchoices)
        _check_constructor_input(base_algorithm, self.nchoices, batch_train)
        self.smoothing = _check_smoothing(smoothing)
        self.noise_to_smooth = bool(noise_to_smooth)
        self.njobs = _check_njobs(njobs)
        self.batch_train, self.assume_unique_reward = _check_bools(batch_train, assume_unique_reward)
        self.beta_prior = _check_beta_prior(beta_prior, self.nchoices, prior_def_ucb)
        self.random_state = _check_random_state(random_state)
        self.force_unfit_predict = bool(force_unfit_predict)

        if assign_algo:
            self.base_algorithm = base_algorithm
            if ("warm_start" in dir(self.base_algorithm)) and (self.base_algorithm.warm_start):
                self.has_warm_start = True
            else:
                self.has_warm_start = False
        else:
            self.has_warm_start = False

        self.refit_buffer = _check_refit_buffer(refit_buffer, self.batch_train)
        self.deep_copy_buffer = bool(deep_copy_buffer)

        ### For compatibility with the active policies
        self._force_fit = self.force_unfit_predict
        self._force_counters = False

        self.is_fitted = False

    def _add_choices(self, nchoices):
        if isinstance(nchoices, int):
            self.nchoices = nchoices
            self.choice_names = None
        elif isinstance(nchoices, list) or nchoices.__class__.__name__ == "Series" or nchoices.__class__.__name__ == "DataFrame":
            self.choice_names = np.array(nchoices).reshape(-1)
            self.nchoices = self.choice_names.shape[0]
            if np.unique(self.choice_names).shape[0] != self.choice_names.shape[0]:
                raise ValueError("Arm/choice names contain duplicates.")
        elif isinstance(nchoices, np.ndarray):
            self.choice_names = nchoices.reshape(-1)
            self.nchoices = self.choice_names.shape[0]
            if np.unique(self.choice_names).shape[0] != self.choice_names.shape[0]:
                raise ValueError("Arm/choice names contain duplicates.")
        else:
            raise ValueError("'nchoices' must be an integer or list with named arms.")

    def _add_bootstrapped_inputs(self, base_algorithm, batch_sample_method,
                                 nsamples, njobs_samples, percentile,
                                 ts_byrow = False, ts_weighted = False):
        assert (batch_sample_method == 'gamma') or (batch_sample_method == 'poisson')
        assert isinstance(nsamples, int)
        assert nsamples >= 1
        self.batch_sample_method = batch_sample_method
        self.nsamples = nsamples
        self.njobs_samples = _check_njobs(njobs_samples)
        if not isinstance(base_algorithm, list):
            self.base_algorithm = self._make_bootstrapped(base_algorithm, percentile,
                                                          ts_byrow, ts_weighted)
        else:
            self.base_algorithm = [ \
                self._make_bootstrapped(alg, percentile, ts_byrow, ts_weighted) \
                for alg in base_algorithm]

    def _make_bootstrapped(self, base_algorithm, percentile,
                           ts_byrow, ts_weighted):
        if "predict_proba" in dir(base_algorithm):
            return _BootstrappedClassifier_w_predict_proba(
                base_algorithm, self.nsamples, percentile,
                self.batch_train, self.batch_sample_method,
                random_state = 1, ### gets changed later
                njobs = self.njobs_samples,
                ts_byrow = ts_byrow,
                ts_weighted = ts_weighted
                )
        elif "decision_function" in dir(base_algorithm):
            return _BootstrappedClassifier_w_decision_function(
                base_algorithm, self.nsamples, percentile,
                self.batch_train, self.batch_sample_method,
                random_state = 1, ### gets changed later
                njobs = self.njobs_samples,
                ts_byrow = ts_byrow,
                ts_weighted = ts_weighted
                )
        else:
            return _BootstrappedClassifier_w_predict(
                base_algorithm, self.nsamples, percentile,
                self.batch_train, self.batch_sample_method,
                random_state = 1, ### gets changed later
                njobs = self.njobs_samples,
                ts_byrow = ts_byrow,
                ts_weighted = ts_weighted
                )

    def _name_arms(self, pred):
        if self.choice_names is None:
            return pred
        else:
            if not np.issubdtype(pred.dtype, np.integer):
                pred = pred.astype(int)
            return self.choice_names[pred]

    def drop_arm(self, arm_name):
        """
        Drop an arm/choice

        Drops (removes/deletes) an arm from the set of available choices to the policy.

        Note
        ----
        The available arms, if named, are stored in attribute 'choice_names'.
        
        Parameters
        ----------
        arm_name : int or object
            Arm to drop. If passing an integer, will drop at that index (starting at zero). Otherwise,
            will drop the arm matching this name (argument must be of the same type as the individual entries
            passed to 'nchoices' in the initialization).

        Returns
        -------
        self : object
            This object
        """
        if not self.is_fitted:
            raise ValueError("Cannot drop arm from unifitted policy.")
        drop_ix = self._get_drop_ix(arm_name)
        self._oracles._drop_arm(drop_ix)
        self._drop_ix(drop_ix)
        self.has_warm_start = False
        return self

    def _get_drop_ix(self, arm_name):
        if isinstance(arm_name, int):
            if arm_name > self.nchoices:
                raise ValueError("Object has only ", str(self.nchoices), " arms.")
            drop_ix = arm_name
        else:
            if self.choice_names is None:
                raise ValueError("If arms are not named, must pass an integer value.")
            for choice in range(self.nchoices):
                if self.choice_names[choice] == arm_name:
                    drop_ix = choice
                    break
            else:
                raise ValueError("No arm named '", str(arm_name), "' - current names are stored in attribute 'choice_names'.")
        return drop_ix

    def _drop_ix(self, drop_ix):
        if self.choice_names is None:
            self.choice_names = np.arange(self.nchoices)
        self.nchoices -= 1
        self.choice_names = np.r_[self.choice_names[:drop_ix], self.choice_names[drop_ix + 1:]]
        if isinstance(self, _ActivePolicy):
            if isinstance(self._get_grad_norms, list):
                self._get_grad_norms[:drop_ix] + self._get_grad_norms[drop_ix + 1:]
            if isinstance(self._rand_grad_norms, list):
                self._rand_grad_norms[:drop_ix] + self._rand_grad_norms[drop_ix + 1:]

    ## TODO: maybe add functionality to take an arm from another object of this class

    def add_arm(self, arm_name = None, fitted_classifier = None,
                n_w_rew = 0, n_wo_rew = 0,
                refit_buffer_X = None, refit_buffer_r = None,
                f_grad_norm = None, case_one_class = None):
        """
        Adds a new arm to the pool of choices

        Parameters
        ----------
        arm_name : object
            Name for this arm. Only applicable when using named arms. If None, will use the name of the last
            arm plus 1 (will only work when the names are integers).
        fitted_classifier : object
            If a classifier has already been fit to rewards coming from this arm, you can pass it here, otherwise,
            will be started from the same 'base_classifier' as the initial arms. If using bootstrapped methods or methods from this module which do not
            accept arbitrary classifiers as input,
            don't pass a classifier here (unless using the classes like e.g. `utils._BootstrappedClassifierBase`)
        n_w_rew : int
            Number of trials/rounds with rewards coming from this arm (only used when using a beta prior or smoothing).
        n_wo_rew : int
            Number of trials/rounds without rewards coming from this arm (only used when using a beta prior or smoothing).
        refit_buffer_X : array(m, n) or None
            Refit buffer of 'X' data to use for the new arm. Ignored when using
            'batch_train=False' or 'refit_buffer=None'.
        refit_buffer_r : array(m,) or None
            Refit buffer of rewards data to use for the new arm. Ignored when using
            'batch_train=False' or 'refit_buffer=None'.
        f_grad_norm : function
            Gradient calculation function to use for this arm. This is only
            for the policies that make choices according to active learning
            criteria, and only for situations in which the policy was passed
            different functions for each arm.
        case_one_class : function
            Gradient workaround function for single-class data. This is only
            for the policies that make choices according to active learning
            criteria, and only for situations in which the policy was passed
            different functions for each arm.

        Returns
        -------
        self : object
            This object
        """
        if not self.is_fitted:
            raise ValueError("Cannot add arm to unfitted policy.")
        assert isinstance(n_w_rew,  int)
        assert isinstance(n_wo_rew, int)
        assert n_w_rew >= 0
        assert n_wo_rew >= 0
        refit_buffer_X, refit_buffer_r = \
            _check_refit_inp(refit_buffer_X, refit_buffer_r, self.refit_buffer)
        arm_name = self._check_new_arm_name(arm_name)
        if isinstance(self, _ActivePolicy):
            if isinstance(self._get_grad_norms, list):
                if not callable(f_grad_norm):
                    raise ValueError("'f_grad_norm' must be a function.")
            if isinstance(self._rand_grad_norms, list):
                if not callable(case_one_class):
                    raise ValueError("'case_one_class' must be a function.")

        self._oracles._spawn_arm(fitted_classifier, n_w_rew, n_wo_rew,
                                 refit_buffer_X, refit_buffer_r)
        self._append_arm(arm_name, f_grad_norm, case_one_class)
        return self

    def _check_new_arm_name(self, arm_name):
        if self.choice_names is None and arm_name is not None:
            raise ValueError("Cannot create a named arm when no names were passed to 'nchoices'.")
        if arm_name is None and self.choice_names is not None:
            try:
                arm_name = self.choice_names[-1] + 1
            except:
                raise ValueError("Must provide an arm name when using named arms.")
        return arm_name

    def _append_arm(self, arm_name, f_grad_norm, case_one_class):
        if self.choice_names is not None:
            self.choice_names = np.r_[self.choice_names, np.array(arm_name).reshape(-1)]
        if f_grad_norm is not None:
            self._get_grad_norms.append(f_grad_norm)
        if case_one_class is not None:
            self._rand_grad_norms.append(case_one_class)
        self.nchoices += 1

    def fit(self, X, a, r, warm_start=False):
        """
        Fits the base algorithm (one per class [and per sample if bootstrapped]) to partially labeled data.

        Parameters
        ----------
        X : array(n_samples, n_features) or CSR(n_samples, n_features)
            Matrix of covariates for the available data.
        a : array(n_samples, ), int type
            Arms or actions that were chosen for each observations.
        r : array(n_samples, ), {0,1}
            Rewards that were observed for the chosen actions. Must be binary rewards 0/1.
        warm_start : bool
            Whether to use the results of previous calls to 'fit' as a start
            for fitting to the 'X' data passed here. This will only be available
            if the base classifier has a property ``warm_start`` too and that
            property is also set to 'True'. You can double-check that it's
            recognized as such by checking this object's property
            ``has_warm_start``. Passing 'True' when the classifier doesn't
            support warm start despite having the property might slow down
            things.
            Dropping arms will make this functionality unavailable.
            This options is not available for 'BootstrappedUCB',
            nor for 'BootstrappedTS'.

        Returns
        -------
        self : obj
            This object
        """
        X, a, r = _check_fit_input(X, a, r, self.choice_names)
        use_warm = warm_start and self.has_warm_start and self.is_fitted
        self._oracles = _OneVsRest(self.base_algorithm,
                                   X, a, r,
                                   self.nchoices,
                                   self.beta_prior[1], self.beta_prior[0][0], self.beta_prior[0][1],
                                   self.random_state,
                                   self.smoothing, self.noise_to_smooth,
                                   self.assume_unique_reward,
                                   self.batch_train,
                                   refit_buffer = self.refit_buffer,
                                   deep_copy = self.deep_copy_buffer,
                                   force_fit = self._force_fit,
                                   force_counters = self._force_counters,
                                   prev_ovr = self._oracles if self.is_fitted else None,
                                   warm = use_warm,
                                   force_unfit_predict = self.force_unfit_predict,
                                   njobs = self.njobs)
        self.is_fitted = True
        return self
    
    def partial_fit(self, X, a, r):
        """
        Fits the base algorithm (one per class) to partially labeled data in batches.
        
        Note
        ----
        In order to use this method, the base classifier must have a 'partial_fit' method,
        such as 'sklearn.linear_model.SGDClassifier'. This method is not available
        for 'LogisticUCB', nor for 'LogisticTS'.

        Parameters
        ----------
        X : array(n_samples, n_features) or CSR(n_samples, n_features)
            Matrix of covariates for the available data.
        a : array(n_samples, ), int type
            Arms or actions that were chosen for each observations.
        r : array(n_samples, ), {0,1}
            Rewards that were observed for the chosen actions. Must be binary rewards 0/1.

        Returns
        -------
        self : obj
            This object
        """
        if not self.batch_train:
            raise ValueError("Must pass 'batch_train' = 'True' to use '.partial_fit'.")
        if '_oracles' in dir(self):
            X, a, r =_check_fit_input(X, a, r, self.choice_names)
            self._oracles.partial_fit(X, a, r)
            self.is_fitted = True
            return self
        else:
            return self.fit(X, a, r)
    
    def decision_function(self, X):
        """
        Get the scores for each arm following this policy's action-choosing criteria.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            Data for which to obtain decision function scores for each arm.
        
        Returns
        -------
        scores : array (n_samples, n_choices)
            Scores following this policy for each arm.
        """
        X = _check_X_input(X)
        if not self.is_fitted:
            warnings.warn("Model object has not been fit to data, predictions will be random.")
            return self.random_state.random(size=(X.shape[0], self.nchoices))
        return self._score_matrix(X)

    def _score_matrix(self, X):
        return self._oracles.decision_function(X)

    def _predict_random_if_unfit(self, X, output_score):
        warnings.warn("Model object has not been fit to data, predictions will be random.")
        X = _check_X_input(X)
        pred = self._name_arms(self.random_state.integers(self.nchoices, size = X.shape[0]))
        if not output_score:
            return pred
        else:
            return {"choice" : pred, "score" : (1.0 / self.nchoices) * np.ones(size = X.shape[0], dtype = "float64")}

    def topN(self, X, n):
        """
        Get top-N ranked actions for each observation

        Note
        ----
        This method will rank choices/arms according to what the policy
        dictates - it is not an exploitation-mode rank, so if e.g. there are
        random choices for some observations, there will be random ranks in here.

        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to rank actions according to this policy.
        n : int
            Number of top-ranked actions to output

        Returns
        -------
        topN : array(n_samples, n)
            The top-ranked actions for each observation
        """
        assert n >= 1
        if isinstance(n, float):
            n = int(n)
        assert isinstance(n, int)
        if n > self.nchoices:
            raise ValueError("'n' cannot be greater than 'nchoices'.")
        X = _check_X_input(X)
        scores = self._score_matrix(X)
        if n == self.nchoices:
            topN = np.argsort(scores, axis=1)
        else:
            topN = topN_byrow(scores, n, self.njobs)
        return self._name_arms(topN)


class _BasePolicyWithExploit(_BasePolicy):
    """Adds an exploitation mode on top of _BasePolicy's scoring machinery."""

    def _exploit(self, X):
        # Pure-exploitation scores from the fitted oracles (no exploration noise).
        return self._oracles.exploit(X)

    def predict(self, X, exploit = False, output_score = False):
        """
        Selects actions according to this policy for new data.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to choose an action according to this policy.
        exploit : bool
            Whether to make a prediction according to the policy, or to just choose the
            arm with the highest expected reward according to current models.
        output_score : bool
            Whether to output the score that this method predicted, in case it is desired to use
            it with this package's offpolicy and evaluation modules.
            
        Returns
        -------
        pred : array (n_samples,) or dict("choice" : array(n_samples,), "score" : array(n_samples,))
            Actions chosen by the policy. If passing output_score=True, it will be a dictionary
            with the chosen arm and the score that the arm got following this policy with the classifiers used.
        """
        ### Unfit models fall back to uniformly random choices
        if not self.is_fitted:
            return self._predict_random_if_unfit(X, output_score)

        scores = self._exploit(X) if exploit else self.decision_function(X)
        chosen = self._name_arms(np.argmax(scores, axis = 1))

        if output_score:
            best_score = np.max(scores, axis=1).reshape((-1, 1))
            return {"choice" : chosen, "score" : best_score}
        return chosen

class BootstrappedUCB(_BasePolicyWithExploit):
    """
    Bootstrapped Upper Confidence Bound

    Obtains an upper confidence bound by taking the percentile of the predictions from a
    set of classifiers, all fit with different bootstrapped samples (multiple samples per arm).
    
    Note
    ----
    When fitting the algorithm to data in batches (online), it's not possible to take an
    exact bootstrapped sample, as the sample is not known in advance. In theory, as the sample size
    grows to infinity, the number of times that an observation appears in a bootstrapped sample is
    distributed ~ Poisson(1). However, assigning random gamma-distributed weights to observations
    produces a more stable effect, so it also has the option to assign weights randomly ~ Gamma(1,1).
    
    Parameters
    ----------
    base_algorithm : obj or list
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows summing to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    nsamples : int
        Number of bootstrapped samples per class to take.
    percentile : int [0,100]
        Percentile of the predictions sample to take
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((3/log2(nchoices), 4), 2)
        Note that it will only generate one random number per arm, so the 'a'
        parameter should be higher than for other methods.
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    batch_sample_method : str, either 'gamma' or 'poisson'
        How to simulate bootstrapped samples when training in batch mode (online).
        See Note.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs_arms : int or None
        Number of parallel jobs to run (for dividing work across arms). If passing None will set it to 1.
        If passing -1 will set it to the number of CPU cores. Note that if the base algorithm is itself
        parallelized, this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The total number of parallel jobs will be njobs_arms * njobs_samples. The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.
    njobs_samples : int or None
        Number of parallel jobs to run (for dividing work across samples within one arm). If passing None
        will set it to 1. If passing -1 will set it to the number of CPU cores. The total number of parallel
        jobs will be njobs_arms * njobs_samples.
        The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.

    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, base_algorithm, nchoices, nsamples=10, percentile=80,
                 beta_prior='auto', smoothing=None, noise_to_smooth=True, batch_train=False,
                 refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, batch_sample_method='gamma',
                 random_state=None, njobs_arms=-1, njobs_samples=1):
        assert (percentile > 0) and (percentile < 100)
        assert nsamples >= 2
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs_arms,
                                nchoices, batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state,
                                assign_algo = False, prior_def_ucb = True)
        self.percentile = percentile
        self._add_bootstrapped_inputs(base_algorithm, batch_sample_method, nsamples, njobs_samples, self.percentile)

    def reset_percentile(self, percentile=80):
        """
        Set the upper confidence bound percentile to a custom number

        Parameters
        ----------
        percentile : int [0,100]
            Percentile of the confidence interval to take.

        Returns
        -------
        self : obj
            This object
        """
        assert (percentile > 0) and (percentile < 100)
        if self.is_fitted:
            self._oracles.reset_attribute("percentile", percentile)
        ### Fix: when per-arm classifiers were passed, 'base_algorithm' is a
        ### list of bootstrapped wrappers - update each one instead of failing
        ### with an AttributeError on the list object
        if isinstance(self.base_algorithm, list):
            for alg in self.base_algorithm:
                alg.percentile = percentile
        else:
            self.base_algorithm.percentile = percentile
        ### Fix: keep this object's own attribute in sync too
        self.percentile = percentile
        return self

class BootstrappedTS(_BasePolicyWithExploit):
    """
    Bootstrapped Thompson Sampling
    
    Performs Thompson Sampling by fitting several models per class on bootstrapped samples,
    then makes predictions by taking one of them at random for each class.
    
    Note
    ----
    When fitting the algorithm to data in batches (online), it's not possible to take an
    exact bootstrapped sample, as the sample is not known in advance. In theory, as the sample size
    grows to infinity, the number of times that an observation appears in a bootstrapped sample is
    distributed ~ Poisson(1). However, assigning random gamma-distributed weights to observations
    produces a more stable effect, so it also has the option to assign weights randomly ~ Gamma(1,1).

    Note
    ----
    If you plan to make only one call to 'predict' between calls to 'fit' and have
    ``sample_unique=False``, you can pass ``nsamples=1`` without losing any precision.
    
    Parameters
    ----------
    base_algorithm : obj
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows summing to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    nsamples : int
        Number of bootstrapped samples per class to take.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    sample_unique : bool
        Whether to use a different bootstrapped classifier per row at each arm when
        calling 'predict'. If passing 'False', will take the same bootstrapped
        classifier within an arm for all the rows passed in a single call to 'predict'.
        Passing 'False' is a faster alternative, but the theoretically correct way
        is using a different one per row.
        Forced to 'True' when passing ``sample_weighted=True``.
    sample_weighted : bool
        Whether to take a weighted average from the predictions from each bootstrapped
        classifier at a given arm, with random weights. This will make the predictions
        more variable (i.e. more randomness in exploration). The alternative (and
        default) is to take a prediction from a single classifier each time.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    batch_sample_method : str, either 'gamma' or 'poisson'
        How to simulate bootstrapped samples when training in batch mode (online).
        See Note.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs_arms : int or None
        Number of parallel jobs to run (for dividing work across arms). If passing None will set it to 1.
        If passing -1 will set it to the number of CPU cores. Note that if the base algorithm is itself
        parallelized, this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The total number of parallel jobs will be njobs_arms * njobs_samples.
        The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.
    njobs_samples : int or None
        Number of parallel jobs to run (for dividing work across samples within one arm). If passing None
        will set it to 1. If passing -1 will set it to the number of CPU cores. The total number of parallel
        jobs will be njobs_arms * njobs_samples.
        The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.
    
    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    .. [2] Chapelle, Olivier, and Lihong Li. "An empirical evaluation of thompson sampling."
           Advances in neural information processing systems. 2011.
    """
    def __init__(self, base_algorithm, nchoices, nsamples=10, beta_prior='auto',
                 smoothing=None, noise_to_smooth=True,
                 sample_unique = True, sample_weighted = False,
                 batch_train=False, refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, batch_sample_method='gamma',
                 random_state=None, njobs_arms=-1, njobs_samples=1):
        # Weighted averaging needs one prediction per bootstrapped classifier
        # per row, which implies sampling a different classifier per row.
        if sample_weighted:
            sample_unique = True
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs_arms,
                                nchoices, batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state, assign_algo=False)
        # No percentile here (Thompson sampling, not UCB) - hence the 'None'.
        self._add_bootstrapped_inputs(base_algorithm, batch_sample_method,
                                      nsamples, njobs_samples, None,
                                      ts_byrow = sample_unique, ts_weighted = sample_weighted)

class LogisticUCB(_BasePolicyWithExploit):
    """
    Logistic Regression with Confidence Interval

    Logistic regression classifier which constructs an upper bound on the
    predicted probabilities through a confidence interval calculated from
    the variance-covariance matrix of the predictors.

    Note
    ----
    This strategy is implemented for comparison purposes only and it's not
    recommended to rely on it, particularly not for large datasets.

    Note
    ----
    This strategy does not support fitting the data in batches ('partial_fit'
    will not be available), nor does it support using any other classifier.
    See 'BootstrappedUCB' for a more generalizable version.

    Note
    ----
    This strategy requires each fitted classifier to store a square matrix with
    dimension equal to the number of features. Thus, memory consumption can grow
    very high with this method.

    Parameters
    ----------
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    percentile : int (0,100)
        Percentile of the confidence interval to take.
    fit_intercept : bool
        Whether to add an intercept term to the models.
    lambda_ : float
        Strength of the L2 regularization. Must be greater than zero.
    ucb_from_empty : bool
        Whether to make upper confidence bounds on arms with no observations according
        to the formula (ties are broken at random for
        them). Choosing this option leads to policies that usually start making random
        predictions until having sampled from all arms, and as such, it's not
        recommended when the number of arms is large relative to the number of rounds.
        Instead, it's recommended to use ``beta_prior``, which acts in the same way
        as for the other policies in this library.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((3/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Note that this method calculates upper bounds rather than expectations, so the 'a'
        parameter should be higher than for other methods.
        Recommended to use only one of ``beta_prior`` or ``smoothing``. Ignored when
        passing ``ucb_from_empty=True``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Be aware that the algorithm will use BLAS function calls,
        and if these have multi-threading enabled, it might result in a slow-down
        as both functions compete for available threads.

    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, nchoices, percentile=80, fit_intercept=True,
                 lambda_=1.0, ucb_from_empty=False,
                 beta_prior='auto', smoothing=None, noise_to_smooth=True,
                 assume_unique_reward=False,
                 random_state=None, njobs=-1):
        assert (percentile > 0) and (percentile < 100)
        assert lambda_ > 0.
        # The oracle takes the percentile as its 'alpha' confidence level.
        base = _LogisticUCB_n_TS_single(lambda_=float(lambda_),
                                        fit_intercept=fit_intercept,
                                        alpha=float(percentile),
                                        ts=False)
        # No batch training / refit buffer: this strategy refits from scratch.
        self._add_common_params(base, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                False, None, False, assume_unique_reward,
                                random_state, assign_algo=True, prior_def_ucb=True,
                                force_unfit_predict = ucb_from_empty)
        self.percentile = percentile

    def reset_percentile(self, percentile=80):
        """
        Set the upper confidence bound percentile to a custom number

        Parameters
        ----------
        percentile : int (0,100)
            Percentile of the confidence interval to take. Must lie strictly
            between 0 and 100.

        Returns
        -------
        self : obj
            This object
        """
        assert (percentile > 0) and (percentile < 100)
        if self.is_fitted:
            self._oracles.reset_attribute("alpha", percentile)
        self.base_algorithm.alpha = percentile
        # Keep the attribute stored at construction time in sync, so that
        # later introspection sees the new confidence level (previously it
        # was left stale after a reset).
        self.percentile = percentile
        return self

class LogisticTS(_BasePolicyWithExploit):
    """
    Logistic Regression with Thompson Sampling

    Logistic regression classifier which samples its coefficients using
    the variance-covariance matrix of the predictors, or which samples
    predicted values from a confidence interval as a faster alternative.

    Note
    ----
    This strategy is implemented for comparison purposes only and it's not
    recommended to rely on it, particularly not for large datasets.

    Note
    ----
    This strategy does not support fitting the data in batches ('partial_fit'
    will not be available), nor does it support using any other classifier.
    See 'BootstrappedTS' for a more generalizable version.

    Note
    ----
    This strategy requires each fitted model to store a square matrix with
    dimension equal to the number of features. Thus, memory consumption can grow
    very high with this method.

    Note
    ----
    Be aware that sampling coefficients is an operation that scales poorly with
    the number of columns/features/variables. For wide datasets, it might be
    slower than a bootstrapped approach, especially when using ``sample_unique=True``.

    Parameters
    ----------
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    sample_from : str, one of "coef", "ci"
        Whether to make predictions by sampling the model coefficients or by
        sampling the predicted value from a confidence interval around the best-fit
        coefficients.
    ci_from_empty : bool
        Whether to construct a confidence interval on arms with no observations
        according to a variance-covariance matrix given by the regularization
        parameter alone.
        Ignored when passing ``sample_from='coef'``.
    multiplier : float
        Multiplier for the covariance matrix. Pass 1 to take it as-is.
        Ignored when passing ``sample_from='ci'``.
    fit_intercept : bool
        Whether to add an intercept term to the models.
    lambda_ : float
        Strength of the L2 regularization. Must be greater than zero.
    sample_unique : bool
        Whether to sample different coefficients each time a prediction is to
        be made. If passing 'False', when calling 'predict', it will sample
        the same coefficients for all the observations in the same call to
        'predict', whereas if passing 'True', will use a different set of
        coefficients for each observations. Passing 'False' leads to an
        approach which is theoretically wrong, but as sampling coefficients
        can be very slow, using 'False' can provide a reasonable speed up
        without much of a performance penalty.
        Ignored when passing ``sample_from='ci'``.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Recommended to use only one of ``beta_prior``, ``smoothing``, ``ci_from_empty``.
        Ignored when passing ``ci_from_empty=True``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        Recommended to use only one of ``beta_prior``, ``smoothing``, ``ci_from_empty``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Be aware that the algorithm will use BLAS function calls,
        and if these have multi-threading enabled, it might result in a slow-down
        as both functions compete for available threads.

    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, nchoices, sample_from="ci", ci_from_empty=False, multiplier=1.0,
                 fit_intercept=True, lambda_=1.0, sample_unique=False,
                 beta_prior='auto', smoothing=None, noise_to_smooth=True,
                 assume_unique_reward=False, random_state=None, njobs=-1):
        warnings.warn("This class is experimental. Not recommended to rely on it.")
        assert sample_from in ["ci", "coef"]
        self.sample_from = sample_from
        assert lambda_ > 0.
        assert multiplier > 0.
        # Cast to float for consistency with 'LogisticUCB', which passes
        # 'lambda_' as float to the same underlying oracle class.
        base = _LogisticUCB_n_TS_single(lambda_=float(lambda_),
                                        fit_intercept=fit_intercept,
                                        alpha=0.,
                                        m=float(multiplier),
                                        ts=True,
                                        ts_from_ci = (sample_from == "ci"),
                                        sample_unique=sample_unique)
        # No batch training / refit buffer: this strategy refits from scratch.
        # Unfit arms can only predict when sampling from a CI built off the
        # regularization term alone, hence the combined condition below.
        self._add_common_params(base, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                False, None, False, assume_unique_reward,
                                random_state, assign_algo=True, prior_def_ucb=False,
                                force_unfit_predict=ci_from_empty and sample_from == "ci")

class SeparateClassifiers(_BasePolicy):
    """
    Separate Classifiers per arm
    
    Fits one classifier per arm using only the data on which that arm was chosen.
    Predicts as One-Vs-Rest, plus the usual metaheuristics from ``beta_prior``
    and ``smoothing``.
    
    Parameters
    ----------
    base_algorithm : obj
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows summing to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that if the base algorithm is itself parallelized,
        this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.

    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, base_algorithm, nchoices, beta_prior=None,
                 smoothing=None, noise_to_smooth=True,
                 batch_train=False, refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, random_state=None, njobs=-1):
        # Pure One-Vs-Rest: all behavior comes from the shared base-policy
        # set-up; no exploration-specific parameters are added here.
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state)
    
    def decision_function_std(self, X):
        """
        Get the predicted "probabilities" from each arm from the classifier that predicts it,
        standardized to sum up to 1 (note that these are no longer probabilities).
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            Data for which to obtain decision function scores for each arm.
        
        Returns
        -------
        scores : array (n_samples, n_choices)
            Scores following this policy for each arm.
        """
        X = _check_X_input(X)
        if not self.is_fitted:
            raise ValueError("Object has not been fit to data.")
        return self._oracles.predict_proba(X)
    
    def predict_proba_separate(self, X):
        """
        Get the predicted probabilities from each arm from the classifier that predicts it.
        
        Note
        ----
        Classifiers are all fit on different data, so the probabilities will not add up to 1.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            Data for which to obtain decision function scores for each arm.
        
        Returns
        -------
        scores : array (n_samples, n_choices)
            Scores following this policy for each arm.
        """
        X = _check_X_input(X)
        if not self.is_fitted:
            raise ValueError("Object has not been fit to data.")
        return self._oracles.predict_proba_raw(X)
    
    def predict(self, X, output_score = False):
        """
        Selects actions according to this policy for new data.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to choose an action according to this policy.
        output_score : bool
            Whether to output the score that this method predicted, in case it is desired to use
            it with this package's offpolicy and evaluation modules.
            
        Returns
        -------
        pred : array (n_samples,) or dict("choice" : array(n_samples,), "score" : array(n_samples,))
            Actions chosen by the policy. If passing output_score=True, it will be a dictionary
            with the chosen arm and the score that the arm got following this policy with the classifiers used.
        """
        # Before any fit, fall back to uniformly-random arm selection.
        if not self.is_fitted:
            return self._predict_random_if_unfit(X, output_score)

        scores = self.decision_function(X)
        # Map column indices back to user-supplied arm names (if any).
        pred = self._name_arms(np.argmax(scores, axis = 1))

        if not output_score:
            return pred
        else:
            score_max = np.max(scores, axis=1).reshape((-1, 1))
            return {"choice" : pred, "score" : score_max}

class EpsilonGreedy(_BasePolicy):
    """
    Epsilon Greedy
    
    Takes a random action with probability p, or the action with highest
    estimated reward with probability 1-p.
    
    Parameters
    ----------
    base_algorithm : obj
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows suming to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    explore_prob : float (0,1)
        Probability of taking a random action at each round.
    decay : float (0,1)
        After each prediction, the explore probability reduces to
        p = p*decay
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        The impact of ``beta_prior`` for ``EpsilonGreedy`` is not as high as for other
        policies in this module.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that if the base algorithm is itself parallelized,
        this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.
    
    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    .. [2] Yue, Yisong, et al. "The k-armed dueling bandits problem."
           Journal of Computer and System Sciences 78.5 (2012): 1538-1556.
    """
    def __init__(self, base_algorithm, nchoices, explore_prob=0.2, decay=0.9999,
                 beta_prior='auto', smoothing=None, noise_to_smooth=True,
                 batch_train=False, refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, random_state=None, njobs=-1):
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state)
        assert (explore_prob>0) and (explore_prob<1)
        if decay is not None:
            assert (decay>0) and (decay<1)
            if decay <= .99:
                warnings.warn("Warning: 'EpsilonGreedy' has a very high decay rate.")
        self.explore_prob = explore_prob
        self.decay = decay

    def reset_epsilon(self, explore_prob=0.2):
        """
        Set the exploration probability to a custom number

        Parameters
        ----------
        explore_prob : float between 0 and 1
            The exploration probability to set. Note that it will still
            apply the decay after resetting it.

        Returns
        -------
        self : obj
            This object
        """
        assert explore_prob >= 0.
        assert explore_prob <= 1.
        self.explore_prob = explore_prob
        return self
    
    def predict(self, X, exploit = False, output_score = False):
        """
        Selects actions according to this policy for new data.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to choose an action according to this policy.
        exploit : bool
            Whether to make a prediction according to the policy, or to just choose the
            arm with the highest expected reward according to current models.
        output_score : bool
            Whether to output the score that this method predicted, in case it is desired to use
            it with this package's offpolicy and evaluation modules.
            
        Returns
        -------
        pred : array (n_samples,) or dict("choice" : array(n_samples,), "score" : array(n_samples,))
            Actions chosen by the policy. If passing output_score=True, it will be a dictionary
            with the chosen arm and the score that the arm got following this policy with the classifiers used.
        """
        if not self.is_fitted:
            return self._predict_random_if_unfit(X, output_score)
        scores = self._oracles.decision_function(X)
        pred = np.argmax(scores, axis = 1)
        ## Rows flipped to a random arm; only computed when exploring.
        ix_change_rnd = None
        if not exploit:
            ix_change_rnd = (self.random_state.random(size = X.shape[0]) <= self.explore_prob)
            n_change_rnd = ix_change_rnd.sum()
            pred[ix_change_rnd] = self.random_state.integers(self.nchoices, size = n_change_rnd)
        pred = self._name_arms(pred)

        ## The exploration probability decays once per observation predicted.
        if self.decay is not None:
            self.explore_prob *= self.decay ** X.shape[0]
        
        if not output_score:
            return pred
        else:
            score_max = np.max(scores, axis = 1).reshape((-1, 1))
            ## Fix: 'ix_change_rnd' only exists when exploring; previously this
            ## raised NameError when calling predict(exploit=True, output_score=True).
            if ix_change_rnd is not None:
                ## Randomly-selected arms get the uniform-choice probability as score.
                score_max[ix_change_rnd] = 1. / self.nchoices
            return {"choice" : pred, "score" : score_max}

    def _score_matrix(self, X):
        ## Scores for off-policy evaluation: rows chosen for exploration get
        ## uniform random scores in place of the classifiers' scores.
        scores = self._oracles.decision_function(X)
        ix_change_rnd = (self.random_state.random(size = X.shape[0]) <= self.explore_prob)
        n_change_rnd = ix_change_rnd.sum()
        scores[ix_change_rnd] = self.random_state.random(size=(n_change_rnd, self.nchoices))

        if self.decay is not None:
            self.explore_prob *= self.decay ** X.shape[0]
        return scores


class _ActivePolicy(_BasePolicy):
    ## Mixin with the shared machinery for active-learning arm selection
    ## (used by 'AdaptiveGreedy', 'ActiveExplorer', 'ExploreFirst').

    def _check_active_inp(self, base_algorithm, f_grad_norm, case_one_class):
        ## Validates and stores the gradient-norm function(s) ('f_grad_norm')
        ## and the fallback for arms with only one class seen ('case_one_class').
        if f_grad_norm == 'auto':
            if not isinstance(base_algorithm, list):
                _check_autograd_supported(base_algorithm)
            else:
                for alg in base_algorithm:
                    _check_autograd_supported(alg)
            self._get_grad_norms = _get_logistic_grads_norms
        else:
            if not isinstance(f_grad_norm, list):
                assert callable(f_grad_norm)
            else:
                if len(f_grad_norm) != self.nchoices:
                    raise ValueError("'f_grad_norm' must have 'nchoices' entries.")
                for fun in f_grad_norm:
                    ## Fix: check each entry, not the list itself (a list is
                    ## never callable, so the old check rejected every list).
                    if not callable(fun):
                        raise ValueError("If passing a list for 'f_grad_norm', " +
                                         "entries must be functions")
            self._get_grad_norms = f_grad_norm

        if case_one_class == 'auto':
            self._force_fit = False
            self._rand_grad_norms = _gen_random_grad_norms
        elif case_one_class == 'zero':
            self._force_fit = False
            self._rand_grad_norms = _gen_zero_norms
        elif case_one_class is None:
            self._force_fit = True
            self._rand_grad_norms = None
        else:
            if not isinstance(case_one_class, list):
                assert callable(case_one_class)
            else:
                if len(case_one_class) != self.nchoices:
                    raise ValueError("'case_one_class' must have 'nchoices' entries.")
                for fun in case_one_class:
                    ## Fix: same per-entry check as for 'f_grad_norm' above.
                    if not callable(fun):
                        raise ValueError("If passing a list for 'case_one_class', " +
                                         "entries must be functions")
            self._force_fit = False
            self._rand_grad_norms = case_one_class
        self.case_one_class = case_one_class
        self._force_counters = True

    ### TODO: parallelize this in cython for the default case
    def _crit_active(self, X, pred, grad_crit):
        ## Overwrites 'pred' column-by-column with the active-learning criterion
        ## ('min'/'max'/'weighted') computed from per-arm gradient norms.
        change_f_grad = isinstance(self._get_grad_norms, list)
        change_r_grad = isinstance(self._rand_grad_norms, list)
        f_grad = self._get_grad_norms
        r_grad = self._rand_grad_norms
        for choice in range(self.nchoices):
            if change_f_grad:
                f_grad = self._get_grad_norms[choice]
            if change_r_grad:
                r_grad = self._rand_grad_norms[choice]

            if self._oracles.should_calculate_grad(choice) or self._force_fit:
                ## The 'auto' logistic gradients need fitted coefficients; fall
                ## back to the random-norms generator for unfitted models.
                if ( (self._get_grad_norms == _get_logistic_grads_norms)
                      and ("coef_" not in dir(self._oracles.algos[choice]))
                    ):
                    grad_norms = \
                        r_grad(X,
                               self._oracles.get_n_pos(choice),
                               self._oracles.get_n_neg(choice),
                               self._oracles.rng_arm[choice])
                else:
                    grad_norms = f_grad(self._oracles.algos[choice],
                                        X, pred[:, choice])
            else:
                grad_norms = r_grad(X,
                                    self._oracles.get_n_pos(choice),
                                    self._oracles.get_n_neg(choice),
                                    self._oracles.rng_arm[choice])

            if grad_crit == 'min':
                pred[:, choice] = grad_norms.min(axis = 1)
            elif grad_crit == 'max':
                pred[:, choice] = grad_norms.max(axis = 1)
            elif grad_crit == 'weighted':
                ## Row-wise dot product: weight the two per-class norms by the
                ## predicted probability of each class.
                pred[:, choice] = np.einsum("i,ij->i", pred[:, choice], grad_norms)
            else:
                raise ValueError("Something went wrong. Please open an issue in GitHub indicating what you were doing.")
        return pred

    def reset_active_choice(self, active_choice='weighted'):
        """
        Set the active gradient criteria to a custom form

        Parameters
        ----------
        active_choice : str in {'min', 'max', 'weighted'}
            How to calculate the gradient that an observation would have on the loss
            function for each classifier, given that it could be either class (positive or negative)
            for the classifier that predicts each arm. If weighted, they are weighted by the same
            probability estimates from the base algorithm.

        Returns
        -------
        self : obj
            This object
        """
        if self.active_choice is None: ### AdaptiveGreedy
            raise ValueError("Cannot change active choice for non-active policy.")
        assert active_choice in ['min', 'max', 'weighted']
        self.active_choice = active_choice
        return self


class AdaptiveGreedy(_ActivePolicy):
    """
    Adaptive Greedy
    
    Takes the action with highest estimated reward, unless that estimation falls below a certain
    threshold, in which case it takes an action either at random or according to an active learning
    heuristic (same way as `ActiveExplorer`).

    Note
    ----
    The hyperparameters here can make a large impact on the quality of the choices. Be sure
    to tune the threshold (or percentile), decay, and prior (or smoothing parameters).
    
    Note
    ----
    The threshold for the reward probabilities can be set to a hard-coded number, or
    to be calculated dynamically by keeping track of the predictions it makes, and taking
    a fixed percentile of that distribution to be the threshold.
    In the second case, these are calculated in separate batches rather than in a sliding window.
    
    Can also be set to make choices in the same way as
    'ActiveExplorer' rather than random (see 'greedy_choice' parameter).
    
    Parameters
    ----------
    base_algorithm : obj
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows suming to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    window_size : int
        Number of predictions after which the threshold will be updated to the desired percentile.
    percentile : int in [0,100] or None
        Percentile of the predictions sample to set as threshold, below which actions are random.
        If None, will not take percentiles, will instead use the initial threshold and apply decay to it.
    decay : float (0,1) or None
        After each prediction, either the threshold or the percentile gets adjusted to:
            val_t+1 = val_t*decay
    decay_type : str, either 'percentile' or 'threshold'
        Whether to decay the threshold itself or the percentile of the predictions to take after
        each prediction. Ignored when using 'decay=None'. If passing 'percentile=None' and 'decay_type=percentile',
        will be forced to 'threshold'.
    initial_thr : str 'auto' or float (0,1)
        Initial threshold for the prediction below which a random action is taken.
        If set to 'auto', will be calculated as initial_thr = 1 / (2 * sqrt(nchoices)).
        Note that if 'base_algorithm' has a 'decision_function' method, it will first apply a sigmoid function to the
        output, and then compare it to the threshold, so the threshold should lie between zero and one.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((3/nchoices, 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Note that the default value for ``AdaptiveGreedy`` is different than from the
        other methods in this module, and it's recommended to experiment with different
        values of this hyperparameter.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    active_choice : None or str in {'min', 'max', 'weighted'}
        How to select arms when predictions are below the threshold. If passing None, selects them at random (default).
        If passing 'min', 'max' or 'weighted', selects them in the same way as 'ActiveExplorer'.
        Non-random active selection requires being able to calculate gradients (gradients for logistic regression and linear regression (from this package)
        are already defined with an option 'auto' below).
    f_grad_norm : str 'auto', list, or function(base_algorithm, X, pred) -> array (n_samples, 2)
        Function that calculates the row-wise norm of the gradient from observations in X if their class were
        negative (first column) or positive (second column).
        Can also use different functions for each arm, in which case it
        accepts them as a list of functions with length equal to ``nchoices``.
        The option 'auto' will only work with scikit-learn's 'LogisticRegression', 'SGDClassifier', and 'RidgeClassifier';
        with stochQN's 'StochasticLogisticRegression';
        and with this package's 'LinearRegression'.
    case_one_class : str 'auto', 'zero', None, list, or function(X, n_pos, n_neg, rng) -> array(n_samples, 2)
        If some arm/choice/class has only rewards of one type, many models will fail to fit, and consequently the gradients
        will be undefined. Likewise, if the model has not been fit, the gradient might also be undefined, and this requires a workaround.
            * If passing 'None', will assume that ``base_algorithm`` can be fit to
              data of only-positive or only-negative class without problems, and that
              it can calculate gradients and predictions with a ``base_algorithm``
              object that has not been fitted. Be aware that the methods 'predict',
              'predict_proba', and 'decision_function' in ``base_algorithm`` might be
              overwritten with another method that wraps it in a try-catch block, so
              don't rely on it producing errors when unfitted.
            * If passing a function, will take the output of it as the row-wise
              gradient norms when it compares them against other arms/classes, with
              the first column having the values if the observations were of negative
              class, and the second column if they were positive class. The other
              inputs to this function are the number of positive and negative examples
              that have been observed, and a ``Generator`` object from NumPy to use
              for generating random numbers.
            * If passing a list, will assume each entry is a function as described
              above, to be used with each corresponding arm.
            * If passing 'auto', will generate random numbers:

                * negative: ~ Gamma(log10(n_features) / (n_pos+1)/(n_pos+n_neg+2), log10(n_features)).

                * positive: ~ Gamma(log10(n_features) * (n_pos+1)/(n_pos+n_neg+2), log10(n_features)).

            * If passing 'zero', it will output zero whenever models have not been fitted.
        Note that the theoretically correct approach for a logistic regression would
        be to assume models with all-zero coefficients, in which case the gradient
        is defined in the absence of any data, but this tends to produce bad end
        results.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that if the base algorithm is itself parallelized,
        this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.
    
    References
    ----------
    .. [1] Chakrabarti, Deepayan, et al. "Mortal multi-armed bandits."
           Advances in neural information processing systems. 2009.
    .. [2] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, base_algorithm, nchoices, window_size=500, percentile=30,
                 decay=0.9998, decay_type='percentile', initial_thr='auto',
                 beta_prior='auto', smoothing=None, noise_to_smooth=True,
                 batch_train=False, refit_buffer=None,  deep_copy_buffer=True,
                 assume_unique_reward=False, active_choice=None, f_grad_norm='auto',
                 case_one_class='auto', random_state=None, njobs=-1):
        if beta_prior == "auto":
            ## Fix: 'nchoices' may be a list of arm names (as documented), in
            ## which case the number of arms is its length - previously this
            ## raised TypeError on division by a list.
            n_arms = len(nchoices) if isinstance(nchoices, list) else nchoices
            beta_prior = ((3. / n_arms, 4.), 2)
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state)
        
        assert isinstance(window_size, int)
        if percentile is not None:
            assert isinstance(percentile, int)
            assert (percentile > 0) and (percentile < 100)
        if initial_thr == 'auto':
            if not isinstance(nchoices, list):
                initial_thr = 1.0 / (np.sqrt(nchoices) * 2.0)
            else:
                initial_thr = 1.0 / (np.sqrt(len(nchoices)) * 2.0)
        assert isinstance(initial_thr, float)
        assert window_size > 0
        self.window_size = window_size
        self.percentile = percentile
        self.thr = initial_thr
        self.window_cnt = 0
        self.window = np.array([])
        assert (decay_type == 'threshold') or (decay_type == 'percentile')
        ## Percentile decay is impossible without a percentile to decay.
        if (decay_type == 'percentile') and (percentile is None):
            decay_type = 'threshold'
        self.decay_type = decay_type
        if decay is not None:
            assert (decay >= 0.0) and (decay <= 1.0)
        self.decay = decay

        if active_choice is not None:
            assert active_choice in ['min', 'max', 'weighted']
            self._check_active_inp(base_algorithm, f_grad_norm, case_one_class)
        self.active_choice = active_choice

    def reset_threshold(self, threshold="auto"):
        """
        Set the adaptive threshold to a custom number

        Parameters
        ----------
        threshold : float or "auto"
            New threshold to use. If passing "auto", will set it
            to 1.5/nchoices. Note that this threshold will still be
            decayed if the object was initialized with ``decay_type="threshold"``,
            and will still be updated if initialized with ``percentile != None``.

        Returns
        -------
        self : obj
            This object
        """
        if isinstance(threshold, int):
            threshold = float(threshold)
        elif threshold == "auto":
            threshold = 1.5 / self.nchoices
        assert isinstance(threshold, float)
        self.thr = threshold
        return self

    def reset_percentile(self, percentile=30):
        """
        Set the moving percentile to a custom number

        Parameters
        ----------
        percentile : int between 0 and 100
            The new percentile to set. Note that it will still apply
            decay to it after being set through this method.

        Returns
        -------
        self : obj
            This object
        """
        if self.decay_type == 'threshold':
            raise ValueError("Method is not available when not using percentile decay.")
        assert percentile >= 0
        assert percentile <= 100
        self.percentile = percentile
        return self

    def predict(self, X, exploit = False):
        """
        Selects actions according to this policy for new data.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to choose an action according to this policy.
        exploit : bool
            Whether to make a prediction according to the policy, or to just choose the
            arm with the highest expected reward according to current models.
            
        Returns
        -------
        pred : array (n_samples,)
            Actions chosen by the policy.
        """
        # TODO: add option to output scores
        X = _check_X_input(X)
        if not self.is_fitted:
            return self._predict_random_if_unfit(X, False)
        return self._name_arms(self._predict(X, exploit, True))
    
    def _predict(self, X, exploit = False, choose = True):
        ## Core prediction routine. With 'choose=True' returns arm indices,
        ## otherwise the full (n_samples, nchoices) score matrix.
        if X.shape[0] == 0:
            if choose:
                return np.array([])
            else:
                return np.empty((0, self.nchoices), dtype=ctypes.c_double)
        
        if exploit:
            if choose:
                return self._oracles.predict(X)
            else:
                return self._oracles.decision_function(X)
        
        # fixed threshold, anything below is always random
        if (self.decay == 1) or (self.decay is None):
            pred, pred_max = self._calc_preds(X, choose)

        # variable threshold that needs to be updated
        else:
            remainder_window = self.window_size - self.window_cnt
            
            # case 1: number of predictions to make would still fit within current window
            if remainder_window > X.shape[0]:
                pred, pred_max = self._calc_preds(X, choose)
                self.window_cnt += X.shape[0]
                self.window = np.r_[self.window, pred_max]
                
                # apply decay for all observations
                self._apply_decay(X.shape[0])

            # case 2: number of predictions to make would span more than current window
            else:
                # predict for the remainder of this window
                pred, pred_max = self._calc_preds(X[:remainder_window, :], choose)
                
                # allocate the rest that don't fit in this window
                if choose:
                    pred_all = np.zeros(X.shape[0])
                else:
                    pred_all = np.zeros((X.shape[0], self.nchoices), dtype=ctypes.c_double)
                pred_all[:remainder_window] = pred
                
                # complete window, update percentile if needed
                self.window = np.r_[self.window, pred_max]
                if self.decay_type == 'percentile':
                    self.thr = np.percentile(self.window, self.percentile)

                # reset window
                self.window = np.array([])
                self.window_cnt = 0
                
                # decay threshold only for these observations
                self._apply_decay(remainder_window)
                
                # predict the rest recursively
                pred_all[remainder_window:] = self._predict(X[remainder_window:, :], False, choose)
                return pred_all
                
        return pred

    def _apply_decay(self, nobs):
        ## Decays either the threshold or the percentile, once per observation.
        if (self.decay is not None) and (self.decay != 1):
            if self.decay_type == 'threshold':
                self.thr *= self.decay ** nobs
            elif self.decay_type == 'percentile':
                self.percentile *= self.decay ** nobs
            else:
                raise ValueError("'decay_type' must be one of 'threshold' or 'percentile'")

    def _calc_preds(self, X, choose = True):
        ## Greedy choice with below-threshold rows re-chosen at random or by
        ## the active-learning criterion.
        pred_proba = self._oracles.decision_function(X)
        pred_max = pred_proba.max(axis = 1)
        if choose:
            pred = np.argmax(pred_proba, axis = 1)
        else:
            pred = pred_proba
        set_greedy = pred_max <= self.thr
        if np.any(set_greedy):
            self._choose_greedy(set_greedy, X, pred, pred_proba, choose)
        return pred, pred_max

    def _choose_greedy(self, set_greedy, X, pred, pred_all, choose = True):
        ## Overwrites the entries of 'pred' flagged by 'set_greedy', either
        ## with random arms/scores or with the active-learning criterion.
        if self.active_choice is None:
            n_greedy = set_greedy.sum()
            if choose:
                pred[set_greedy] = self.random_state.integers(self.nchoices, size = n_greedy)
            else:
                pred[set_greedy] = self.random_state.random(size = (n_greedy, self.nchoices))
        else:
            scores = self._crit_active(
                        X[set_greedy],
                        pred_all[set_greedy],
                        self.active_choice)
            if choose:
                pred[set_greedy] = np.argmax(scores, axis = 1)
            else:
                pred[set_greedy] = scores

    def _score_matrix(self, X):
        return self._predict(X, False, False)

class ExploreFirst(_ActivePolicy):
    """
    Explore First, a.k.a. Explore-Then-Exploit
    
    Selects random actions for the first N predictions, after which it selects the
    best arm only, according to its estimates.
    
    Parameters
    ----------
    base_algorithm : obj
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows summing to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    explore_rounds : int
        Number of rounds to wait before exploitation mode.
        Will switch after making N predictions.
    prob_active_choice : float (0, 1)
        Probability of choosing explore-mode actions according to active
        learning criteria. Pass zero for choosing everything at random.
    active_choice : str, one of 'weighted', 'max' or 'min'
        How to calculate the gradient that an observation would have on the loss
        function for each classifier, given that it could be either class (positive or negative)
        for the classifier that predicts each arm. If weighted, they are weighted by the same
        probability estimates from the base algorithm.
    f_grad_norm : str 'auto' or function(base_algorithm, X, pred) -> array (n_samples, 2)
        Function that calculates the row-wise norm of the gradient from observations in X if their class were
        negative (first column) or positive (second column).
        Can also use different functions for each arm, in which case it
        accepts them as a list of functions with length equal to ``nchoices``.
        The option 'auto' will only work with scikit-learn's 'LogisticRegression', 'SGDClassifier' (log-loss only), and 'RidgeClassifier';
        with stochQN's 'StochasticLogisticRegression';
        and with this package's 'LinearRegression'.
        Ignored when passing ``prob_active_choice=0.``
    case_one_class : str 'auto', 'zero', None, or function(X, n_pos, n_neg, rng) -> array(n_samples, 2)
        If some arm/choice/class has only rewards of one type, many models will fail to fit, and consequently the gradients
        will be undefined. Likewise, if the model has not been fit, the gradient might also be undefined, and this requires a workaround.
            * If passing 'None', will assume that ``base_algorithm`` can be fit to
              data of only-positive or only-negative class without problems, and that
              it can calculate gradients and predictions with a ``base_algorithm``
              object that has not been fitted. Be aware that the methods 'predict',
              'predict_proba', and 'decision_function' in ``base_algorithm`` might be
              overwritten with another method that wraps it in a try-catch block, so
              don't rely on it producing errors when unfitted.
            * If passing a function, will take the output of it as the row-wise
              gradient norms when it compares them against other arms/classes, with
              the first column having the values if the observations were of negative
              class, and the second column if they were positive class. The other
              inputs to this function are the number of positive and negative examples
              that have been observed, and a ``Generator`` object from NumPy to use
              for generating random numbers.
            * If passing a list, will assume each entry is a function as described
              above, to be used with each corresponding arm.
            * If passing 'auto', will generate random numbers:

                * negative: ~ Gamma(log10(n_features) / (n_pos+1)/(n_pos+n_neg+2), log10(n_features)).

                * positive: ~ Gamma(log10(n_features) * (n_pos+1)/(n_pos+n_neg+2), log10(n_features)).

            * If passing 'zero', it will output zero whenever models have not been fitted.
        Note that the theoretically correct approach for a logistic regression would
        be to assume models with all-zero coefficients, in which case the gradient
        is defined in the absence of any data, but this tends to produce bad end
        results.
        Ignored when passing ``prob_active_choice=0.``
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that if the base algorithm is itself parallelized,
        this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.

    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, base_algorithm, nchoices, explore_rounds=2500,
                 prob_active_choice=0., active_choice='weighted',
                 f_grad_norm='auto', case_one_class='auto',
                 beta_prior=None, smoothing=None, noise_to_smooth=True,
                 batch_train=False, refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, random_state=None, njobs=-1):
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state)
        
        # check the type before comparing, so that a non-numeric input
        # fails on the isinstance check rather than on the comparison
        assert isinstance(explore_rounds, int)
        assert explore_rounds > 0
        self.explore_rounds = explore_rounds
        self.explore_cnt = 0

        assert (prob_active_choice >= 0.) and (prob_active_choice <= 1.)
        self.prob_active_choice = float(prob_active_choice)
        if self.prob_active_choice > 0:
            assert active_choice in ['min', 'max', 'weighted']
            self.active_choice = active_choice
            self._check_active_inp(base_algorithm, f_grad_norm, case_one_class)
        else:
            self.active_choice = None

    def reset_count(self):
        """
        Resets the counter for exploitation mode

        Returns
        -------
        self

        """
        self.explore_cnt = 0
        return self

    def predict(self, X, exploit = False):
        """
        Selects actions according to this policy for new data.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to choose an action according to this policy.
        exploit : bool
            Whether to make a prediction according to the policy, or to just choose the
            arm with the highest expected reward according to current models.
            
        Returns
        -------
        pred : array (n_samples,)
            Actions chosen by the policy.
        """
        # TODO: add option to output scores
        if not self.is_fitted:
            return self._predict_random_if_unfit(X, False)
        return self._name_arms(self._predict(X, exploit))
    
    def _predict(self, X, exploit = False):
        """Choose an arm per row: random while within the exploration
        allowance, greedy (oracle argmax) afterwards."""
        X = _check_X_input(X)
        
        if X.shape[0] == 0:
            return np.array([])
        
        if exploit:
            return self._oracles.predict(X)
        
        if self.explore_cnt < self.explore_rounds:
            self.explore_cnt += X.shape[0]
            
            # case 1: all predictions are within allowance
            if self.explore_cnt <= self.explore_rounds:
                pred = self.random_state.integers(self.nchoices, size = X.shape[0])
                self._choose_active(X, pred)
                return pred
            
            # case 2: some predictions are within allowance, others are not
            else:
                n_explore = self.explore_rounds - self.explore_cnt + X.shape[0]
                # fixed: 'np.empty' takes 'dtype', not 'type'
                pred = np.empty(X.shape[0], dtype = ctypes.c_double)
                # fixed: 'n_explore' is the number of draws ('size'), not the upper bound
                pred[:n_explore] = self.random_state.integers(self.nchoices, size = n_explore)
                self._choose_active(X[:n_explore], pred[:n_explore])
                pred[n_explore:] = self._oracles.predict(X[n_explore:])
                return pred
        else:
            return self._oracles.predict(X)

    def _score_matrix(self, X):
        """Return an (n_samples, nchoices) score matrix: random scores while
        within the exploration allowance, oracle scores afterwards."""
        if self.explore_cnt < self.explore_rounds:
            self.explore_cnt += X.shape[0]

            # case 1: all predictions are within allowance
            if self.explore_cnt <= self.explore_rounds:
                scores = self.random_state.random(size=(X.shape[0], self.nchoices))
                self._choose_active(X, scores, choose=False)
            
            # case 2: some predictions are within allowance, others are not
            else:
                # fixed: 'n_explore' was previously undefined here (NameError)
                n_explore = self.explore_rounds - self.explore_cnt + X.shape[0]
                # fixed: 'np.empty' takes 'dtype', not 'type'
                scores = np.empty((X.shape[0], self.nchoices), dtype = ctypes.c_double)
                scores[:n_explore] = self.random_state.random(size=(n_explore, self.nchoices))
                self._choose_active(X[:n_explore], scores[:n_explore], choose=False)
                scores[n_explore:] = self._oracles.decision_function(X[n_explore:])
            
        else:
            scores = self._oracles.decision_function(X)

        return scores

    def _choose_active(self, X, pred, choose=True):
        """With probability 'prob_active_choice' per row, overwrite the random
        exploration pick with the active-learning criterion's choice."""
        if self.prob_active_choice <= 0.:
            return None

        pick_active = self.random_state.random(size=X.shape[0]) <= self.prob_active_choice
        if not np.any(pick_active):
            return None
        by_crit = self._crit_active(
                        X[pick_active],
                        self._oracles.decision_function(X[pick_active]),
                        self.active_choice)
        if choose:
            pred[pick_active] = np.argmax(by_crit, axis = 1)
        else:
            pred[pick_active] = by_crit


class ActiveExplorer(_ActivePolicy, _BasePolicyWithExploit):
    """
    Active Explorer
    
    Selects a proportion of actions according to an active learning heuristic based on gradient.
    Works only for differentiable and preferably smooth functions.
    
    Note
    ----
    Here, for the predictions that are made according to an active learning heuristic
    (these are selected at random, just like in Epsilon-Greedy), the guiding heuristic
    is the gradient that the observation, having either label (either weighted by the estimated
    probability, or taking the maximum or minimum), would produce on each model that
    predicts a class, given the current coefficients for that model. This of course requires
    being able to calculate gradients - package comes with pre-defined gradient functions for
    linear and logistic regression, and allows passing custom functions for others.
    
    Parameters
    ----------
    base_algorithm : obj
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows summing to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    f_grad_norm : str 'auto' or function(base_algorithm, X, pred) -> array (n_samples, 2)
        Function that calculates the row-wise norm of the gradient from observations in X if their class were
        negative (first column) or positive (second column).
        Can also use different functions for each arm, in which case it
        accepts them as a list of functions with length equal to ``nchoices``.
        The option 'auto' will only work with scikit-learn's 'LogisticRegression', 'SGDClassifier' (log-loss only), and 'RidgeClassifier';
        with stochQN's 'StochasticLogisticRegression';
        and with this package's 'LinearRegression'.
    case_one_class : str 'auto', 'zero', None, or function(X, n_pos, n_neg, rng) -> array(n_samples, 2)
        If some arm/choice/class has only rewards of one type, many models will fail to fit, and consequently the gradients
        will be undefined. Likewise, if the model has not been fit, the gradient might also be undefined, and this requires a workaround.
            * If passing 'None', will assume that ``base_algorithm`` can be fit to
              data of only-positive or only-negative class without problems, and that
              it can calculate gradients and predictions with a ``base_algorithm``
              object that has not been fitted. Be aware that the methods 'predict',
              'predict_proba', and 'decision_function' in ``base_algorithm`` might be
              overwritten with another method that wraps it in a try-catch block, so
              don't rely on it producing errors when unfitted.
            * If passing a function, will take the output of it as the row-wise
              gradient norms when it compares them against other arms/classes, with
              the first column having the values if the observations were of negative
              class, and the second column if they were positive class. The other
              inputs to this function are the number of positive and negative examples
              that have been observed, and a ``Generator`` object from NumPy to use
              for generating random numbers.
            * If passing a list, will assume each entry is a function as described
              above, to be used with each corresponding arm.
            * If passing 'auto', will generate random numbers:

                * negative: ~ Gamma(log10(n_features) / (n_pos+1)/(n_pos+n_neg+2), log10(n_features)).

                * positive: ~ Gamma(log10(n_features) * (n_pos+1)/(n_pos+n_neg+2), log10(n_features)).

            * If passing 'zero', it will output zero whenever models have not been fitted.
        Note that the theoretically correct approach for a logistic regression would
        be to assume models with all-zero coefficients, in which case the gradient
        is defined in the absence of any data, but this tends to produce bad end
        results.
    active_choice : str in {'min', 'max', 'weighted'}
        How to calculate the gradient that an observation would have on the loss
        function for each classifier, given that it could be either class (positive or negative)
        for the classifier that predicts each arm. If weighted, they are weighted by the same
        probability estimates from the base algorithm.
    explore_prob : float (0,1)
        Probability of selecting an action according to active learning criteria.
    decay : float (0,1)
        After each prediction, the probability of selecting an arm according to active
        learning criteria is set to p = p*decay
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that if the base algorithm is itself parallelized,
        this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.

    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, base_algorithm, nchoices,
                 f_grad_norm='auto', case_one_class='auto', active_choice='weighted',
                 explore_prob=.15, decay=0.9997,
                 beta_prior='auto', smoothing=None, noise_to_smooth=True,
                 batch_train=False, refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, random_state=None, njobs=-1):
        assert active_choice in ['min', 'max', 'weighted']
        self.active_choice = active_choice
        self._check_active_inp(base_algorithm, f_grad_norm, case_one_class)
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state)
        # coerce instead of requiring 'float' exactly, so that e.g. an
        # integer input such as 'explore_prob=1' is also accepted
        explore_prob = float(explore_prob)
        assert (explore_prob > 0.) and (explore_prob <= 1.)
        self.explore_prob = explore_prob
        self.decay = decay if decay is None else float(decay)

    def reset_explore_prob(self, explore_prob=0.2):
        """
        Set the active exploration probability to a custom number

        Parameters
        ----------
        explore_prob : float between 0 and 1
            The new exploration probability. Note that it will still apply
            decay on it after being reset.

        Returns
        -------
        self : obj
            This object
        """
        explore_prob = float(explore_prob)
        assert explore_prob >= 0.
        assert explore_prob <= 1.
        self.explore_prob = explore_prob
        return self

    def _score_matrix(self, X, exploit=False):
        """Oracle scores for 'X', with a random subset of rows replaced by the
        active-learning criterion; decays 'explore_prob' as a side effect."""
        pred = self._oracles.decision_function(X)
        if not exploit:
            change_greedy = self.random_state.random(size=X.shape[0]) <= self.explore_prob
            if np.any(change_greedy):
                pred[change_greedy, :] = self._crit_active(
                                            X[change_greedy, :],
                                            pred[change_greedy, :],
                                            self.active_choice)
            
            # decay once per observation scored
            if self.decay is not None:
                self.explore_prob *= self.decay ** X.shape[0]
        return pred

    def _exploit(self, X):
        """Pure exploitation: raw oracle scores, no exploration or decay."""
        return self._oracles.decision_function(X)

class SoftmaxExplorer(_BasePolicy):
    """
    SoftMax Explorer
    
    Selects an action according to probabilities determined by a softmax transformation
    on the scores from the decision function that predicts each class.

    Note
    ----
    Will apply an inverse sigmoid transformation to the probabilities that come from the base algorithm
    before applying the softmax function.
    
    
    Parameters
    ----------
    base_algorithm : obj
        Base binary classifier for which each sample for each class will be fit.
        Will look for, in this order:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows summing to 1, to which it
               will apply an inverse sigmoid function.
            2) A 'decision_function' method with unbounded outputs (n_samples,).
            3) A 'predict' method outputting (n_samples,), values in [0,1], to which it will apply an inverse sigmoid function.
        Can also pass a list with a different (or already-fit) classifier for each arm.
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    multiplier : float or None
        Number by which to multiply the outputs from the base algorithm before applying the softmax function
        (i.e. will take softmax(yhat * multiplier)).
    inflation_rate : float or None
        Number by which to multiply the multiplier rate after every prediction, i.e. after making
        't' predictions, the multiplier will be 'multiplier_t = multiplier * inflation_rate^t'.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm will be fit to the data in batches as it comes (streaming),
        or to the whole dataset each time it is refit. Requires a classifier with a
        'partial_fit' method.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If passing it, up until the moment there are at least this
        number of observations for a given arm, that arm will keep the observations
        when calling 'fit' and 'partial_fit', and will translate calls to
        'partial_fit' to calls to 'fit' with the new plus stored observations.
        After the reserve number is reached, calls to 'partial_fit' will enlarge
        the data batch with the stored observations, and old stored observations
        will be gradually replaced with the new ones (at random, not on a FIFO
        basis). This technique can greatly enhance the performance when fitting
        the data in batches, but memory consumption can grow quite large.
        If passing sparse CSR matrices as input to 'fit' and 'partial_fit',
        these will be converted to dense once they go into this reserve, and
        then converted back to CSR to augment the new data.
        Calls to 'fit' will override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to make deep copies of the data that is stored in the
        reserve for ``refit_buffer``. If passing 'False', when the reserve is
        not yet full, these will only store shallow copies of the data, which
        is faster but will not let Python's garbage collector free memory
        after deleting the data, and if the original data is overwritten, so will
        this buffer.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that if the base algorithm is itself parallelized,
        this might result in a slowdown as both compete for available threads, so don't set
        parallelization in both. The parallelization uses shared memory, thus you will only
        see a speed up if your base classifier releases the Python GIL, and will
        otherwise result in slower runs.

    References
    ----------
    .. [1] Cortes, David. "Adapting multi-armed bandits policies to contextual bandits scenarios."
           arXiv preprint arXiv:1811.04383 (2018).
    """
    def __init__(self, base_algorithm, nchoices, multiplier=1.0, inflation_rate=1.0004,
                 beta_prior='auto', smoothing=None, noise_to_smooth=True,
                 batch_train=False, refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, random_state=None, njobs=-1):
        self._add_common_params(base_algorithm, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                batch_train, refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state)

        # Both hyperparameters may be None (feature disabled); when given,
        # integers are coerced to float and the value must be positive.
        if multiplier is not None:
            if isinstance(multiplier, int):
                multiplier = float(multiplier)
            assert multiplier > 0
        if inflation_rate is not None:
            if isinstance(inflation_rate, int):
                inflation_rate = float(inflation_rate)
            assert inflation_rate > 0
        self.multiplier = multiplier
        self.inflation_rate = inflation_rate

    def reset_multiplier(self, multiplier=1.0):
        """
        Set the multiplier to a custom number

        Parameters
        ----------
        multiplier : float
            New multiplier for the numbers going to the softmax function.
            Note that it will still apply the inflation rate after this
            parameter is being reset.

        Returns
        -------
        self : obj
            This object
        """
        # Coerce ints to float for consistency with '__init__'.
        # NOTE(review): unlike '__init__' (which requires multiplier > 0),
        # this method accepts negative values - kept as-is for compatibility.
        if isinstance(multiplier, int):
            multiplier = float(multiplier)
        assert multiplier != 0
        self.multiplier = multiplier
        return self
    
    def decision_function(self, X, output_score=False, apply_sigmoid_score=True):
        """
        Get the scores for each arm following this policy's action-choosing criteria.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            Data for which to obtain decision function scores for each arm.
        output_score : bool
            Ignored. Kept for signature compatibility with other policies.
        apply_sigmoid_score : bool
            Ignored. Kept for signature compatibility with other policies.
        
        Returns
        -------
        scores : array (n_samples, n_choices)
            Scores following this policy for each arm.
        """
        X = _check_X_input(X)
        if not self.is_fitted:
            raise ValueError("Object has not been fit to data.")
        return self._oracles.predict_proba(X)
    
    def predict(self, X, exploit=False, output_score=False):
        """
        Selects actions according to this policy for new data.
        
        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to choose an action according to this policy.
        exploit : bool
            Whether to make a prediction according to the policy, or to just choose the
            arm with the highest expected reward according to current models.
        output_score : bool
            Whether to output the score that this method predicted, in case it is desired to use
            it with this package's offpolicy and evaluation modules.
            
        Returns
        -------
        pred : array (n_samples,) or dict("choice" : array(n_samples,), "score" : array(n_samples,))
            Actions chosen by the policy. If passing output_score=True, it will be a dictionary
            with the chosen arm and the score that the arm got following this policy with the classifiers used.
        """
        if not self.is_fitted:
            return self._predict_random_if_unfit(X, output_score)
        if exploit:
            X = _check_X_input(X)
            # NOTE(review): the exploit path returns raw argmax indices
            # (arms are not renamed) - preserved from the original; confirm intended.
            return np.argmax(self._oracles.decision_function(X), axis=1)
        pred = self._softmax_scores(X)
        chosen = _choice_over_rows(pred, self.random_state, self.njobs)

        # Grab the score of each sampled arm before indices are mapped to arm names
        if output_score:
            score_chosen = pred[np.arange(pred.shape[0]), chosen]
        chosen = self._name_arms(chosen)

        if not output_score:
            return chosen
        else:
            return {"choice" : chosen, "score" : score_chosen}

    def _softmax_scores(self, X):
        # Pipeline: probabilities -> logits (inverse sigmoid) -> optional
        # scaling -> softmax. The helper calls modify 'pred' in place.
        pred = self.decision_function(X)
        _apply_inverse_sigmoid(pred)
        if self.multiplier is not None:
            pred *= self.multiplier
            # The multiplier inflates once per observation scored, so a batch
            # of k rows advances it by inflation_rate^k.
            if self.inflation_rate is not None:
                self.multiplier *= self.inflation_rate ** pred.shape[0]
        _apply_softmax(pred)
        return pred

    def topN(self, X, n):
        """
        Get top-N ranked actions for each observation

        Note
        ----
        This method will rank choices/arms according to what the policy
        dictates - it is not an exploitation-mode rank, so if e.g. there are
        random choices for some observations, there will be random ranks in here.

        Parameters
        ----------
        X : array (n_samples, n_features)
            New observations for which to rank actions according to this policy.
        n : int
            Number of top-ranked actions to output

        Returns
        -------
        topN : array(n_samples, n)
            The top-ranked actions for each observation
        """
        assert n >= 1
        if isinstance(n, float):
            n = int(n)
        assert isinstance(n, int)
        if n > self.nchoices:
            raise ValueError("'n' cannot be greater than 'nchoices'.")
        X = _check_X_input(X)

        scores = self._softmax_scores(X)
        topN = topN_byrow_softmax(scores, n, self.njobs, self.random_state)
        return self._name_arms(topN)


class LinUCB(_BasePolicyWithExploit):
    """
    LinUCB

    Note
    ----
    This strategy requires each fitted model to store a square matrix with
    dimension equal to the number of features. Thus, memory consumption can grow
    very high with this method.

    Note
    ----
    The 'X' data (covariates) should ideally be centered before passing them
    to 'fit', 'partial_fit', 'predict'.

    Note
    ----
    The default hyperparameters here are meant to match the original reference, but
    it's recommended to change them. Particularly: use ``beta_prior`` instead of
    ``ucb_from_empty``, decrease ``alpha``, and maybe increase ``lambda_``.
    
    Parameters
    ----------
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    alpha : float
        Parameter to control the upper confidence bound (more is higher).
    lambda_ : float > 0
        Regularization parameter. References assumed this would always be equal to 1, but this
        implementation allows to change it.
    fit_intercept : bool
        Whether to add an intercept term to the coefficients.
    use_float : bool
        Whether to use C 'float' type for the required matrices. If passing 'False',
        will use C 'double'. Be aware that memory usage for this model can grow
        very large.
    method : str, one of 'chol' or 'sm'
        Method used to fit the model. Options are:

        ``'chol'``:
            Uses the Cholesky decomposition to solve the linear system from the
            least-squares closed-form each time 'fit' or 'partial_fit' is called.
            This is likely to be faster when fitting the model to a large number
            of observations at once, and is able to better exploit multi-threading.
        ``'sm'``:
            Starts with an inverse diagonal matrix and updates it as each
            new observation comes using the Sherman-Morrison formula, thus
            never explicitly solving the linear system, nor needing to calculate
            a matrix inverse. This is likely to be faster when fitting the model
            to small batches of observations. Be aware that with this method, it
            will add regularization to the intercept if passing 'fit_intercept=True'.
    ucb_from_empty : bool
        Whether to make upper confidence bounds on arms with no observations according
        to the formula, as suggested in the references (ties are broken at random for
        them). Choosing this option leads to policies that usually start making random
        predictions until having sampled from all arms, and as such, it's not
        recommended when the number of arms is large relative to the number of rounds.
        Instead, it's recommended to use ``beta_prior``, which acts in the same way
        as for the other policies in this library.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((3/log2(nchoices), 4), 2).
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Ignored when passing ``ucb_from_empty=True``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
        Note that it is technically incorrect to apply smoothing like this (because
        the predictions from models are not bounded between zero and one), but
        if neither ``beta_prior``, nor ``smoothing`` are passed, the policy can get
        stuck in situations in which it will only choose actions from the first batch
        of observations to which it is fit (if using ``ucb_from_empty=False``), or
        only from the first arms that show rewards (if using ``ucb_from_empty=True``).
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Be aware that the algorithm will use BLAS function calls,
        and if these have multi-threading enabled, it might result in a slow-down
        as both functions compete for available threads.
    
    References
    ----------
    .. [1] Chu, Wei, et al. "Contextual bandits with linear payoff functions."
           Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics. 2011.
    .. [2] Li, Lihong, et al. "A contextual-bandit approach to personalized news article recommendation."
           Proceedings of the 19th international conference on World wide web. ACM, 2010.
    """
    def __init__(self, nchoices, alpha=1.0, lambda_=1.0, fit_intercept=True,
                 use_float=True, method="sm", ucb_from_empty=True,
                 beta_prior=None, smoothing=None, noise_to_smooth=True,
                 assume_unique_reward=False, random_state=None, njobs=1):
        # '_ts' distinguishes UCB (False) from Thompson sampling (True); it
        # changes how '_add_common_lin' stores the first hyperparameter.
        self._ts = False
        self._add_common_lin(alpha, lambda_, fit_intercept, use_float, method, nchoices, njobs)
        base = _LinUCB_n_TS_single(alpha=self.alpha, lambda_=self.lambda_,
                                   fit_intercept=self.fit_intercept,
                                   use_float=self.use_float, method=self.method,
                                   ts=False)
        # Positional args after 'nchoices' map to: batch_train=True,
        # refit_buffer=None, deep_copy_buffer=False (see _add_common_params).
        self._add_common_params(base, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                True, None, False, assume_unique_reward,
                                random_state, assign_algo=True, prior_def_ucb=True,
                                force_unfit_predict=ucb_from_empty)

    def _add_common_lin(self, alpha, lambda_, fit_intercept, use_float, method, nchoices, njobs):
        """
        Validate and store hyperparameters shared by LinUCB and LinTS.

        'nchoices' and 'njobs' are accepted but not used here; they are
        handled by '_add_common_params' instead.
        """
        if isinstance(alpha, int):
            alpha = float(alpha)
        assert isinstance(alpha, float)
        if isinstance(lambda_, int):
            lambda_ = float(lambda_)
        assert lambda_ >= 0.
        assert method in ["chol", "sm"]

        self.alpha = alpha
        self.lambda_ = lambda_
        self.fit_intercept = bool(fit_intercept)
        self.use_float = bool(use_float)
        self.method = method
        # For Thompson sampling (LinTS), the first hyperparameter is the
        # covariance multiplier 'v_sq', so rename the stored attribute.
        if self._ts:
            self.v_sq = self.alpha
            del self.alpha

    def reset_alpha(self, alpha=1.0):
        """
        Set the upper confidence bound parameter to a custom number

        Note
        ----
        This method is only for LinUCB, not for LinTS.

        Parameters
        ----------
        alpha : float
            Parameter to control the upper confidence bound (more is higher).

        Returns
        -------
        self : obj
            This object
        """
        if self._ts:
            raise ValueError("Method is only available for LinUCB")
        if isinstance(alpha, int):
            alpha = float(alpha)
        assert isinstance(alpha, float)
        # Propagate the new value to the template algorithm, and to the
        # per-arm fitted oracles if the policy has already been fit.
        self.alpha = alpha
        self.base_algorithm.alpha = alpha
        if self.is_fitted:
            self._oracles.reset_attribute("alpha", alpha)
        return self

class LinTS(LinUCB):
    """
    Linear Thompson Sampling

    Note
    ----
    This strategy requires each fitted model to store a square matrix with
    dimension equal to the number of features. Thus, memory consumption can grow
    very high with this method.

    Note
    ----
    The 'X' data (covariates) should ideally be centered before passing them
    to 'fit', 'partial_fit', 'predict'.

    Note
    ----
    Be aware that sampling coefficients is an operation that scales poorly with
    the number of columns/features/variables. For wide datasets, it might be
    slower than a bootstrapped approach, especially when using ``sample_unique=True``.
    
    Parameters
    ----------
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    v_sq : float
        Parameter by which to multiply the covariance matrix (more means higher variance).
    lambda_ : float > 0
        Regularization parameter. References assumed this would always be equal to 1, but this
        implementation allows to change it.
    fit_intercept : bool
        Whether to add an intercept term to the coefficients.
    sample_unique : bool
        Whether to sample different coefficients each time a prediction is to
        be made. If passing 'False', when calling 'predict', it will sample
        the same coefficients for all the observations in the same call to
        'predict', whereas if passing 'True', will use a different set of
        coefficients for each observations. Passing 'False' leads to an
        approach which is theoretically wrong, but as sampling coefficients
        can be very slow, using 'False' can provide a reasonable speed up
        without much of a performance penalty.
    use_float : bool
        Whether to use C 'float' type for the required matrices. If passing 'False',
        will use C 'double'. Be aware that memory usage for this model can grow
        very large.
    method : str, one of 'chol' or 'sm'
        Method used to fit the model. Options are:

        ``'chol'``:
            Uses the Cholesky decomposition to solve the linear system from the
            least-squares closed-form each time 'fit' or 'partial_fit' is called.
            This is likely to be faster when fitting the model to a large number
            of observations at once, and is able to better exploit multi-threading.
        ``'sm'``:
            Starts with an inverse diagonal matrix and updates it as each
            new observation comes using the Sherman-Morrison formula, thus
            never explicitly solving the linear system, nor needing to calculate
            a matrix inverse. This is likely to be faster when fitting the model
            to small batches of observations. Be aware that with this method, it
            will add regularization to the intercept if passing 'fit_intercept=True'.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((2/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
        Note that it is technically incorrect to apply smoothing like this (because
        the predictions from models are not bounded between zero and one), but
        if neither ``beta_prior``, nor ``smoothing`` are passed, the policy can get
        stuck in situations in which it will only choose actions from the first batch
        of observations to which it is fit.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
        While this controls random number generation for this metaheuristic,
        there can still be other sources of variations upon re-runs, such as
        data aggregations in parallel (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Be aware that the algorithm will use BLAS function calls,
        and if these have multi-threading enabled, it might result in a slow-down
        as both functions compete for available threads.
    
    References
    ----------
    .. [1] Agrawal, Shipra, and Navin Goyal.
           "Thompson sampling for contextual bandits with linear payoffs."
           International Conference on Machine Learning. 2013.
    """
    def __init__(self, nchoices, v_sq=1.0, lambda_=1.0, fit_intercept=True,
                 sample_unique=False, use_float=True, method="sm",
                 beta_prior=None, smoothing=None, noise_to_smooth=True,
                 assume_unique_reward=False, random_state=None, njobs = 1):
        # With '_ts=True', '_add_common_lin' stores 'v_sq' (covariance
        # multiplier) in place of the UCB 'alpha' attribute.
        self._ts = True
        self._add_common_lin(v_sq, lambda_, fit_intercept, use_float, method, nchoices, njobs)
        # The underlying single-arm model reuses its 'alpha' slot for 'v_sq'.
        base = _LinUCB_n_TS_single(alpha=self.v_sq, lambda_=self.lambda_,
                                   fit_intercept=self.fit_intercept,
                                   use_float=self.use_float, method=self.method,
                                   ts=True, sample_unique=sample_unique)
        # Positional args after 'nchoices' map to: batch_train=True,
        # refit_buffer=None, deep_copy_buffer=False (see _add_common_params).
        self._add_common_params(base, beta_prior, smoothing, noise_to_smooth, njobs, nchoices,
                                True, None, False, assume_unique_reward,
                                random_state, assign_algo=True, prior_def_ucb=False)

    def reset_v_sq(self, v_sq=1.0):
        """
        Set the covariance multiplier to a custom number

        Parameters
        ----------
        v_sq : float
            Parameter by which to multiply the covariance matrix (more means higher variance).

        Returns
        -------
        self : obj
            This object
        """
        if isinstance(v_sq, int):
            v_sq = float(v_sq)
        assert isinstance(v_sq, float)
        # Propagate to the template algorithm (stored under 'alpha'), and to
        # the per-arm fitted oracles if the policy has already been fit.
        self.v_sq = v_sq
        self.base_algorithm.alpha = v_sq
        if self.is_fitted:
            self._oracles.reset_attribute("alpha", v_sq)
        return self

class ParametricTS(_BasePolicyWithExploit):
    """
    Parametric Thompson Sampling

    Thompson sampling in which each arm's score is drawn from a beta
    distribution whose parameters are the base classifier's predicted
    probability scaled by how many observations that arm has seen.

    Parameters
    ----------
    base_algorithm : obj
        Binary classifier fit separately per arm. The following methods are
        looked up in this order of preference:
            1) A 'predict_proba' method with outputs (n_samples, 2), values in [0,1], rows summing to 1
            2) A 'decision_function' method with unbounded outputs (n_samples,) to which it will apply a sigmoid function.
            3) A 'predict' method with outputs (n_samples,) with values in [0,1].
        A list holding a different (or already-fit) classifier per arm is
        also accepted.
    nchoices : int or list-like
        Number of arms/labels to choose from. Alternatively, a list, array,
        or Series of arm names may be given; the outputs from predict will
        then follow these names, arms can be dropped by name, and new ones
        added under a custom name.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        Unless 'None', while a given arm has fewer than 'n' samples with and
        without a reward, its score is predicted as a random draw from a
        beta distribution with prior parameters 'a' and 'b'. Passing "auto"
        computes:
            beta_prior = ((2/log2(nchoices), 4), 2)
        This parameter can have a very large impact on the end results and
        should be tuned accordingly - scenarios with low expected reward
        rates call for priors that produce small random numbers, whereas
        scenarios with large expected reward rates call for stronger priors
        tending towards larger random numbers. The more arms there are, the
        smaller the optimal expected value for these random numbers.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    beta_prior_ts : tuple(float, float)
        Beta prior for the distribution from which probabilities are drawn
        given the base algorithm's estimates. Independent of ``beta_prior``;
        the two are never used together under the same arm. Pass '(0,0)'
        for no prior.
    smoothing : None or tuple (a,b)
        Unless None, predictions are smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        with 'n' the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as
        SVM, in which case you might want to define a class that embeds it
        with some recalibration built-in.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    noise_to_smooth : bool
        When ``smoothing`` is passed, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) so that ties break at random rather than
        always picking the smallest arm index.
        Ignored when passing ``smoothing=None``.
    batch_train : bool
        Whether the base algorithm is fit to data in batches as it arrives
        (streaming) instead of to the whole dataset on each refit. Requires
        a classifier exposing 'partial_fit'.
    refit_buffer : int or None
        Number of observations per arm to keep as a reserve for passing to
        'partial_fit'. If given, until an arm has accumulated at least this
        many observations it will keep them across calls to 'fit' and
        'partial_fit', translating 'partial_fit' calls into 'fit' calls on
        the new plus stored observations. Once the reserve is full,
        'partial_fit' calls enlarge the data batch with the stored
        observations, and old stored observations are gradually replaced by
        new ones (at random, not FIFO). This technique can greatly enhance
        performance when fitting in batches, but memory use can grow large.
        Sparse CSR inputs to 'fit'/'partial_fit' are converted to dense when
        entering this reserve and back to CSR when augmenting new data.
        Calls to 'fit' override this reserve.
        Ignored when passing 'batch_train=False'.
    deep_copy_buffer : bool
        Whether to deep-copy the data stored in the ``refit_buffer``
        reserve. With 'False', while the reserve is not yet full only
        shallow copies are stored, which is faster but prevents Python's
        garbage collector from freeing the memory after the data is
        deleted, and the buffer changes if the original data is
        overwritten.
        Ignored when not using ``refit_buffer``.
    assume_unique_reward : bool
        Whether to assume at most one arm has a reward per observation.
        With 'True', whenever an arm receives a reward, the classifiers of
        every other arm are also fit to that observation with a negative
        label.
    random_state : int, None, RandomState, or Generator
        Either an integer used as seed to initialize a ``Generator`` for
        random number generation, a NumPy ``RandomState`` from which an
        integer is drawn, or a NumPy ``Generator`` used directly.
        While this controls random number generation for this metaheuristic,
        re-runs may still vary from other sources, such as parallel data
        aggregations (e.g. from OpenMP or BLAS functions).
    njobs : int or None
        Number of parallel jobs. None means 1; -1 means the number of CPU
        cores. If the base algorithm is itself parallelized this might slow
        things down as both compete for threads, so don't parallelize both.
        Parallelization uses shared memory, so a speed-up only appears when
        the base classifier releases the Python GIL; otherwise runs get
        slower.
    """
    def __init__(self, base_algorithm, nchoices, beta_prior=None,
                 beta_prior_ts=(0.,0.), smoothing=None, noise_to_smooth=True,
                 batch_train=False, refit_buffer=None, deep_copy_buffer=True,
                 assume_unique_reward=False, random_state=None, njobs=-1):
        self._add_common_params(base_algorithm, beta_prior, smoothing,
                                noise_to_smooth, njobs, nchoices, batch_train,
                                refit_buffer, deep_copy_buffer,
                                assume_unique_reward, random_state)
        # Both prior components must be non-negative.
        assert beta_prior_ts[0] >= 0.
        assert beta_prior_ts[1] >= 0.
        self.beta_prior_ts = beta_prior_ts
        # Per-arm observation counters are required to scale the beta parameters.
        self.force_counters = True

    def reset_beta_prior_ts(self, beta_prior_ts=(0.,0.)):
        """
        Change the Thompson prior to a custom tuple

        Parameters
        ----------
        beta_prior_ts : tuple(float, float)
            Beta prior for the distribution from which probabilities are drawn
            given the base algorithm's estimates. Independent of ``beta_prior``;
            the two are never used together under the same arm. Pass '(0,0)'
            for no prior.

        Returns
        -------
        self : obj
            This object
        """
        assert beta_prior_ts[0] >= 0.
        assert beta_prior_ts[1] >= 0.
        self.beta_prior_ts = beta_prior_ts
        return self

    def _score_matrix(self, X):
        scores = self._oracles.decision_function(X)
        n_obs = self._oracles.get_nobs_by_arm()
        # Arms with enough observations use the fitted model's estimate as
        # the mean of a beta draw; the rest keep their raw scores.
        has_model = n_obs >= self.beta_prior[1]
        n_obs = n_obs.reshape((1, -1))
        prior_a, prior_b = self.beta_prior_ts[0], self.beta_prior_ts[1]
        shape_a = scores[:, has_model] * n_obs[:, has_model] + prior_a
        shape_b = (1. - scores[:, has_model]) * n_obs[:, has_model] + prior_b
        # Clip to a small positive floor: beta parameters must be > 0.
        scores[:, has_model] = self.random_state.beta(
            np.clip(shape_a, a_min=1e-5, a_max=None),
            np.clip(shape_b, a_min=1e-5, a_max=None)
            )
        return scores

    def _exploit(self, X):
        # Pure exploitation: the raw model estimates, with no beta sampling.
        return self._oracles.decision_function(X)

class PartitionedUCB(_BasePolicyWithExploit):
    """
    Tree-partitioned Upper Confidence Bound

    Fits decision trees having non-contextual multi-armed UCB bandits at each leaf.
    Uses the standard approximation for confidence interval of a proportion
    (mean + c * sqrt(mean * (1-mean) / n)).

    This is similar to the 'TreeHeuristic' in the reference paper, but uses UCB as a
    MAB policy instead of Thompson sampling.

    Note
    ----
    This method fits only one tree per arm. As such, it's not recommended for
    high-dimensional data.

    Parameters
    ----------
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    percentile : int (0,100)
        Percentile of the confidence interval to take. Must be strictly
        between 0 and 100 (exclusive).
    ucb_prior : tuple(float, float)
        Prior for the upper confidence bounds generated at each tree leaf. First
        number will be added to the number of positives, and second number to
        the number of negatives. If passing ``beta_prior=None``, will use these alone
        to generate an upper confidence bound and will break ties at random.
    beta_prior : str 'auto', None, or tuple ((a,b), n)
        If not 'None', when there are less than 'n' samples with and without
        a reward from a given arm, it will predict the score for that class as a
        random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'. If set to "auto", will be calculated as:
            beta_prior = ((3/log2(nchoices), 4), 2)
        This parameter can have a very large impact in the end results, and it's
        recommended to tune it accordingly - scenarios with low expected reward rates
        should have priors that result in drawing small random numbers, whereas
        scenarios with large expected reward rates should have stronger priors and
        tend towards larger random numbers. Also, the more arms there are, the smaller
        the optimal expected value for these random numbers.
        Note that this method calculates upper bounds rather than expectations, so the 'a'
        parameter should be higher than for other methods.
        Recommended to use only one of ``beta_prior`` or ``smoothing``.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        This will not work well with non-probabilistic classifiers such as SVM, in which case you might
        want to define a class that embeds it with some recalibration built-in.
        Not recommended for this method.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that it will not achieve a large
        degree of parallelization due to needing many Python computations with
        shared memory and no GIL releasing.
    *args : tuple
        Additional arguments to pass to the decision tree model (this policy uses
        SciKit-Learn's ``DecisionTreeClassifier`` - see their docs for more details).
        Note that passing ``random_state`` for ``DecisionTreeClassifier`` will have
        no effect as it will be set independently.
    **kwargs : dict
        Additional keyword arguments to pass to the decision tree model (this policy uses
        SciKit-Learn's ``DecisionTreeClassifier`` - see their docs for more details).
        Note that passing ``random_state`` for ``DecisionTreeClassifier`` will have
        no effect as it will be set independently.
    
    References
    ----------
    .. [1] Elmachtoub, Adam N., et al.
           "A practical method for solving contextual bandit problems using decision trees."
           arXiv preprint arXiv:1706.04687 (2017).
    .. [2] https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    """
    def __init__(self, nchoices, percentile=80, ucb_prior=(1,1),
                 beta_prior='auto', smoothing=None, noise_to_smooth=True,
                 assume_unique_reward=False, random_state=None, njobs=-1,
                 *args, **kwargs):
        assert (percentile > 0) and (percentile < 100)
        assert ucb_prior[0] >= 0.
        assert ucb_prior[1] >= 0.
        # Store the prior as a float tuple; this same normalized tuple must
        # be reused anywhere the prior is assigned (see 'reset_ucb_prior').
        self.ucb_prior = (float(ucb_prior[0]), float(ucb_prior[1]))

        # Tree oracle template: the per-leaf MAB uses UCB (ts=False) with
        # 'percentile' as the bound level.
        base = _TreeUCB_n_TS_single(self.ucb_prior, ts=False, alpha=float(percentile),
                                    random_state=None, *args, **kwargs)
        self._add_common_params(base, beta_prior, smoothing, noise_to_smooth, njobs,
                                nchoices, False, None, False,
                                assume_unique_reward, random_state,
                                prior_def_ucb = True,
                                force_unfit_predict = beta_prior is None)
        # Without an effective beta prior, unfit arms must still produce
        # predictions from the ucb_prior alone.
        if self.beta_prior[1] <= 0:
            self.force_unfit_predict = True

    def reset_percentile(self, percentile=80):
        """
        Set the upper confidence bound percentile to a custom number

        Parameters
        ----------
        percentile : int (0,100)
            Percentile of the confidence interval to take. Must be strictly
            between 0 and 100 (exclusive).

        Returns
        -------
        self : obj
            This object
        """
        assert (percentile > 0) and (percentile < 100)
        # Cast to float for consistency with '__init__', which stores
        # 'alpha' as float(percentile).
        percentile = float(percentile)
        if self.is_fitted:
            self._oracles.reset_attribute("alpha", percentile)
        self.base_algorithm.alpha = percentile
        return self

    def reset_ucb_prior(self, ucb_prior=(1,1)):
        """
        Set the upper confidence bound prior to a custom tuple

        Parameters
        ----------
        ucb_prior : tuple(float, float)
            Prior for the upper confidence bounds generated at each tree leaf. First
            number will be added to the number of positives, and second number to
            the number of negatives. If passing ``beta_prior=None``, will use these alone
            to generate an upper confidence bound and will break ties at random.

        Returns
        -------
        self : obj
            This object
        """
        assert ucb_prior[0] >= 0.
        assert ucb_prior[1] >= 0.
        self.ucb_prior = (float(ucb_prior[0]), float(ucb_prior[1]))
        # Propagate the normalized float tuple (not the raw user input) so
        # the stored prior matches what '__init__' would have assigned.
        self.base_algorithm.beta_prior = self.ucb_prior
        if self.is_fitted:
            self._oracles.reset_attribute("beta_prior", self.ucb_prior)
        return self

class PartitionedTS(_BasePolicyWithExploit):
    """
    Tree-partitioned Thompson Sampling

    Fits decision trees having non-contextual multi-armed Thompson-sampling
    bandits at each leaf.

    This corresponds to the 'TreeHeuristic' in the reference paper.

    Note
    ----
    This method fits only one tree per arm. As such, it's not recommended for
    high-dimensional data.

    Note
    ----
    The default values for beta prior are as suggested in the reference paper.
    It is recommended to change it however.

    Parameters
    ----------
    nchoices : int or list-like
        Number of arms/labels to choose from. Can also pass a list, array, or Series with arm names, in which case
        the outputs from predict will follow these names and arms can be dropped by name, and new ones added with a
        custom name.
    beta_prior : str 'auto', or tuple ((a,b), n)
        When there are less than 'n' samples with and without a reward from
        a given arm, it will predict the score
        for that class as a random number drawn from a beta distribution with the prior
        specified by 'a' and 'b'.
        If passing 'auto' (which is *not* the default), will use the same default as for
        the other policies in this library:
            beta_prior = ((2/log2(nchoices), 4), 2)
        Additionally, will use (a,b) as prior when sampling from the MAB at a given node.
    smoothing : None or tuple (a,b)
        If not None, predictions will be smoothed as yhat_smooth = (yhat*n + a)/(n + b),
        where 'n' is the number of times each arm was chosen in the training data.
        Not recommended for this method.
    noise_to_smooth : bool
        If passing ``smoothing``, whether to add a small amount of random
        noise ~ Uniform(0, 10^-12) in order to break ties at random instead of
        choosing the smallest arm index.
        Ignored when passing ``smoothing=None``.
    assume_unique_reward : bool
        Whether to assume that only one arm has a reward per observation. If set to 'True',
        whenever an arm receives a reward, the classifiers for all other arms will be
        fit to that observation too, having negative label.
    random_state : int, None, RandomState, or Generator
        Either an integer which will be used as seed for initializing a
        ``Generator`` object for random number generation, a ``RandomState``
        object (from NumPy) from which to draw an integer, or a ``Generator``
        object (from NumPy), which will be used directly.
    njobs : int or None
        Number of parallel jobs to run. If passing None will set it to 1. If passing -1 will
        set it to the number of CPU cores. Note that it will not achieve a large
        degree of parallelization due to needing many Python computations with
        shared memory and no GIL releasing.
    *args : tuple
        Additional arguments to pass to the decision tree model (this policy uses
        SciKit-Learn's ``DecisionTreeClassifier`` - see their docs for more details).
        Note that passing ``random_state`` for ``DecisionTreeClassifier`` will have
        no effect as it will be set independently.
    **kwargs : dict
        Additional keyword arguments to pass to the decision tree model (this policy uses
        SciKit-Learn's ``DecisionTreeClassifier`` - see their docs for more details).
        Note that passing ``random_state`` for ``DecisionTreeClassifier`` will have
        no effect as it will be set independently.

    References
    ----------
    .. [1] Elmachtoub, Adam N., et al.
           "A practical method for solving contextual bandit problems using decision trees."
           arXiv preprint arXiv:1706.04687 (2017).
    .. [2] https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    """
    def __init__(self, nchoices, beta_prior=((1,1), 1), smoothing=None, noise_to_smooth=True,
                 assume_unique_reward=False, random_state=None, njobs=-1,
                 *args, **kwargs):
        if beta_prior is None:
            raise ValueError("Must pass a valid 'beta_prior'.")
        beta_prior = _check_beta_prior(beta_prior, nchoices)
        base = _TreeUCB_n_TS_single(beta_prior[0], ts=True, random_state=None,
                                    *args, **kwargs)
        self._add_common_params(base, beta_prior, smoothing, noise_to_smooth, njobs,
                                nchoices, False, None, False,
                                assume_unique_reward, random_state)