# Python source code of search

from __future__ import absolute_import, print_function

import math
import os
from multiprocessing import TimeoutError
import sys
import time
import warnings

import collections
import sigopt
import numpy
from joblib import Parallel, delayed
from joblib.func_inspect import getfullargspec

# Import the cross-validation helpers from whichever scikit-learn layout is installed and expose
# a uniform ``our_check_cv(cv, X, y, classifier)`` that returns ``(number_of_folds, list_of_splits)``.
try:
    # For scikit-learn >= 0.18
    from sklearn.model_selection import check_cv as base_check_cv
    def our_check_cv(cv, X, y, classifier):
      """Return ``(n_splits, splits)`` where ``splits`` is the list produced by ``split(X, y=y)``."""
      ret = base_check_cv(cv, y, classifier)
      return ret.n_splits, list(ret.split(X, y=y))
    from sklearn.model_selection._search import BaseSearchCV
    from sklearn.model_selection._validation import _fit_and_score
except ImportError:
    # For scikit-learn < 0.18
    from sklearn.grid_search import BaseSearchCV
    from sklearn.cross_validation import check_cv as base_check_cv, _fit_and_score
    def our_check_cv(cv, X, y, classifier):
      """Return ``(len(cv_object), splits)`` where ``splits`` is the list obtained by iterating it."""
      ret = base_check_cv(cv, X, y, classifier)
      return len(ret), list(iter(ret))

from sklearn.metrics.scorer import check_scoring
from sklearn.utils.validation import _num_samples, indexable
from sklearn.base import is_classifier, clone

# True when the interpreter's major version is 3 or later; ``_convert_unicode`` returns its input
# unchanged in that case.
HANDLES_UNICODE = sys.version_info[0] >= 3


class SigOptSearchCV(BaseSearchCV):
    """SigOpt powered search on hyper parameters.
    SigOptSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.
    The parameters of the estimator used to apply these methods are optimized
    by cross-validated search over parameter settings.
    In contrast to GridSearchCV, not all parameter values are tried out, but
    rather a fixed number of parameter settings is chosen from the specified
    domains. The number of parameter settings that are tried is
    given by n_iter.

    Parameters
    ----------
    estimator : estimator object.
        An object of that type is instantiated for each grid point.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.
    param_domains : dict
        Dictionary with parameters names (string) as keys and domains as lists
        of parameter ranges to try. Domains are either lists of categorical
        (string) values or 2 element lists specifying a min and max for integer
        or float parameters
    n_iter : int, default=10
        Number of parameter settings that are sampled. n_iter trades off runtime
        vs quality of the solution.
    n_sug : int, default=1
        Number of suggestions to retrieve from SigOpt for evaluation in parallel
    client_token : string, optional
        SigOpt API client token, find yours here:
        https://sigopt.com/tokens. This field is required except when the
        ``sigopt_connection`` argument is present or when the
        ``SIGOPT_API_TOKEN`` environment variable is set. We recommend using
        this instead of ``sigopt_connection``.
    sigopt_connection : sigopt.interface.Connection, optional
        SigOpt API Connection object. If present, this object will be used to
        connect to SigOpt in lieu of the client token. We recommend using the
        ``client_token`` option instead of this one.
    opt_timeout : float, optional
        Max time for entire optimization process
    cv_timeout : float, optional
        Max time each CV fold objective evaluation can take
    scoring : string, callable or None, default=None
        A string (see model evaluation documentation) or a scorer callable
        object / function with signature ``scorer(estimator, X, y)``. If
        ``None``, the ``score`` method of the estimator is used.
    fit_params : dict, optional
        Parameters to pass to the fit method.
    n_jobs : int, default=1
        Number of jobs to run in parallel.
    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an explosion of
        memory consumption when more jobs get dispatched than CPUs can process.
        This parameter can be:
            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs
            - An int, giving the exact number of total jobs that are
              spawned
            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'
    iid : boolean, default=True
        If True, the data is assumed to be identically distributed across the
        folds, and the loss minimized is the total loss per sample, and not the
        mean loss across the folds.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.
        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` used. In all other
        cases, :class:`KFold` is used.
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.
    refit : boolean, default=True
        Refit the best estimator with the entire dataset. If "False", it is
        impossible to make predictions using this SigOptSearchCV instance
        after fitting.
    verbose : integer
        Controls the verbosity: the higher, the more messages.
    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting. If
        set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    Attributes
    ----------
    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if refit=False.
    best_score_ : float
        Score of best_estimator on the left out data.
    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

    Notes
    -----
    The parameters selected are those that maximize the score of the held-out
    data, according to the scoring parameter.
    If `n_jobs` was set to a value higher than one, the data is copied for each
    parameter setting (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available.  A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.
    """
    def __init__(self, estimator, param_domains, n_iter=10, scoring=None,
                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                 verbose=0, n_sug=1, pre_dispatch='2*n_jobs',
                 error_score='raise', cv_timeout=None, opt_timeout=None,
                 client_token=None, sigopt_connection=None, experiment=None):
        self.param_domains = param_domains
        self.n_iter = n_iter
        self.n_sug = n_sug
        self.cv_timeout = cv_timeout
        self.opt_timeout = opt_timeout
        self.verbose = verbose

        # Stores the mappings between categorical strings to Python values. The keys correspond to parameter names and
        # values correspond to the string-to-value mappings themselves.
        self.categorical_mappings_ = {}

        self.scorer_ = None
        self.our_best_params_ = None
        self.our_best_score_ = None
        self.our_best_estimator_ = None
        self.experiment = experiment

        # Set up sigopt_connection
        found_token = client_token or os.environ.get('SIGOPT_API_TOKEN')
        if (not found_token) and (not sigopt_connection):
            raise ValueError(
                'Please set the `SIGOPT_API_TOKEN` environment variable, pass the ``client_token`` parameter, or pass '
                'the ``sigopt_connection`` parameter. You can find your client token here: '
                'https://sigopt.com/tokens.')
        else:
            self.sigopt_connection = (sigopt_connection if sigopt_connection
                    else sigopt.Connection(client_token=found_token))

        super(SigOptSearchCV, self).__init__(
            estimator=estimator, scoring=scoring, fit_params=fit_params,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score)

    def _transform_param_domains(self, param_domains):
        def _transform_param(param_name, param_bounds):
            """Transform a parameter name and its bounds into a form that can be sent to the API layer."""
            def _check_bounds():
                """Check that min/max bounds are well formed."""
                if len(param_bounds) != 2:
                    raise Exception('Parameter bounds must be specified with two numbers! Not sure what to do with {}.'
                                    .format(param_bounds))
                if not isinstance(param_bounds, tuple):
                    warnings.warn('Parameter bounds should be specified as a tuple in the form (min, max).')

            # Check that param bounds is either iterable (range/categoricals) or a dict (categoricals)
            if not isinstance(param_bounds, (collections.Iterable, dict)):
              raise Exception('Parameter bounds must be iterable or dicts! The range {} isn\'t friendly!'
                              .format(param_bounds))

            param_dict = {'name': param_name}
            if isinstance(param_bounds, dict):
                # This is a categorical with mappings between strings and values
                param_dict['type'] = 'categorical'
                param_dict['categorical_values'] = [{'name': k} for k in param_bounds.keys()]

                # Add this mapping to our set of categorical string mappings
                self.categorical_mappings_[param_name] = param_bounds

            elif all(isinstance(x, str) for x in param_bounds):
                # This is a categorical with a list of strings naming each category
                param_dict['type'] = 'categorical'
                param_dict['categorical_values'] = [{'name': k} for k in param_bounds]

            elif all(isinstance(x, int) for x in param_bounds):
                # This is an integer parameter
                _check_bounds()
                param_dict['type'] = 'int'
                param_dict['bounds'] = {'min': param_bounds[0], 'max': param_bounds[1]}

            elif any(isinstance(x, float) for x in param_bounds):
                # This is a continuous parameter. Note that we use `any` since the user may pass some combination of
                # float and integer parameters, e.g. (0, 0.1).
                _check_bounds()
                param_dict['type'] = 'double'
                param_dict['bounds'] = {'min': param_bounds[0], 'max': param_bounds[1]}

            else:
                # Not sure what the user gave us here
                raise Exception('Bad parameter range {}.'.format(param_bounds))

            return param_dict

        # generate sigopt experiment parameters
        return [_transform_param(name, bounds) for (name, bounds) in param_domains.items()]

    def _create_sigopt_exp(self, conn):
        est_name = self.estimator.__class__.__name__
        exp_name = est_name + ' (sklearn)'
        if len(exp_name) > 50:
            exp_name = est_name

        if self.verbose > 0:
            print('Creating SigOpt experiment: ', exp_name)

        # create sigopt experiment
        experiment = conn.experiments().create(
            name=exp_name,
            parameters=self._transform_param_domains(self.param_domains),
            observation_budget=self.n_iter,
        )

        if self.verbose > 0:
            exp_url = 'https://sigopt.com/experiment/{0}'.format(self.experiment.id)
            print('Experiment progress available at :', exp_url)

        return experiment

    # NOTE(patrick): SVM can't handle unicode, so we need to convert those to string.
    def _convert_unicode(self, data):
        """Recursively turn text values in ``data`` into ``str`` on interpreters that need it.

        When ``HANDLES_UNICODE`` is true the input is returned untouched. Otherwise text values
        are converted with ``str``, mappings are rebuilt as plain dicts from their converted items,
        and other iterables are rebuilt with their own type from their converted elements. Anything
        else is returned as-is.
        """
        if not HANDLES_UNICODE:
            recurse = self._convert_unicode
            # ``basestring`` is only defined on Python 2, the only place this branch runs.
            if isinstance(data, basestring):  # pylint: disable=undefined-variable
                data = str(data)
            elif isinstance(data, collections.Mapping):
                data = dict(map(recurse, data.items()))
            elif isinstance(data, collections.Iterable):
                data = type(data)(map(recurse, data))
        return data

    def _convert_log_params(self, param_dict):
      # searches through names for params and converts params with __log__ names
      log_converted_dict = {}
      for pname in param_dict:
        pval = param_dict[pname]
        if '__log__' in pname:
          pval = math.exp(pval)
          pname = pname.replace('__log__', '')
        log_converted_dict[pname] = pval
      return log_converted_dict

    def _convert_nonstring_categoricals(self, param_dict):
        """Apply the self.categorical_mappings_ mappings where necessary."""
        return {name: (self.categorical_mappings_[name][val] if name in self.categorical_mappings_ else val)
                for (name, val) in param_dict.items()}

    def _convert_sigopt_api_to_sklearn_assignments(self, param_dict):
        """Turn SigOpt assignments into keyword parameters usable by the estimator.

        Applies, in order: unicode conversion, ``__log__`` parameter conversion, and the lookup of
        non-string categorical values.
        """
        without_unicode = self._convert_unicode(param_dict)
        without_log_names = self._convert_log_params(without_unicode)
        return self._convert_nonstring_categoricals(without_log_names)

    # pylint: disable=unused-argument
    def _run_search(self, evaluate_candidates):
        # NOTE(patrick): scikit-learn 0.20.0 checks for the existence of this method, since
        # the default implementation of `_fit` calls it. However, to maintain compatibility
        # with older versions, we completely override _fit, so this method is unused. But
        # we make sure it exists, so that the class can be instantiated
        raise NotImplementedError('_run_search not used in this implementation')
    # pylint: enable=unused-argument

    def _fit(self, X, y, groups=None, parameter_iterable=None, **fit_params):
        """Run the SigOpt-driven cross-validated search and optionally refit the best model.

        Suggestions are requested from SigOpt in batches of up to ``n_sug``, each suggestion is
        scored on every CV fold with joblib, and the mean/standard deviation of the fold scores is
        reported back as an observation. The loop stops after ``n_iter`` suggestions or once
        ``opt_timeout`` seconds have elapsed (always after at least one batch).

        Parameters
        ----------
        X, y : training data and optional targets.
        groups : must be ``None``; not supported.
        parameter_iterable : must be ``None``; not supported.
        **fit_params : passed to the estimator's ``fit``; replaced by ``self.fit_params`` when
            that attribute is not ``None``.

        Returns
        -------
        self

        Raises
        ------
        NotImplementedError
            If ``groups`` or ``parameter_iterable`` is given.
        ValueError
            If ``y`` and ``X`` have different numbers of samples.
        RuntimeError
            If SigOpt reports no best assignments after the search.
        """
        if groups is not None:
            raise NotImplementedError('The groups argument is not supported.')
        if parameter_iterable is not None:
            raise NotImplementedError('The parameter_iterable argument is not supported.')
        if self.fit_params is not None:
            fit_params = self.fit_params

        # Actual fitting,  performing the search over parameters.
        estimator = self.estimator
        self.scorer_ = check_scoring(estimator, scoring=self.scoring)

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None and len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))

        n_folds, cv_iter = our_check_cv(self.cv, X, y, classifier=is_classifier(estimator))
        base_estimator = clone(estimator)

        # setup SigOpt experiment and run optimization
        if not self.experiment:
            self.experiment = self._create_sigopt_exp(self.sigopt_connection)

        # joblib settings do not change between batches, so build them once.
        par_kwargs = {'n_jobs': self.n_jobs, 'verbose': self.verbose,
                      'pre_dispatch': self.pre_dispatch}
        # add timeout kwarg if version of joblib supports it
        if 'timeout' in getfullargspec(Parallel.__init__).args:
            par_kwargs['timeout'] = self.cv_timeout

        # start tracking time to optimize estimator
        opt_start_time = time.time()
        for jk in range(0, self.n_iter, self.n_sug):
            # check for opt timeout, ensuring at least 1 observation
            if (
                self.opt_timeout is not None and
                jk >= 1 and
                time.time() - opt_start_time > self.opt_timeout
            ):
                # break out of loop and refit model with best params so far
                break

            # Never request more suggestions than what is left of the ``n_iter`` budget; the last
            # batch is smaller when ``n_iter`` is not a multiple of ``n_sug``.
            batch_size = min(self.n_sug, self.n_iter - jk)

            suggestions = []
            parameter_configs = []
            for _ in range(batch_size):
                suggestion = self.sigopt_connection.experiments(self.experiment.id).suggestions().create()
                parameters = self._convert_sigopt_api_to_sklearn_assignments(suggestion.assignments.to_json())
                suggestions.append(suggestion)
                parameter_configs.append(parameters)

            if self.verbose > 0:
                print('Evaluating params : ', parameter_configs)

            # do CV folds in parallel using joblib
            # returns scores on test set, ordered suggestion-major then fold
            try:
                out = Parallel(
                    **par_kwargs
                )(
                    delayed(_fit_and_score)(clone(base_estimator), X, y,
                                            self.scorer_, train, test,
                                            self.verbose, parameters,
                                            fit_params,
                                            return_parameters=True,
                                            error_score=self.error_score)
                        for parameters in parameter_configs
                        for train, test in cv_iter)
            except TimeoutError:
                # The whole batch timed out: report a failure for every suggestion in it, not only
                # the last one, so that no suggestion is left open.
                for suggestion in suggestions:
                    self.sigopt_connection.experiments(self.experiment.id).observations().create(
                        suggestion=suggestion.id,
                        failed=True)
                continue

            # grab scores from results: each suggestion owns ``n_folds`` consecutive entries
            for sidx, suggestion in enumerate(suggestions):
                out_idx = sidx * n_folds
                scores = [o[0] for o in out[out_idx:out_idx+n_folds]]
                self.sigopt_connection.experiments(self.experiment.id).observations().create(
                    suggestion=suggestion.id,
                    value=numpy.mean(scores),
                    value_stddev=numpy.std(scores)
                )

        # return best SigOpt assignments so far
        best_assignments = self.sigopt_connection.experiments(self.experiment.id).best_assignments().fetch().data

        if not best_assignments:
            raise RuntimeError(
                'No valid observations found. '
                'Make sure opt_timeout and cv_timeout provide sufficient time for observations to be reported.')

        self.our_best_params_ = self._convert_sigopt_api_to_sklearn_assignments(
            best_assignments[0].assignments.to_json())
        self.our_best_score_ = best_assignments[0].value

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**self.best_params_)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.our_best_estimator_ = best_estimator
        return self

    @property
    def best_params_(self):
        """Return ``our_best_params_``: ``None`` initially, set by ``_fit`` to the converted best assignments."""
        return self.our_best_params_

    @property
    def best_score_(self):
        """Return ``our_best_score_``: ``None`` initially, set by ``_fit`` to the best assignment's value."""
        return self.our_best_score_

    @property
    def best_estimator_(self):
        """Return ``our_best_estimator_``: ``None`` initially, set by ``_fit`` only when ``refit`` is true."""
        return self.our_best_estimator_

    def fit(self, X, y=None, groups=None, **fit_params):
        """
        Run fit on the estimator with parameters chosen sequentially by SigOpt.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        """
        return self._fit(X, y=y, groups=groups, **fit_params)