Python sklearn.metrics.make_scorer() Examples

The following are 30 code examples of sklearn.metrics.make_scorer(), taken from open-source projects. Follow the link above each example to view the original project or source file. You may also want to check out all available functions and classes of the sklearn.metrics module.
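Before the project examples, a minimal illustrative sketch of the pattern (this is not one of the 30 examples): make_scorer() wraps a plain metric function with the metric(y_true, y_pred, **kwargs) signature into a scorer object with the (estimator, X, y) signature expected by GridSearchCV, RandomizedSearchCV and cross_val_score. Keyword arguments passed to make_scorer are forwarded to the metric, and greater_is_better=False negates the result so that loss functions can still be maximized. The iris dataset and LogisticRegression below are placeholders chosen only for illustration.

# Minimal sketch of the make_scorer pattern (illustrative only).
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)

# Wrap a metric into a scorer; extra kwargs (here average='macro') are forwarded to f1_score.
macro_f1 = make_scorer(f1_score, average='macro')

# The scorer can be passed anywhere a scoring= argument is accepted.
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, scoring=macro_f1, cv=5)
print(scores.mean())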
Example #1
Source File: listing_9_3_backtest.py    From fight-churn with MIT License
def backtest(data_set_path,n_test_split):

    X,y = prepare_data(data_set_path,as_retention=False)

    tscv = TimeSeriesSplit(n_splits=n_test_split)

    lift_scorer = make_scorer(calc_lift, needs_proba=True)
    score_models = {'lift': lift_scorer, 'AUC': 'roc_auc'}

    retain_reg = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True)

    gsearch = GridSearchCV(estimator=retain_reg,scoring=score_models, cv=tscv, verbose=1,
                           return_train_score=False,  param_grid={'C' : [1]}, refit='AUC')

    gsearch.fit(X,y)
    result_df = pd.DataFrame(gsearch.cv_results_)

    save_path = data_set_path.replace('.csv', '_backtest.csv')
    result_df.to_csv(save_path, index=False)
    print('Saved test scores to ' + save_path) 
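Note that on recent scikit-learn releases the needs_proba argument of make_scorer is deprecated (since 1.4) in favor of response_method. A hedged equivalent of the lift scorer above, assuming scikit-learn >= 1.4:

# Equivalent construction on scikit-learn >= 1.4, where response_method replaces needs_proba.
lift_scorer = make_scorer(calc_lift, response_method='predict_proba')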
Example #2
Source File: test_pipeline.py    From lale with Apache License 2.0
def test_with_gridsearchcv3_auto(self):
        from sklearn.model_selection import GridSearchCV
        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score, make_scorer
        lr = LogisticRegression()
        from sklearn.pipeline import Pipeline
        scikit_pipeline = Pipeline([(Nystroem().name(), Nystroem()), (lr.name(), LogisticRegression())])
        all_parameters = get_grid_search_parameter_grids(Nystroem()>>lr, num_samples=1)
        # otherwise the test takes too long
        parameters = random.sample(all_parameters, 2)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            clf = GridSearchCV(scikit_pipeline, parameters, cv=2, scoring=make_scorer(accuracy_score))
            iris = load_iris()
            clf.fit(iris.data, iris.target)
            predicted = clf.predict(iris.data) 
Example #3
Source File: scoring.py    From skorch with BSD 3-Clause "New" or "Revised" License
def convert_sklearn_metric_function(scoring):
    """If ``scoring`` is a sklearn metric function, convert it to a
    sklearn scorer and return it. Otherwise, return ``scoring`` unchanged."""
    if callable(scoring):
        module = getattr(scoring, '__module__', None)

        # those are scoring objects returned by make_scorer starting
        # from sklearn 0.22
        scorer_names = ('_PredictScorer', '_ProbaScorer', '_ThresholdScorer')
        if (
                hasattr(module, 'startswith') and
                module.startswith('sklearn.metrics.') and
                not module.startswith('sklearn.metrics.scorer') and
                not module.startswith('sklearn.metrics.tests.') and
                not scoring.__class__.__name__ in scorer_names
        ):
            return make_scorer(scoring)
    return scoring 
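A brief usage sketch for the helper above; the names are assumed to be in scope (in skorch the helper lives in its scoring module):

# Hypothetical usage of convert_sklearn_metric_function (names assumed in scope).
from sklearn.metrics import accuracy_score

scorer = convert_sklearn_metric_function(accuracy_score)   # plain metric -> scorer via make_scorer
passthrough = convert_sklearn_metric_function('accuracy')  # strings and existing scorers pass through unchanged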
Example #4
Source File: regression_tests.py    From drifter_ml with MIT License
def mse_cv(self, cv):
        """
        This method performs cross-validation over mean squared error.
        
        Parameters
        ----------
        * cv : integer
          The number of cross validation folds to perform

        Returns
        -------
        Returns the scores of the k-fold mean squared error.
        """
        mse = metrics.make_scorer(metrics.mean_squared_error)
        result = cross_validate(self.reg, self.X,
                                self.y, cv=cv,
                                scoring=(mse))
        return self.get_test_score(result) 
Example #5
Source File: regression_tests.py    From drifter_ml with MIT License
def mae_cv(self, cv):
        """
        This method performs cross-validation over median absolute error.
        
        Parameters
        ----------
        * cv : integer
          The number of cross validation folds to perform

        Returns
        -------
        Returns the scores of the k-fold median absolute error.
        """

        mae = metrics.make_scorer(metrics.median_absolute_error)
        result = cross_validate(self.reg, self.X,
                                self.y, cv=cv,
                                scoring=(mae))
        return self.get_test_score(result) 
Example #6
Source File: listing_9_6_crossvalidate_xgb.py    From fight-churn with MIT License
def crossvalidate_xgb(data_set_path,n_test_split):

    X,y = prepare_data(data_set_path,ext='',as_retention=False)

    tscv = TimeSeriesSplit(n_splits=n_test_split)

    score_models = {'lift': make_scorer(calc_lift, needs_proba=True), 'AUC': 'roc_auc'}

    xgb_model = xgb.XGBClassifier(objective='binary:logistic')
    test_params = { 'max_depth': [1,2,4,6],
                    'learning_rate': [0.1,0.2,0.3,0.4],
                    'n_estimators': [20,40,80,120],
                    'min_child_weight' : [3,6,9,12]}
    gsearch = GridSearchCV(estimator=xgb_model,n_jobs=-1, scoring=score_models, cv=tscv, verbose=1,
                           return_train_score=False,  param_grid=test_params,refit='AUC')
    gsearch.fit(X.values,y)

    result_df = pd.DataFrame(gsearch.cv_results_)
    result_df.sort_values('mean_test_AUC',ascending=False,inplace=True)
    save_path = data_set_path.replace('.csv', '_crossval_xgb.csv')
    result_df.to_csv(save_path, index=False)
    print('Saved test scores to ' + save_path)

    pickle_path = data_set_path.replace('.csv', '_xgb_model.pkl')
    with open(pickle_path, 'wb') as fid:
        pickle.dump(gsearch.best_estimator_, fid)
    print('Saved model pickle to ' + pickle_path)

    predictions = gsearch.best_estimator_.predict_proba(X.values)
    predict_df = pd.DataFrame(predictions, index=X.index, columns=['retain_prob','churn_prob'])
    forecast_save_path = data_set_path.replace('.csv', '_xgb_predictions.csv')
    print('Saving results to %s' % forecast_save_path)
    predict_df.to_csv(forecast_save_path, header=True)

    forecast_histogram(data_set_path,predict_df,ext='xgb') 
Example #7
Source File: sklearn_test.py    From keras-tuner with Apache License 2.0
def test_sklearn_custom_scoring_and_cv(tmp_dir):
    tuner = sklearn_tuner.Sklearn(
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective('score', 'max'),
            max_trials=10),
        hypermodel=build_model,
        scoring=metrics.make_scorer(metrics.balanced_accuracy_score),
        cv=model_selection.StratifiedKFold(5),
        directory=tmp_dir)

    x = np.random.uniform(size=(50, 10))
    y = np.random.randint(0, 2, size=(50,))
    tuner.search(x, y)

    assert len(tuner.oracle.trials) == 10

    best_trial = tuner.oracle.get_best_trials()[0]
    assert best_trial.status == 'COMPLETED'
    assert best_trial.score is not None
    assert best_trial.best_step == 0
    assert best_trial.metrics.exists('score')

    # Make sure best model can be reloaded.
    best_model = tuner.get_best_models()[0]
    best_model.score(x, y) 
Example #8
Source File: ml_tune.py    From ml-parameter-optimization with MIT License
def apply_gridsearch(self,model):
        """
        apply grid search on ml algorithm to specified parameters
        returns updated best score and parameters
        """
        # check if a custom evaluation function is specified
        if callable(self.params_cv['scoring']):
            scoring = make_scorer(self.params_cv['scoring'],greater_is_better=self._greater_is_better)
        else:
            scoring = self.params_cv['scoring']
        
        gsearch = GridSearchCV(estimator=model,param_grid=self.get_params_tune(),scoring=scoring,
                               iid=self.params_cv['iid'],cv=self.params_cv['cv_folds'],n_jobs=self.params_cv['n_jobs'])
        gsearch.fit(self.X,self.y)
        
        # update best model if best_score is improved
        if (gsearch.best_score_ * self._score_mult) > (self.best_score * self._score_mult):
            self.best_model = clone(gsearch.best_estimator_)
            self.best_score = gsearch.best_score_
        
        # update tuned parameters with optimal values
        for key,value in gsearch.best_params_.items():
            self._params[key] = value
        self._temp_score = gsearch.best_score_
        return self 
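The iid argument used above was deprecated in scikit-learn 0.22 and removed in 0.24; on current releases the same search would be constructed without it. A hedged sketch, assuming the same attributes on self:

# Equivalent GridSearchCV construction on scikit-learn >= 0.24 (no iid argument).
gsearch = GridSearchCV(estimator=model, param_grid=self.get_params_tune(), scoring=scoring,
                       cv=self.params_cv['cv_folds'], n_jobs=self.params_cv['n_jobs'])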
Example #9
Source File: sklearn_test.py    From keras-tuner with Apache License 2.0
def test_sklearn_real_data(tmp_dir):
    tuner = sklearn_tuner.Sklearn(
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective('score', 'max'),
            max_trials=10),
        hypermodel=build_model,
        scoring=metrics.make_scorer(metrics.accuracy_score),
        cv=model_selection.StratifiedKFold(5),
        directory=tmp_dir)

    x, y = datasets.load_iris(return_X_y=True)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.2)

    tuner.search(x_train, y_train)

    best_models = tuner.get_best_models(10)
    best_model = best_models[0]
    worst_model = best_models[9]
    best_model_score = best_model.score(x_test, y_test)
    worst_model_score = worst_model.score(x_test, y_test)

    assert best_model_score > 0.8
    assert best_model_score >= worst_model_score 
Example #10
Source File: regression_tests.py    From drifter_ml with MIT License
def tse_cv(self, cv):
        """
        This method performs cross-validation over trimean squared error.
        
        Parameters
        ----------
        * cv : integer
          The number of cross validation folds to perform

        Returns
        -------
        Returns the scores of the k-fold trimean squared error.
        """
        tse = metrics.make_scorer(self.trimean_squared_error)
        result = cross_validate(self.reg, self.X,
                                self.y, cv=cv,
                                scoring=(tse))
        return self.get_test_score(result) 
Example #11
Source File: regression_tests.py    From drifter_ml with MIT License
def tae_cv(self, cv):
        """
        This method performs cross-validation over trimean absolute error.
        
        Parameters
        ----------
        * cv : integer
          The number of cross validation folds to perform

        Returns
        -------
        Returns the scores of the k-fold trimean absolute error.
        """
        tae = metrics.make_scorer(self.trimean_absolute_error)
        result = cross_validate(self.reg, self.X,
                                self.y, cv=cv,
                                scoring=(tae))
        return self.get_test_score(result) 
Example #12
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_grid_search_cv_results_multimetric():
    X, y = make_classification(n_samples=50, n_features=4, random_state=42)

    n_splits = 3
    params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]),
              dict(kernel=['poly', ], degree=[1, 2])]

    for iid in (False, True):
        grid_searches = []
        for scoring in ({'accuracy': make_scorer(accuracy_score),
                         'recall': make_scorer(recall_score)},
                        'accuracy', 'recall'):
            grid_search = GridSearchCV(SVC(gamma='scale'), cv=n_splits,
                                       iid=iid, param_grid=params,
                                       scoring=scoring, refit=False)
            grid_search.fit(X, y)
            assert_equal(grid_search.iid, iid)
            grid_searches.append(grid_search)

        compare_cv_results_multimetric_with_single(*grid_searches, iid=iid) 
Example #13
Source File: test_validation.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cross_val_score_with_score_func_regression():
    X, y = make_regression(n_samples=30, n_features=20, n_informative=5,
                           random_state=0)
    reg = Ridge()

    # Default score of the Ridge regression estimator
    scores = cross_val_score(reg, X, y, cv=5)
    assert_array_almost_equal(scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # R2 score (aka. determination coefficient) - should be the
    # same as the default estimator score
    r2_scores = cross_val_score(reg, X, y, scoring="r2", cv=5)
    assert_array_almost_equal(r2_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2)

    # Mean squared error; this is a loss function, so "scores" are negative
    neg_mse_scores = cross_val_score(reg, X, y, cv=5,
                                     scoring="neg_mean_squared_error")
    expected_neg_mse = np.array([-763.07, -553.16, -274.38, -273.26, -1681.99])
    assert_array_almost_equal(neg_mse_scores, expected_neg_mse, 2)

    # Explained variance
    scoring = make_scorer(explained_variance_score)
    ev_scores = cross_val_score(reg, X, y, cv=5, scoring=scoring)
    assert_array_almost_equal(ev_scores, [0.94, 0.97, 0.97, 0.99, 0.92], 2) 
Example #14
Source File: test_core_operators.py    From lale with Apache License 2.0
def test_with_randomizedsearchcv(self):
        from sklearn.model_selection import RandomizedSearchCV
        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score, make_scorer
        from scipy.stats.distributions import uniform
        import numpy as np
        lr = LogisticRegression()
        parameters = {'solver':('liblinear', 'lbfgs'), 'penalty':['l2']}
        ranges, cat_idx = lr.get_param_ranges()
        min_C, max_C, default_C = ranges['C']
        # specify parameters and distributions to sample from
        #the loguniform distribution needs to be taken care of properly
        param_dist = {"solver": ranges['solver'],
                      "C": uniform(min_C, np.log(max_C))}
        # run randomized search
        n_iter_search = 5
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            random_search = RandomizedSearchCV(
                lr, param_distributions=param_dist, n_iter=n_iter_search, cv=5,
                scoring=make_scorer(accuracy_score))
            iris = load_iris()
            random_search.fit(iris.data, iris.target) 
Example #15
Source File: scorer.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def check_scoring(estimator, scoring=None, **kwargs):
    res = sklearn_check_scoring(estimator, scoring=scoring, **kwargs)
    if callable(scoring):
        # Heuristic to ensure user has not passed a metric
        module = getattr(scoring, "__module__", None)
        if (
            hasattr(module, "startswith")
            and module.startswith("dask_ml.metrics.")
            and not module.startswith("dask_ml.metrics.scorer")
            and not module.startswith("dask_ml.metrics.tests.")
        ):
            raise ValueError(
                "scoring value %r looks like it is a metric "
                "function rather than a scorer. A scorer should "
                "require an estimator as its first parameter. "
                "Please use `make_scorer` to convert a metric "
                "to a scorer." % scoring
            )
    if scoring in SCORERS.keys():
        func, kwargs = SCORERS[scoring]
        return make_scorer(func, **kwargs)
    return res 
Example #16
Source File: churn_calc.py    From fight-churn with MIT License
def crossvalidate_churn_model(self,model_code,groups=True):
        X,y = self.prepare_xy(groups)
        params = self.cv_params(model_code)
        model = self.model_instance(model_code)
        tscv = TimeSeriesSplit(n_splits=3)
        lift_scorer = make_scorer(top_decile_lift,needs_proba=True)
        score_models = {'lift_scorer' : lift_scorer, 'AUC' : 'roc_auc'}
        gsearch = GridSearchCV(estimator=model, param_grid=params, scoring=score_models, cv=tscv, n_jobs=8,verbose=5,
                               return_train_score=True,refit='AUC')


        gsearch.fit(X, y)
        result_df = pd.DataFrame(gsearch.cv_results_)
        if len(params)>1:
            result_df.sort_values('mean_test_AUC',ascending=False,inplace=True)


        save_file_name = model_code + '_CV'
        save_path = self.save_path(save_file_name, subdir=self.grouping_correlation_subdir(groups))

        result_df.to_csv(save_path)
        print('Saved result to ' + save_path)
        return result_df 
Example #17
Source File: test_core_operators.py    From lale with Apache License 2.0
def test_clone_operator_choice(self):
        from sklearn.model_selection import cross_val_score
        from sklearn.metrics import accuracy_score, make_scorer
        from sklearn.base import clone
        from sklearn.datasets import load_iris
        iris = load_iris()
        X, y = iris.data, iris.target

        lr = LogisticRegression()
        trainable = PCA() >> lr 
        trainable_wrapper = make_sklearn_compat(trainable)
        trainable2 = clone(trainable_wrapper)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            result = cross_val_score(trainable_wrapper, X, y,
                                     scoring=make_scorer(accuracy_score), cv=2)
            result2 = cross_val_score(trainable2, X, y,
                                      scoring=make_scorer(accuracy_score), cv=2)
        for i in range(len(result)):
            self.assertEqual(result[i], result2[i]) 
Example #18
Source File: test_optimizers.py    From lale with Apache License 2.0
def test_with_gridsearchcv_auto_wrapped_pipe1(self):
        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score, make_scorer
  
        lr = LogisticRegression()
        pca = PCA()
        trainable = pca >> lr

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            from lale.lib.lale import GridSearchCV
            clf = GridSearchCV(
                estimator=trainable, lale_num_samples=1, lale_num_grids=1,
                cv=2, scoring=make_scorer(accuracy_score))
            iris = load_iris()
            clf.fit(iris.data, iris.target) 
Example #19
Source File: test_scoring.py    From skorch with BSD 3-Clause "New" or "Revised" License
def test_with_make_scorer_accuracy_score(
            self, net_cls, module_cls, scoring_cls, train_split, data,
    ):
        net = net_cls(
            module_cls,
            callbacks=[scoring_cls(make_scorer(accuracy_score))],
            batch_size=1,
            max_epochs=2,
            train_split=train_split,
        )
        net.fit(*data)

        score_epochs = net.history[:, 'accuracy_score']
        assert np.allclose(score_epochs, [0, 0])

        score_batches = net.history[:, 'batches', :, 'accuracy_score']
        assert np.allclose(score_batches, [[0, 0], [0, 0]]) 
Example #20
Source File: ABuMLGrid.py    From abu with GNU General Public License v3.0
def grid_search_init_n_components(estimator, x, y, n_components_range=None, cv=10, n_jobs=-1,
                                  scoring=None, show=True):
    """
    Wraps a grid search over the 'n_components' keyword argument to find its optimal value;
    serves as the callback function for _estimators_prarms_best in AbuMLCreater,
    see AbuMLCreater._estimators_prarms_best() for details.

    :param estimator: learner object
    :param x: training-set x matrix, a numpy matrix
    :param y: training-set y sequence, a numpy sequence
    :param n_components_range: default None; if None, the following is used:
            n_estimators_range = np.arange(2, np.maximum(10, int(x.shape[1]) - 1), 1)

    :param cv: int, the train/test split parameter for GridSearchCV, default 10
    :param n_jobs: number of processes to run in parallel, default -1, i.e. as many processes as CPUs
    :param scoring: scoring method for the test set, default None; if None, classifiers are scored with
                    accuracy and regressors with explained_variance_score, wrapped via make_scorer
    :param show: whether to plot the results
    :return: e.g. (0.82154882154882158, {'n_components': 10})
    """
    if n_components_range is None:
        n_components_range = np.arange(2, np.maximum(10, int(x.shape[1]) - 1), 1)

    return grid_search_init_kwargs(estimator, x, y, 'n_components', n_components_range,
                                   cv=cv, n_jobs=n_jobs, scoring=scoring, show=show) 
Example #21
Source File: tpot_tests.py    From tpot with GNU Lesser General Public License v3.0
def test_init_default_scoring_3():
    """Assert that TPOT intitializes with a valid _BaseScorer."""
    with warnings.catch_warnings(record=True) as w:
        tpot_obj = TPOTClassifier(scoring=make_scorer(balanced_accuracy))
        tpot_obj._fit_init()
    assert len(w) == 0 # deap 1.2.2 warning message made this unit test fail
    assert tpot_obj.scoring_function._score_func == balanced_accuracy 
Example #22
Source File: test_optimizers.py    From lale with Apache License 2.0
def test_custom_scoring(self):
        from sklearn.metrics import f1_score, make_scorer
        lr = LogisticRegression()
        clf = Hyperopt(estimator=lr, scoring=make_scorer(f1_score, average='macro'), cv = 5, max_evals=1)
        trained = clf.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)
        predictions_1 = clf.predict(self.X_test)
        assert np.array_equal(predictions_1, predictions) 
Example #23
Source File: metrics.py    From Quadflor with BSD 3-Clause "New" or "Revised" License
def hierarchical_f_measure_scorer(graph):
    measure = partial(hierarchical_f_measure, graph)
    return make_scorer(measure) 
Example #24
Source File: tpot_tests.py    From tpot with GNU Lesser General Public License v3.0
def test_init_default_scoring_7():
    """Assert that TPOT rasies ValueError with a valid sklearn metric function from __main__."""
    def my_scorer(estimator, X, y):
        return make_scorer(balanced_accuracy)

    tpot_obj = TPOTClassifier(scoring=my_scorer)
    tpot_obj._fit_init() 
Example #25
Source File: test_optimizers.py    From lale with Apache License 2.0
def test_cv_folds_scikit(self):
        trainable_lr = LogisticRegression(n_jobs=1)
        iris = sklearn.datasets.load_iris()
        from sklearn.model_selection import cross_val_score
        from sklearn.metrics import accuracy_score, make_scorer
        from sklearn.model_selection import KFold
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cv_results = cross_val_score(
                trainable_lr, iris.data, iris.target,
                cv = KFold(2), scoring=make_scorer(accuracy_score))
        self.assertEqual(len(cv_results), 2) 
Example #26
Source File: test_optimizers.py    From lale with Apache License 2.0
def test_custom_scorer(self):
        from sklearn.metrics import f1_score, make_scorer
        pipeline = PCA() >> LogisticRegression()
        def custom_scorer(estimator, X, y, factor=0.1):
            #This is a custom scorer for demonstrating the use of kwargs
            #Just applies some factor to the accuracy
            from sklearn.metrics import accuracy_score
            predictions = estimator.predict(X)
            self.assertEqual(factor, 0.5)
            return factor*accuracy_score(y, predictions)
        clf = Hyperopt(estimator=pipeline, scoring=custom_scorer, cv = 5, max_evals=1, args_to_scorer={'factor':0.5})
        trained = clf.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)
        predictions_1 = clf.predict(self.X_test)
        assert np.array_equal(predictions_1, predictions) 
Example #27
Source File: test_core_operators.py    From lale with Apache License 2.0
def test_grid_search_on_trained_auto(self):
        from sklearn.model_selection import GridSearchCV
        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score, make_scorer
        iris = load_iris()
        X, y = iris.data, iris.target
        lr = LogisticRegression()
        trained = lr.fit(X, y)
        parameters = get_grid_search_parameter_grids(lr, num_samples=2)

        clf = GridSearchCV(trained, parameters, cv=5, scoring=make_scorer(accuracy_score)) 
Example #28
Source File: classification.py    From pyImSegm with BSD 3-Clause "New" or "Revised" License
def create_classif_search(name_clf, clf_pipeline, nb_labels,
                          search_type='random', cross_val=10,
                          eval_metric='f1', nb_iter=250, nb_workers=5):
    """ create sklearn search depending on spec. random or grid

    :param int nb_labels: number of labels
    :param str search_type: hyper-params search type
    :param str eval_metric: evaluation metric
    :param int nb_iter: for random number of tries
    :param str name_clf: name of classif.
    :param obj clf_pipeline: object
    :param obj cross_val: obj specific CV for fix train-test
    :param int nb_workers: number jobs running in parallel
    :return:
    """
    score_weight = 'weighted' if nb_labels > 2 else 'binary'
    scoring = metrics.make_scorer(DICT_SCORING[eval_metric.lower()],
                                  average=score_weight)
    if search_type == 'grid':
        clf_parameters = create_clf_param_search_grid(name_clf)
        logging.info('init Grid search...')
        clf_search = GridSearchCV(
            clf_pipeline, clf_parameters, scoring=scoring, cv=cross_val,
            n_jobs=nb_workers, verbose=1, refit=True)
    else:
        clf_parameters = create_clf_param_search_distrib(name_clf)
        nb_iter = search_params_cut_down_max_nb_iter(clf_parameters, nb_iter)
        logging.info('init Randomized search...')
        clf_search = RandomizedSearchCV(
            clf_pipeline, clf_parameters, scoring=scoring, cv=cross_val,
            n_jobs=nb_workers, n_iter=nb_iter, verbose=1, refit=True)
    return clf_search 
Example #29
Source File: mlchallenge.py    From BTB with MIT License
def __init__(self, dataset, model=None, target_column=None,
                 encode=None, tunable_hyperparameters=None, metric=None,
                 model_defaults=None, make_binary=None, stratified=None,
                 cv_splits=5, cv_random_state=42, cv_shuffle=True, metric_args={}):

        self.model = model or self.MODEL
        self.dataset = dataset or self.DATASET
        self.target_column = target_column or self.TARGET_COLUMN
        self.model_defaults = model_defaults or self.MODEL_DEFAULTS
        self.make_binary = make_binary or self.MAKE_BINARY
        self.tunable_hyperparameters = tunable_hyperparameters or self.TUNABLE_HYPERPARAMETERS

        if metric:
            self.metric = metric
            self.metric_args = metric_args

        else:
            # Allow to either write a metric method or assign a METRIC function
            self.metric = getattr(self, 'metric', self.__class__.METRIC)
            self.metric_args = getattr(self, 'metric_args', self.__class__.METRIC_ARGS)

        self.stratified = self.STRATIFIED if stratified is None else stratified
        # self.X, self.y = self.load_data()

        self.encode = self.ENCODE if encode is None else encode
        self.scorer = make_scorer(self.metric, **self.metric_args)

        if self.stratified:
            self.cv = StratifiedKFold(
                shuffle=cv_shuffle,
                n_splits=cv_splits,
                random_state=cv_random_state
            )
        else:
            self.cv = KFold(
                shuffle=cv_shuffle,
                n_splits=cv_splits,
                random_state=cv_random_state
            ) 
Example #30
Source File: listing_9_5_crossvalidate.py    From fight-churn with MIT License
def crossvalidate(data_set_path,n_test_split):

    X,y = prepare_data(data_set_path,as_retention=False)
    tscv = TimeSeriesSplit(n_splits=n_test_split)
    score_models = {'lift': make_scorer(calc_lift, needs_proba=True), 'AUC': 'roc_auc'}
    retain_reg = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True)
    test_params = {'C' : [0.64, 0.32, 0.16, 0.08, 0.04, 0.02, 0.01, 0.005, 0.0025]}
    gsearch = GridSearchCV(estimator=retain_reg,scoring=score_models, cv=tscv, verbose=1,
                           return_train_score=False,  param_grid=test_params, refit=False)
    gsearch.fit(X,y)

    result_df = pd.DataFrame(gsearch.cv_results_)
    result_df['n_weights']= test_n_weights(X,y,test_params)
    result_df.to_csv(data_set_path.replace('.csv', '_crossval.csv'), index=False)
    plot_regression_test(data_set_path,result_df)