Python lightgbm.train() Examples

The following are 30 code examples of lightgbm.train(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lightgbm, or try the search function.
Example #1
Source File: avito2.py    From MachineLearning with Apache License 2.0 9 votes vote down vote up
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict on test data.

    Returns a tuple of (test predictions, fitted Booster, evals_result dict
    holding the per-iteration validation RMSE history).
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 30,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        # BUG FIX: the LightGBM parameter is "bagging_freq"; the previous
        # "bagging_frequency" is not a recognized name/alias and was silently
        # ignored, so bagging never actually ran.
        "bagging_freq": 5,
        "bagging_seed": 2018,
        "verbosity": -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # Cap at 10000 rounds; early stopping on the validation set picks the
    # effective iteration count.
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
                      evals_result=evals_result)

    # Predict with the best (early-stopped) iteration, not the last one.
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result


# Splitting the data for model training
Example #2
Source File: automl.py    From kddcup2019-automl with MIT License 8 votes vote down vote up
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    """Tune LightGBM hyperparameters with hyperopt's TPE search.

    Splits (X, y) 50/50 into train/validation data, runs up to 150 trials of
    ``lgb.train`` with early stopping, and returns the best hyperparameter
    dict (decoded from hyperopt's index encoding via ``space_eval``).
    """
    X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # Search space; commented-out entries are alternatives kept for reference.
    space = {
        "max_depth": hp.choice("max_depth", np.arange(2, 10, 1, dtype=int)),
        # num_leaves should stay smaller than 2^(max_depth)
        "num_leaves": hp.choice("num_leaves", np.arange(4, 200, 4, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.2, 0.8, 0.1),
        # "bagging_fraction": hp.quniform("bagging_fraction", 0.2, 0.8, 0.1),
        # "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 10, 2, dtype=int)),
        # "scale_pos_weight":hp.uniform('scale_pos_weight',1.0, 10.0),
        # "colsample_by_tree":hp.uniform("colsample_bytree",0.5,1.0),
        "min_child_weight": hp.quniform('min_child_weight', 2, 50, 2),
        "reg_alpha": hp.uniform("reg_alpha", 2.0, 8.0),
        "reg_lambda": hp.uniform("reg_lambda", 2.0, 8.0),
        "learning_rate": hp.quniform("learning_rate", 0.05, 0.4, 0.01),
        # "learning_rate": hp.loguniform("learning_rate", np.log(0.04), np.log(0.5)),
        #
        "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 100, dtype=int)),
        #"is_unbalance": hp.choice("is_unbalance", [True])
    }

    def objective(hyperparams):
        # Trial-suggested values override the caller's base params.
        model = lgb.train({**params, **hyperparams}, train_data, 300,
                          valid_data, early_stopping_rounds=45, verbose_eval=0)

        score = model.best_score["valid_0"][params["metric"]]

        # hyperopt minimizes the loss, so negate a greater-is-better metric
        # (e.g. AUC) in order to maximize it.
        return {'loss': -score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=150, verbose=1,
                         rstate=np.random.RandomState(1))

    # space_eval maps hyperopt's internal index encoding back to real values.
    hyperparams = space_eval(space, best)
    log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}")
    return hyperparams
Example #3
Source File: automl.py    From Kaggler with MIT License 7 votes vote down vote up
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
    """Search LightGBM hyperparameters with hyperopt's TPE algorithm.

    Splits (X, y) into train/validation sets, minimizes the configured metric
    (sign-adjusted via ``self.loss_sign``) over ``self.space``, and returns the
    decoded best hyperparameters together with the hyperopt Trials object.
    """
    X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size, shuffle=self.shuffle)

    dtrain = lgb.Dataset(X_trn, label=y_trn)
    dvalid = lgb.Dataset(X_val, label=y_val)

    def objective(hyperparams):
        # Trial-suggested values override the instance's base params.
        merged = dict(self.params)
        merged.update(hyperparams)
        booster = lgb.train(merged, dtrain, self.n_est,
                            dvalid, early_stopping_rounds=self.n_stop, verbose_eval=0)

        # loss_sign flips greater-is-better metrics so hyperopt can minimize.
        loss = booster.best_score["valid_0"][self.metric] * self.loss_sign
        return {'loss': loss, 'status': STATUS_OK, 'model': booster}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=self.space, trials=trials,
                         algo=tpe.suggest, max_evals=n_eval, verbose=1,
                         rstate=self.random_state)

    return space_eval(self.space, best), trials
Example #4
Source File: level2.py    From kaggle-kuzushiji-2019 with MIT License 7 votes vote down vote up
def train_lgb(train_features, train_y, valid_features, valid_y, *,
              lr, num_boost_round):
    """Fit a binary LightGBM classifier with early stopping on a validation set."""
    # Key order is preserved deliberately: the dict is printed below.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': lr,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'feature_fraction': 0.9,
        'min_data_in_leaf': 20,
        'num_leaves': 41,
        'scale_pos_weight': 1.2,
        'lambda_l2': 1,
    }
    print(params)

    dtrain = lgb.Dataset(train_features, train_y)
    dvalid = lgb.Dataset(valid_features, valid_y, reference=dtrain)
    return lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=20,
        valid_sets=[dvalid],
        verbose_eval=10,
    )
Example #5
Source File: level2.py    From kaggle-kuzushiji-2019 with MIT License 6 votes vote down vote up
def train_xgb(train_features, train_y, valid_features, valid_y, *,
              eta, num_boost_round):
    """Fit an XGBoost binary classifier with early stopping on a validation set."""
    # Key order is preserved deliberately: the dict is printed below.
    params = {
        'eta': eta,
        'objective': 'binary:logistic',
        'gamma': 0.01,
        'max_depth': 8,
    }
    print(params)

    dtrain = xgb.DMatrix(train_features, label=train_y)
    dvalid = xgb.DMatrix(valid_features, label=valid_y)
    watchlist = [(dvalid, 'eval')]
    return xgb.train(
        params, dtrain, num_boost_round, watchlist,
        early_stopping_rounds=20,
        verbose_eval=10,
    )
Example #6
Source File: misc.py    From open-solution-mapping-challenge with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
    """Train the LightGBM estimator on (X, y), tracking metrics on both sets.

    Booster params come from ``self.model_config``; round count and early
    stopping from ``self.training_config``. Returns self.
    """
    train_set = lgb.Dataset(X, label=y,
                            feature_name=feature_names,
                            categorical_feature=categorical_features)
    valid_set = lgb.Dataset(X_valid, label=y_valid,
                            feature_name=feature_names,
                            categorical_feature=categorical_features)

    eval_history = {}
    self.estimator = lgb.train(
        self.model_config,
        train_set,
        valid_sets=[train_set, valid_set],
        valid_names=['train', 'valid'],
        evals_result=eval_history,
        num_boost_round=self.training_config.number_boosting_rounds,
        early_stopping_rounds=self.training_config.early_stopping_rounds,
        verbose_eval=self.model_config.verbose,
        feval=self.evaluation_function)
    return self
Example #7
Source File: lgb.py    From kaggle-plasticc with MIT License 6 votes vote down vote up
def train_and_predict(train_df, test_df, features, params):
    """Train a multiclass LightGBM model with stratified out-of-fold CV.

    ``train_df`` must hold the ``features`` columns plus "target" and
    "sample_weight"; ``params`` must include "num_class".

    Returns (oof_preds, test_preds): out-of-fold class-probability predictions
    for the training rows, and fold-averaged predictions for the test rows.
    """
    oof_preds = np.zeros((len(train_df), params["num_class"]))
    test_preds = np.zeros((len(test_df), params["num_class"]))

    # BUG FIX: scikit-learn ignores random_state when shuffle=False (and recent
    # versions raise ValueError for that combination), so the seed previously
    # had no effect. Enable shuffling so random_state=4 makes folds reproducible.
    skf = StratifiedKFold(NUM_FOLDS, shuffle=True, random_state=4)

    for train_index, val_index in skf.split(train_df, train_df["target"]):
        dev_df, val_df = train_df.iloc[train_index], train_df.iloc[val_index]
        lgb_train = lgb.Dataset(dev_df[features], dev_df["target"], weight=dev_df["sample_weight"])
        lgb_val = lgb.Dataset(val_df[features], val_df["target"], weight=val_df["sample_weight"])

        model = lgb.train(params, lgb_train, num_boost_round=200, valid_sets=[lgb_train, lgb_val],
                          early_stopping_rounds=10, verbose_eval=50)
        oof_preds[val_index, :] = model.predict(val_df[features])

        # Average test predictions across folds.
        test_preds += model.predict(test_df[features]) / NUM_FOLDS

    return oof_preds, test_preds
Example #8
Source File: avito.py    From MachineLearning with Apache License 2.0 6 votes vote down vote up
def get_oof(clf, x_train, y, x_test):
    """Compute out-of-fold train predictions and fold-averaged test predictions.

    Relies on module-level ``ntrain``, ``ntest``, ``NFOLDS`` and the fold
    iterator ``kf``. Returns both arrays reshaped to column vectors.
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (trn_idx, val_idx) in enumerate(kf):
        print('\nFold {}'.format(fold))
        clf.train(x_train[trn_idx], y[trn_idx])

        oof_train[val_idx] = clf.predict(x_train[val_idx])
        fold_test_preds[fold, :] = clf.predict(x_test)

    # Average the per-fold test predictions.
    oof_test[:] = fold_test_preds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
Example #9
Source File: models.py    From steppy-toolkit with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid):
        """Fit the LightGBM transformer on (X, y), evaluating on (X_valid, y_valid).

        Targets are validated and reformatted before being wrapped in
        lgb.Dataset objects; booster/dataset/training parameters come from the
        transformer's configuration attributes. Returns self.
        """
        self._check_target_shape_and_type(y, 'y')
        self._check_target_shape_and_type(y_valid, 'y_valid')
        # Coerce targets into the form LightGBM expects before building Datasets.
        y = self._format_target(y)
        y_valid = self._format_target(y_valid)

        logger.info('LightGBM transformer, train data shape        {}'.format(X.shape))
        logger.info('LightGBM transformer, validation data shape   {}'.format(X_valid.shape))
        logger.info('LightGBM transformer, train labels shape      {}'.format(y.shape))
        logger.info('LightGBM transformer, validation labels shape {}'.format(y_valid.shape))

        data_train = lgb.Dataset(data=X,
                                 label=y,
                                 **self.dataset_parameters)
        data_valid = lgb.Dataset(data=X_valid,
                                 label=y_valid,
                                 **self.dataset_parameters)
        self.estimator = lgb.train(params=self.booster_parameters,
                                   train_set=data_train,
                                   valid_sets=[data_train, data_valid],
                                   valid_names=['data_train', 'data_valid'],
                                   **self.training_parameters)
        return self
Example #10
Source File: test_LightGbmTreeEnsembleConverters.py    From onnxmltools with MIT License 6 votes vote down vote up
def test_lightgbm_booster_multi_classifier(self):
        """Convert a 3-class LightGBM Booster to ONNX and check its output names."""
        X = [[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]]
        X = numpy.array(X, dtype=numpy.float32)
        y = [0, 1, 0, 1, 2, 2]
        data = lightgbm.Dataset(X, label=y)
        # min_child_samples=1 lets trees actually grow on this 6-row dataset.
        model = lightgbm.train({'boosting_type': 'gbdt', 'objective': 'multiclass',
                                'n_estimators': 3, 'min_child_samples': 1, 'num_class': 3},
                               data)
        model_onnx, prefix = convert_model(model, 'tree-based classifier',
                                           [('input', FloatTensorType([None, 2]))])
        dump_data_and_model(X, model, model_onnx,
                            allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
                            basename=prefix + "BoosterBin" + model.__class__.__name__)
        try:
            from onnxruntime import InferenceSession
        except ImportError:
            # onnxruntime not installed (python 2.7); skip the runtime check.
            return
        # A converted classifier must expose both label and probability outputs.
        sess = InferenceSession(model_onnx.SerializeToString())
        out = sess.get_outputs()
        names = [o.name for o in out]
        assert names == ['label', 'probabilities']
Example #11
Source File: 04_PlanetKaggle_GPU.py    From h2o4gpu with Apache License 2.0 6 votes vote down vote up
def train_and_validate_lightgbm(params, train_features, train_labels, validation_features, num_boost_round):
    """Train one LightGBM model per label column and predict validation scores.

    Returns (validation predictions of shape [n_samples, n_classes],
    dict of per-class 'train_time'/'test_time' lists).
    """
    n_classes = train_labels.shape[1]
    y_val_pred = np.zeros((validation_features.shape[0], n_classes))
    time_results = defaultdict(list)

    for label_idx in tqdm(range(n_classes)):
        # free_raw_data=False keeps the raw matrix reusable across iterations.
        dtrain = lgb.Dataset(train_features, train_labels[:, label_idx], free_raw_data=False)

        with Timer() as timer:
            booster = lgb.train(params, dtrain, num_boost_round=num_boost_round)
        time_results['train_time'].append(timer.interval)

        with Timer() as timer:
            y_val_pred[:, label_idx] = booster.predict(validation_features)
        time_results['test_time'].append(timer.interval)

    return y_val_pred, time_results


# In[8]: 
Example #12
Source File: lgb_model.py    From autogluon with Apache License 2.0 6 votes vote down vote up
def generate_datasets(self, X_train: DataFrame, Y_train: Series, params, X_test=None, Y_test=None, dataset_train=None, dataset_val=None, save=False):
        """Build (or pass through) LightGBM train/validation Dataset objects.

        Dataset-construction keys are filtered out of ``params``; feature
        frames are preprocessed before dataset construction. Pre-built
        ``dataset_train``/``dataset_val`` objects are returned untouched.
        Returns (dataset_train, dataset_val); dataset_val may be None when no
        validation data is available.
        """
        lgb_dataset_params_keys = ['objective', 'two_round', 'num_threads', 'num_classes', 'verbose']  # Keys that are specific to lightGBM Dataset object construction.
        data_params = {key: params[key] for key in lgb_dataset_params_keys if key in params}.copy()

        W_train = None  # TODO: Add weight support
        W_test = None  # TODO: Add weight support
        if X_train is not None:
            X_train = self.preprocess(X_train, is_train=True)
        if X_test is not None:
            X_test = self.preprocess(X_test)
        # TODO: Try creating multiple Datasets for subsets of features, then combining with Dataset.add_features_from(), this might avoid memory spike
        if not dataset_train:
            # X_train, W_train = self.convert_to_weight(X=X_train)
            dataset_train = construct_dataset(x=X_train, y=Y_train, location=f'{self.path}datasets{os.path.sep}train', params=data_params, save=save, weight=W_train)
            # dataset_train = construct_dataset_lowest_memory(X=X_train, y=Y_train, location=self.path + 'datasets/train', params=data_params)
        if (not dataset_val) and (X_test is not None) and (Y_test is not None):
            # X_test, W_test = self.convert_to_weight(X=X_test)
            # The validation Dataset references the train Dataset so bin mappings match.
            dataset_val = construct_dataset(x=X_test, y=Y_test, location=f'{self.path}datasets{os.path.sep}val', reference=dataset_train, params=data_params, save=save, weight=W_test)
            # dataset_val = construct_dataset_lowest_memory(X=X_test, y=Y_test, location=self.path + 'datasets/val', reference=dataset_train, params=data_params)
        return dataset_train, dataset_val
Example #13
Source File: optimize.py    From optuna with MIT License 6 votes vote down vote up
def __call__(self, trial: optuna.trial.Trial) -> float:
        """Run one Optuna trial: train a booster and return its validation score.

        Side effects: optionally pickles the trained booster to
        ``self.model_dir`` and updates the best score/booster tracked on the
        instance.
        """

        self._preprocess(trial)

        start_time = time.time()
        booster = lgb.train(self.lgbm_params, self.train_set, **self.lgbm_kwargs)

        val_score = self._get_booster_best_score(booster)
        elapsed_secs = time.time() - start_time
        average_iteration_time = elapsed_secs / booster.current_iteration()

        if self.model_dir is not None:
            # Persist the booster so the best model can be recovered later.
            path = os.path.join(self.model_dir, "{}.pkl".format(trial.number))
            with open(path, "wb") as fout:
                pickle.dump(booster, fout)
            _logger.info("The booster of trial#{} was saved as {}.".format(trial.number, path))

        # Direction (min vs max) is encapsulated by compare_validation_metrics.
        if self.compare_validation_metrics(val_score, self.best_score):
            self.best_score = val_score
            self.best_booster_with_trial_number = (booster, trial.number)

        self._postprocess(trial, elapsed_secs, average_iteration_time)

        return val_score
Example #14
Source File: test_lightgbm.py    From docker-python with Apache License 2.0 6 votes vote down vote up
def test_cpu(self):
    """Train a single boosting round on CPU and verify early-stopping bookkeeping."""
    dtrain = lgb.Dataset('/input/tests/data/lgb_train.bin')
    dvalid = lgb.Dataset('/input/tests/data/lgb_test.bin', reference=dtrain)

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    # A single round keeps the test fast.
    booster = lgb.train(params, dtrain,
                        num_boost_round=1,
                        valid_sets=dvalid,
                        early_stopping_rounds=1)

    self.assertEqual(1, booster.best_iteration)
Example #15
Source File: test_lightgbm.py    From docker-python with Apache License 2.0 6 votes vote down vote up
def test_gpu(self):
    """Train a single boosting round on GPU and verify early-stopping bookkeeping."""
    dtrain = lgb.Dataset('/input/tests/data/lgb_train.bin')
    dvalid = lgb.Dataset('/input/tests/data/lgb_test.bin', reference=dtrain)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'auc',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 1,
        'device': 'gpu'
    }

    # A single round keeps the test fast.
    booster = lgb.train(params, dtrain,
                        num_boost_round=1,
                        valid_sets=dvalid,
                        early_stopping_rounds=1)

    self.assertEqual(1, booster.best_iteration)
Example #16
Source File: test_lightgbm.py    From optuna with MIT License 6 votes vote down vote up
def objective(
    trial, metric="binary_error", valid_name="valid_0", force_default_valid_names=False, cv=False
):
    # type: (optuna.trial.Trial, str, str, bool, bool) -> float
    """Train (or cross-validate) a tiny LightGBM model with a pruning callback."""
    dtrain = lgb.Dataset([[1.0], [2.0], [3.0]], label=[1.0, 0.0, 1.0])
    dtest = lgb.Dataset([[1.0]], label=[1.0])

    # None makes LightGBM fall back to its default dataset names.
    valid_names = None if force_default_valid_names else [valid_name]

    booster_params = {"objective": "binary", "metric": ["auc", "binary_error"]}
    pruning_callback = LightGBMPruningCallback(trial, metric, valid_name=valid_name)
    if cv:
        lgb.cv(
            booster_params,
            dtrain,
            1,
            verbose_eval=False,
            nfold=2,
            callbacks=[pruning_callback],
        )
    else:
        lgb.train(
            booster_params,
            dtrain,
            1,
            valid_sets=[dtest],
            valid_names=valid_names,
            verbose_eval=False,
            callbacks=[pruning_callback],
        )
    return 1.0
Example #17
Source File: lightgbm.py    From talkingdata-adtracking-fraud-detection with MIT License 6 votes vote down vote up
def train_and_predict(self, train, valid, weight, categorical_features: List[str], target: str, params: dict) \
            -> Tuple[Booster, dict]:
        """Train a LightGBM Booster on ``train`` and evaluate on ``valid``.

        Both inputs must be pandas DataFrames with identical column lists;
        every column except ``target`` is used as a predictor. ``weight``
        (optional) provides per-row training weights. Returns the trained
        Booster and the per-iteration evaluation results.
        """
        if type(train) != pd.DataFrame or type(valid) != pd.DataFrame:
            raise ValueError('Parameter train and valid must be pandas.DataFrame')

        if list(train.columns) != list(valid.columns):
            raise ValueError('Train and valid must have a same column list')

        predictors = train.columns.drop(target)
        if weight is None:
            d_train = lgb.Dataset(train[predictors], label=train[target].values)
        else:
            print(weight)
            d_train = lgb.Dataset(train[predictors], label=train[target].values, weight=weight)
        d_valid = lgb.Dataset(valid[predictors], label=valid[target].values)

        eval_results = {}
        # Booster params and train kwargs are split under 'model_params' /
        # 'train_params' in the params dict.
        model: Booster = lgb.train(params['model_params'],
                                   d_train,
                                   categorical_feature=categorical_features,
                                   valid_sets=[d_train, d_valid],
                                   valid_names=['train', 'valid'],
                                   evals_result=eval_results,
                                   **params['train_params'])
        return model, eval_results
Example #18
Source File: lightgbm_simple.py    From optuna with MIT License 6 votes vote down vote up
def objective(trial):
    """Optuna objective: accuracy of a trial-tuned LightGBM binary classifier."""
    data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)

    # Search space: regularization, tree shape, bagging and leaf constraints.
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    booster = lgb.train(param, dtrain)
    predictions = booster.predict(valid_x)
    predicted_labels = np.rint(predictions)
    return sklearn.metrics.accuracy_score(valid_y, predicted_labels)
Example #19
Source File: test_lightgbm_autolog.py    From mlflow with Apache License 2.0 6 votes vote down vote up
def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_params, train_set):
    """Autologging records a per-dataset, per-metric history matching evals_result."""
    mlflow.lightgbm.autolog()
    evals_result = {}
    # bst_params entries take precedence over the default metric list.
    params = {'metric': ['multi_error', 'multi_logloss'], **bst_params}
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ['train', 'valid']
    lgb.train(params, train_set, num_boost_round=10, valid_sets=valid_sets,
              valid_names=valid_names, evals_result=evals_result)

    run = get_latest_run()
    client = mlflow.tracking.MlflowClient()
    for valid_name in valid_names:
        for metric_name in params['metric']:
            # Metric keys are logged as '<dataset name>-<metric name>'.
            metric_key = '{}-{}'.format(valid_name, metric_name)
            history = [entry.value for entry
                       in client.get_metric_history(run.info.run_id, metric_key)]
            assert metric_key in run.data.metrics
            assert len(history) == 10
            assert history == evals_result[valid_name][metric_name]
Example #20
Source File: test_lightgbm_autolog.py    From mlflow with Apache License 2.0 6 votes vote down vote up
def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set):
    """Autologging records a full metric history for each configured metric."""
    mlflow.lightgbm.autolog()
    evals_result = {}
    # bst_params entries take precedence over the default metric list.
    params = {'metric': ['multi_error', 'multi_logloss']}
    params.update(bst_params)
    valid_sets = [train_set]
    valid_names = ['train']
    lgb.train(params, train_set, num_boost_round=10, valid_sets=valid_sets,
              valid_names=valid_names, evals_result=evals_result)
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    for metric_name in params['metric']:
        # Metric keys are logged as '<dataset name>-<metric name>'.
        metric_key = '{}-{}'.format(valid_names[0], metric_name)
        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
        assert metric_key in data.metrics
        assert len(metric_history) == 10
        assert metric_history == evals_result['train'][metric_name]
Example #21
Source File: test_lightgbm_autolog.py    From mlflow with Apache License 2.0 6 votes vote down vote up
def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set):
    """Autologging records a metric history per validation dataset."""
    mlflow.lightgbm.autolog()
    evals_result = {}
    # If we use [train_set, train_set] here, LightGBM ignores the first dataset.
    # To avoid that, create a new Dataset object.
    valid_sets = [train_set, lgb.Dataset(train_set.data)]
    valid_names = ['train', 'valid']
    lgb.train(bst_params, train_set, num_boost_round=10, valid_sets=valid_sets,
              valid_names=valid_names, evals_result=evals_result)
    run = get_latest_run()
    data = run.data
    client = mlflow.tracking.MlflowClient()
    for valid_name in valid_names:
        # Metric keys are logged as '<dataset name>-<metric name>'.
        metric_key = '{}-multi_logloss'.format(valid_name)
        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
        assert metric_key in data.metrics
        assert len(metric_history) == 10
        assert metric_history == evals_result[valid_name]['multi_logloss']
Example #22
Source File: test_lightgbm_autolog.py    From mlflow with Apache License 2.0 6 votes vote down vote up
def test_lgb_autolog_logs_default_params(bst_params, train_set):
    """Autologging records lgb.train's default keyword values as run params."""
    mlflow.lightgbm.autolog()
    lgb.train(bst_params, train_set)
    run = get_latest_run()
    logged = run.data.params

    # bst_params entries take precedence over the documented defaults.
    expected = {
        'num_boost_round': 100,
        'feature_name': 'auto',
        'categorical_feature': 'auto',
        'verbose_eval': True,
        'keep_training_booster': False,
        **bst_params,
    }

    for key, val in expected.items():
        assert key in logged
        # mlflow stringifies param values before logging.
        assert logged[key] == str(val)

    # Arguments that carry data or callables must never be logged as params.
    for name in ['params', 'train_set', 'valid_sets', 'valid_names', 'fobj', 'feval',
                 'init_model', 'evals_result', 'learning_rates', 'callbacks']:
        assert name not in logged
Example #23
Source File: models.py    From open-solution-data-science-bowl-2018 with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
        """Train the LightGBM estimator, tracking metrics on train and validation sets.

        Booster parameters come from ``self.model_params``; round count and
        early stopping from ``self.training_params``. Returns self.
        """
        train = lgb.Dataset(X, label=y,
                            feature_name=feature_names,
                            categorical_feature=categorical_features
                            )
        valid = lgb.Dataset(X_valid, label=y_valid,
                            feature_name=feature_names,
                            categorical_feature=categorical_features
                            )

        evaluation_results = {}
        self.estimator = lgb.train(self.model_params,
                                   train,
                                   valid_sets=[train, valid],
                                   valid_names=['train', 'valid'],
                                   evals_result=evaluation_results,
                                   num_boost_round=self.training_params.number_boosting_rounds,
                                   early_stopping_rounds=self.training_params.early_stopping_rounds,
                                   verbose_eval=10,
                                   feval=self.evaluation_function)
        return self
Example #24
Source File: misc.py    From open-solution-data-science-bowl-2018 with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
        """Train the LightGBM estimator, tracking metrics on train and validation sets.

        Booster parameters come from ``self.model_config``; round count, early
        stopping and verbosity from ``self.training_config``. Returns self.
        """
        train = lgb.Dataset(X, label=y,
                            feature_name=feature_names,
                            categorical_feature=categorical_features
                            )
        valid = lgb.Dataset(X_valid, label=y_valid,
                            feature_name=feature_names,
                            categorical_feature=categorical_features
                            )

        evaluation_results = {}
        self.estimator = lgb.train(self.model_config,
                                   train, valid_sets=[train, valid], valid_names=['train', 'valid'],
                                   evals_result=evaluation_results,
                                   num_boost_round=self.training_config.number_boosting_rounds,
                                   early_stopping_rounds=self.training_config.early_stopping_rounds,
                                   verbose_eval=self.model_config.verbose,
                                   feval=self.evaluation_function)
        return self
Example #25
Source File: main.py    From nni with MIT License 6 votes vote down vote up
def load_data(train_path='./data/regression.train', test_path='./data/regression.test'):
    '''
    Read tab-separated regression data and build LightGBM train/eval datasets.

    Column 0 is the target; the remaining columns are features. The last 10%
    of the training rows are held out as the evaluation set.
    Returns (lgb_train, lgb_eval, X_test, y_test).
    '''
    print('Load data...')
    df_train = pd.read_csv(train_path, header=None, sep='\t')
    df_test = pd.read_csv(test_path, header=None, sep='\t')
    split_num = int(0.9 * len(df_train))

    y_all = df_train[0].values
    X_all = df_train.drop(0, axis=1).values
    y_train, y_eval = y_all[:split_num], y_all[split_num:]
    X_train, X_eval = X_all[:split_num, :], X_all[split_num:, :]

    y_test = df_test[0].values
    X_test = df_test.drop(0, axis=1).values

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    return lgb_train, lgb_eval, X_test, y_test
Example #26
Source File: main.py    From nni with MIT License 6 votes vote down vote up
def run(lgb_train, lgb_eval, params, X_test, y_test):
    """Train with tuned params, compute test RMSE and report it to NNI."""
    print('Start training...')

    # NNI delivers numeric params as floats; LightGBM requires an int here.
    params['num_leaves'] = int(params['num_leaves'])

    booster = lgb.train(params,
                        lgb_train,
                        num_boost_round=20,
                        valid_sets=lgb_eval,
                        early_stopping_rounds=5)

    print('Start predicting...')
    # Predict with the early-stopped best iteration.
    y_pred = booster.predict(X_test, num_iteration=booster.best_iteration)

    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print('The rmse of prediction is:', rmse)

    nni.report_final_result(rmse)
Example #27
Source File: lightgbm_example.py    From ray with Apache License 2.0 6 votes vote down vote up
def train_breast_cancer(config):
    """Ray Tune trainable: fit LightGBM on breast-cancer data, report accuracy."""
    data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)
    dtest = lgb.Dataset(test_x, label=test_y)

    booster = lgb.train(
        config,
        dtrain,
        valid_sets=[dtest],
        verbose_eval=False,
        callbacks=[LightGBMCallback])

    # Round predicted probabilities to hard 0/1 labels.
    predicted_labels = np.rint(booster.predict(test_x))
    tune.report(
        mean_accuracy=sklearn.metrics.accuracy_score(test_y, predicted_labels),
        done=True)
Example #28
Source File: models.py    From open-solution-home-credit with MIT License 6 votes vote down vote up
def fit(self,
            X, y,
            X_valid, y_valid,
            feature_names=None,
            feature_types=None,
            **kwargs):
        """Train the XGBoost estimator, evaluating on train and validation sets.

        Booster parameters come from ``self.model_config``; round count, early
        stopping and verbosity from ``self.training_config``. Returns self.
        """
        train = xgb.DMatrix(X,
                            label=y,
                            feature_names=feature_names,
                            feature_types=feature_types)
        valid = xgb.DMatrix(X_valid,
                            label=y_valid,
                            feature_names=feature_names,
                            feature_types=feature_types)

        evaluation_results = {}
        self.estimator = xgb.train(params=self.model_config,
                                   dtrain=train,
                                   evals=[(train, 'train'), (valid, 'valid')],
                                   evals_result=evaluation_results,
                                   num_boost_round=self.training_config.nrounds,
                                   early_stopping_rounds=self.training_config.early_stopping_rounds,
                                   verbose_eval=self.model_config.verbose,
                                   feval=self.evaluation_function)
        return self
Example #29
Source File: models.py    From open-solution-home-credit with MIT License 6 votes vote down vote up
def fit(self,
        X,
        y,
        X_valid,
        y_valid,
        feature_names=None,
        categorical_features=None,
        **kwargs):
    """Train the CatBoost estimator on (X, y), evaluating on (X_valid, y_valid).

    Categorical feature names are translated to column indices before being
    handed to CatBoost. Returns self.
    """
    logger.info('Catboost, train data shape        {}'.format(X.shape))
    logger.info('Catboost, validation data shape   {}'.format(X_valid.shape))
    logger.info('Catboost, train labels shape      {}'.format(y.shape))
    logger.info('Catboost, validation labels shape {}'.format(y_valid.shape))

    cat_indices = self._get_categorical_indeces(feature_names, categorical_features)
    self.estimator.fit(X, y, eval_set=(X_valid, y_valid), cat_features=cat_indices)
    return self
Example #30
Source File: models.py    From open-solution-mapping-challenge with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
        """Train the LightGBM estimator, tracking metrics on train and validation sets.

        Booster parameters come from ``self.model_params``; round count and
        early stopping from ``self.training_params``. Returns self.
        """
        train = lgb.Dataset(X, label=y,
                            feature_name=feature_names,
                            categorical_feature=categorical_features
                            )
        valid = lgb.Dataset(X_valid, label=y_valid,
                            feature_name=feature_names,
                            categorical_feature=categorical_features
                            )

        evaluation_results = {}
        self.estimator = lgb.train(self.model_params,
                                   train,
                                   valid_sets=[train, valid],
                                   valid_names=['train', 'valid'],
                                   evals_result=evaluation_results,
                                   num_boost_round=self.training_params.number_boosting_rounds,
                                   early_stopping_rounds=self.training_params.early_stopping_rounds,
                                   verbose_eval=10,
                                   feval=self.evaluation_function)
        return self