Python lightgbm.Dataset() Examples

The following are 30 code examples of lightgbm.Dataset(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lightgbm, or try the search function.
Example #1
Source File: avito2.py    From MachineLearning with Apache License 2.0 9 votes vote down vote up
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict on test data.

    Args:
        train_X: Training features.
        train_y: Training targets.
        val_X: Validation features, used for early stopping.
        val_y: Validation targets.
        test_X: Features to predict on.

    Returns:
        Tuple ``(pred_test_y, model, evals_result)``: the test predictions made
        at the best iteration, the trained booster, and the per-round
        evaluation history filled in by ``lgb.train``.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 30,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        # The LightGBM parameter is "bagging_freq"; "bagging_frequency" is not
        # a recognised alias, so bagging_fraction was silently never applied.
        "bagging_freq": 5,
        "bagging_seed": 2018,
        "verbosity": -1,
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # Up to 10000 rounds; stop once the validation RMSE has not improved for
    # 100 rounds, logging every 20 rounds.
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
                      evals_result=evals_result)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result


# Splitting the data for model training
Example #2
Source File: automl.py    From kddcup2019-automl with MIT License 8 votes vote down vote up
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    """Search LightGBM hyperparameters with hyperopt's TPE algorithm.

    The data is split in half into a training and a validation part. Each of
    the 150 candidate configurations is trained for at most 300 rounds (early
    stopping after 45 rounds), and the candidate with the highest validation
    ``params["metric"]`` wins.

    Args:
        X: Feature frame.
        y: Target series.
        params: Fixed LightGBM parameters; must contain the key ``"metric"``.
        config: Not used by this function.

    Returns:
        The best hyperparameter dict, as resolved by ``space_eval``.
    """
    X_train, X_val, y_train, y_val = data_split(X, y, test_size=0.5)
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_val, label=y_val)

    # Not searched at present: bagging_fraction, bagging_freq,
    # scale_pos_weight, colsample_bytree, is_unbalance and a log-uniform
    # learning_rate.
    search_space = {
        "max_depth": hp.choice("max_depth", np.arange(2, 10, 1, dtype=int)),
        # Original note: should be smaller than 2^(max_depth); not enforced here.
        "num_leaves": hp.choice("num_leaves", np.arange(4, 200, 4, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.2, 0.8, 0.1),
        "min_child_weight": hp.quniform('min_child_weight', 2, 50, 2),
        "reg_alpha": hp.uniform("reg_alpha", 2.0, 8.0),
        "reg_lambda": hp.uniform("reg_lambda", 2.0, 8.0),
        "learning_rate": hp.quniform("learning_rate", 0.05, 0.4, 0.01),
        "min_data_in_leaf": hp.choice('min_data_in_leaf', np.arange(200, 2000, 100, dtype=int)),
    }

    def negated_validation_score(candidate):
        booster = lgb.train({**params, **candidate}, dtrain, 300,
                            dvalid, early_stopping_rounds=45, verbose_eval=0)
        validation_score = booster.best_score["valid_0"][params["metric"]]
        # hyperopt minimises the loss, so the score is negated to maximise it.
        return {'loss': -validation_score, 'status': STATUS_OK}

    history = Trials()
    best = hyperopt.fmin(fn=negated_validation_score, space=search_space,
                         trials=history, algo=tpe.suggest, max_evals=150,
                         verbose=1, rstate=np.random.RandomState(1))

    chosen = space_eval(search_space, best)
    log(f"auc = {-history.best_trial['result']['loss']:0.4f} {chosen}")
    return chosen
Example #3
Source File: automl.py    From Kaggler with MIT License 7 votes vote down vote up
def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
        """Tune hyperparameters with hyperopt on a single train/validation split.

        Args:
            X: Features.
            y: Targets.
            test_size: Share of the data held out for validation.
            n_eval: Number of hyperopt evaluations.

        Returns:
            Tuple ``(hyperparams, trials)``: the best configuration resolved
            through ``space_eval`` and the hyperopt ``Trials`` object.
        """
        X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size, shuffle=self.shuffle)
        dtrain = lgb.Dataset(X_trn, label=y_trn)
        dvalid = lgb.Dataset(X_val, label=y_val)

        def evaluate(candidate):
            booster = lgb.train({**self.params, **candidate}, dtrain, self.n_est,
                                dvalid, early_stopping_rounds=self.n_stop, verbose_eval=0)
            # loss_sign turns the metric into something hyperopt can minimise.
            loss = booster.best_score["valid_0"][self.metric] * self.loss_sign
            return {'loss': loss, 'status': STATUS_OK, 'model': booster}

        trials = Trials()
        best = hyperopt.fmin(fn=evaluate, space=self.space, trials=trials,
                             algo=tpe.suggest, max_evals=n_eval, verbose=1,
                             rstate=self.random_state)

        return space_eval(self.space, best), trials
Example #4
Source File: level2.py    From kaggle-kuzushiji-2019 with MIT License 7 votes vote down vote up
def train_lgb(train_features, train_y, valid_features, valid_y, *,
              lr, num_boost_round):
    """Train a binary LightGBM classifier with early stopping.

    Args:
        train_features: Training features.
        train_y: Training labels.
        valid_features: Validation features.
        valid_y: Validation labels.
        lr: Learning rate (keyword-only).
        num_boost_round: Maximum number of boosting rounds (keyword-only).

    Returns:
        The booster returned by ``lgb.train``.
    """
    dtrain = lgb.Dataset(train_features, train_y)
    dvalid = lgb.Dataset(valid_features, valid_y, reference=dtrain)
    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "learning_rate": lr,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "feature_fraction": 0.9,
        "min_data_in_leaf": 20,
        "num_leaves": 41,
        "scale_pos_weight": 1.2,
        "lambda_l2": 1,
    }
    print(params)
    # Stop after 20 rounds without validation improvement; log every 10 rounds.
    booster = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=20,
        valid_sets=[dvalid],
        verbose_eval=10,
    )
    return booster
Example #5
Source File: optimize.py    From optuna with MIT License 6 votes vote down vote up
def _get_booster_best_score(self, booster: "lgb.Booster") -> float:

        metric = self._get_metric_for_objective()
        valid_sets = self.lgbm_kwargs.get("valid_sets")  # type: Optional[VALID_SET_TYPE]

        if self.lgbm_kwargs.get("valid_names") is not None:
            if type(self.lgbm_kwargs["valid_names"]) is str:
                valid_name = self.lgbm_kwargs["valid_names"]
            elif type(self.lgbm_kwargs["valid_names"]) in [list, tuple]:
                valid_name = self.lgbm_kwargs["valid_names"][-1]
            else:
                raise NotImplementedError

        elif type(valid_sets) is lgb.Dataset:
            valid_name = "valid_0"

        elif isinstance(valid_sets, (list, tuple)) and len(valid_sets) > 0:
            valid_set_idx = len(valid_sets) - 1
            valid_name = "valid_{}".format(valid_set_idx)

        else:
            raise NotImplementedError

        val_score = booster.best_score[valid_name][metric]
        return val_score 
Example #6
Source File: test_lightgbm.py    From docker-python with Apache License 2.0 6 votes vote down vote up
def test_gpu(self):
        """Train a single boosting round on the GPU device and check it is the best iteration."""
        train_set = lgb.Dataset('/input/tests/data/lgb_train.bin')
        eval_set = lgb.Dataset('/input/tests/data/lgb_test.bin', reference=train_set)

        gpu_params = dict(
            boosting_type='gbdt',
            objective='regression',
            metric='auc',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=1,
            device='gpu',
        )

        # A single round keeps the test fast.
        booster = lgb.train(gpu_params, train_set, num_boost_round=1,
                            valid_sets=eval_set, early_stopping_rounds=1)

        self.assertEqual(1, booster.best_iteration)
Example #7
Source File: lightgbm.py    From talkingdata-adtracking-fraud-detection with MIT License 6 votes vote down vote up
def train_and_predict(self, train, valid, weight, categorical_features: List[str], target: str, params: dict) \
            -> Tuple[Booster, dict]:
        """Train a LightGBM model on ``train`` while evaluating on ``train`` and ``valid``.

        Args:
            train: Training frame, including the ``target`` column.
            valid: Validation frame with the same columns as ``train``.
            weight: Optional per-row training weights; ``None`` for unweighted.
            categorical_features: Names of the categorical feature columns.
            target: Name of the label column.
            params: Dict with ``'model_params'`` (LightGBM parameters) and
                ``'train_params'`` (extra keyword arguments for ``lgb.train``).

        Returns:
            Tuple ``(model, eval_results)``: the trained booster and the
            evaluation history recorded under the names 'train' and 'valid'.

        Raises:
            ValueError: If either input is not a DataFrame, or the two frames
                do not have the same column list.
        """
        # isinstance also accepts DataFrame subclasses, which an exact type
        # comparison wrongly rejected.
        if not isinstance(train, pd.DataFrame) or not isinstance(valid, pd.DataFrame):
            raise ValueError('Parameter train and valid must be pandas.DataFrame')

        if list(train.columns) != list(valid.columns):
            raise ValueError('Train and valid must have a same column list')

        predictors = train.columns.drop(target)
        if weight is not None:
            print(weight)
        # weight=None is the Dataset default, so one call covers both cases.
        d_train = lgb.Dataset(train[predictors], label=train[target].values, weight=weight)
        d_valid = lgb.Dataset(valid[predictors], label=valid[target].values)

        eval_results = {}
        model: Booster = lgb.train(params['model_params'],
                                   d_train,
                                   categorical_feature=categorical_features,
                                   valid_sets=[d_train, d_valid],
                                   valid_names=['train', 'valid'],
                                   evals_result=eval_results,
                                   **params['train_params'])
        return model, eval_results
Example #8
Source File: lgb_tune.py    From ml-parameter-optimization with MIT License 6 votes vote down vote up
def get_n_estimators(self):
        """
        Estimate the optimal number of estimators using CV on the training set.

        Runs ``lgb.cv`` with early stopping and stores the number of rounds
        kept, scaled by ``1 / (1 - 1 / cv_folds)``, in
        ``self._params['n_estimators']``.

        Returns
        -------
        self

        Raises
        ------
        KeyError
            If ``lgb.cv`` returns no ``*-mean`` evaluation series, or if no
            callable ``feval`` is configured and ``params_cv`` has no
            ``'metrics'`` entry.
        """
        # Translate wrapper parameter names to LightGBM names where a mapping exists.
        lgb_param = {
            (self._dict_map[key] if key in self._dict_map else key): value
            for key, value in self._params.items()
        }

        # weight=None is the Dataset default, so one call covers both cases.
        sample_weight = self.get_label_weights() if self.balance_class else None
        lgb_train = lgb.Dataset(self.X, label=self.y, weight=sample_weight)

        kwargs_cv = {'num_boost_round': self.params['n_estimators'],
                     'nfold': self.params_cv['cv_folds'],
                     'early_stopping_rounds': self.params_cv['early_stopping_rounds'],
                     'stratified': self.params_cv['stratified']}

        # Use the custom evaluation function when one is given, otherwise the
        # configured built-in metrics.
        try:
            feval = self.params_cv['feval']
        except KeyError:
            feval = None
        if callable(feval):
            kwargs_cv['feval'] = feval
        else:
            kwargs_cv['metrics'] = self.params_cv['metrics']

        if isinstance(self.categorical_feature, list):
            kwargs_cv['categorical_feature'] = self.categorical_feature
        else:
            kwargs_cv['categorical_feature'] = 'auto'

        cvresult = lgb.cv(lgb_param, lgb_train, **kwargs_cv)

        # Every series in the CV history has one entry per retained round, so
        # any '-mean' series gives the round count. Looking it up via
        # kwargs_cv['metrics'] failed whenever a custom feval was used,
        # because 'metrics' is not set in that case.
        mean_histories = [history for name, history in cvresult.items()
                          if name.endswith('-mean')]
        if not mean_histories:
            raise KeyError('lgb.cv returned no "-mean" evaluation history')
        n_rounds = len(mean_histories[0])

        self._params['n_estimators'] = int(n_rounds / (1 - 1 / self.params_cv['cv_folds']))
        return self
Example #9
Source File: models.py    From steppy-toolkit with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid):
        """Train the LightGBM estimator, evaluating on both training and validation data.

        Both targets are checked with ``_check_target_shape_and_type`` and
        converted with ``_format_target`` before the datasets are built.

        Returns:
            self, with the trained booster stored in ``self.estimator``.
        """
        self._check_target_shape_and_type(y, 'y')
        self._check_target_shape_and_type(y_valid, 'y_valid')
        y, y_valid = self._format_target(y), self._format_target(y_valid)

        shape_reports = (
            ('LightGBM transformer, train data shape        {}', X),
            ('LightGBM transformer, validation data shape   {}', X_valid),
            ('LightGBM transformer, train labels shape      {}', y),
            ('LightGBM transformer, validation labels shape {}', y_valid),
        )
        for template, array in shape_reports:
            logger.info(template.format(array.shape))

        data_train = lgb.Dataset(data=X, label=y, **self.dataset_parameters)
        data_valid = lgb.Dataset(data=X_valid, label=y_valid, **self.dataset_parameters)

        watched = {'data_train': data_train, 'data_valid': data_valid}
        self.estimator = lgb.train(params=self.booster_parameters,
                                   train_set=data_train,
                                   valid_sets=list(watched.values()),
                                   valid_names=list(watched.keys()),
                                   **self.training_parameters)
        return self
Example #10
Source File: test_lightgbm.py    From docker-python with Apache License 2.0 6 votes vote down vote up
def test_cpu(self):
        """Train a single boosting round on the CPU and check it is the best iteration."""
        train_set = lgb.Dataset('/input/tests/data/lgb_train.bin')
        eval_set = lgb.Dataset('/input/tests/data/lgb_test.bin', reference=train_set)

        cpu_params = dict(
            task='train',
            boosting_type='gbdt',
            objective='regression',
            metric={'l2', 'auc'},
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            verbose=0,
        )

        # A single round keeps the test fast.
        booster = lgb.train(cpu_params, train_set, num_boost_round=1,
                            valid_sets=eval_set, early_stopping_rounds=1)

        self.assertEqual(1, booster.best_iteration)
Example #11
Source File: lgb_utils.py    From autogluon with Apache License 2.0 6 votes vote down vote up
def construct_dataset(x: DataFrame, y: Series, location=None, reference=None, params=None, save=False, weight=None):
    """Build a ``lightgbm.Dataset`` and optionally save it in binary form.

    Args:
        x: Feature data.
        y: Labels.
        location: Path prefix for the binary file; ``'.bin'`` is appended.
            Required when ``save`` is true.
        reference: Passed through to ``lgb.Dataset`` as ``reference``.
        params: Passed through to ``lgb.Dataset`` as ``params``.
        save: When true, write the dataset to ``f'{location}.bin'``, replacing
            any existing file at that path.
        weight: Passed through to ``lgb.Dataset`` as ``weight``.

    Returns:
        The constructed ``lgb.Dataset``.
    """
    try_import_lightgbm()
    import lightgbm as lgb

    dataset = lgb.Dataset(data=x, label=y, reference=reference, free_raw_data=True, params=params, weight=weight)

    if save:
        assert location is not None, 'location must be given when save=True'
        saving_path = f'{location}.bin'
        if os.path.exists(saving_path):
            os.remove(saving_path)

        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises, so only create the parent directory when there is one.
        parent_dir = os.path.dirname(saving_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        dataset.save_binary(saving_path)

    return dataset
Example #12
Source File: test_LightGbmTreeEnsembleConverters.py    From onnxmltools with MIT License 6 votes vote down vote up
def test_lightgbm_booster_multi_classifier(self):
        """Convert a 3-class LightGBM booster to ONNX and check the output names."""
        X = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]],
                        dtype=numpy.float32)
        y = [0, 1, 0, 1, 2, 2]
        train_params = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'n_estimators': 3,
            'min_child_samples': 1,
            'num_class': 3,
        }
        model = lightgbm.train(train_params, lightgbm.Dataset(X, label=y))
        model_onnx, prefix = convert_model(model, 'tree-based classifier',
                                           [('input', FloatTensorType([None, 2]))])
        dump_data_and_model(X, model, model_onnx,
                            allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
                            basename=prefix + "BoosterBin" + model.__class__.__name__)
        try:
            from onnxruntime import InferenceSession
        except ImportError:
            # onnxruntime is not installed (python 2.7): skip the runtime check.
            return
        session = InferenceSession(model_onnx.SerializeToString())
        names = [output.name for output in session.get_outputs()]
        assert names == ['label', 'probabilities']
Example #13
Source File: test_lightgbm.py    From h2o4gpu with Apache License 2.0 6 votes vote down vote up
def test_lightgbm_cpu(booster):
    """Run ``lgb.cv`` on synthetic regression data with the given boosting type."""
    import numpy as np
    import pandas as pd
    # Imported exactly as in the original; the names are not referenced below.
    from h2o4gpu.util.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
    import lightgbm as lgb

    n_levels, n_repeats = 10, 1000
    X1 = np.repeat(np.arange(n_levels), n_repeats)
    X2 = np.repeat(np.arange(n_levels), n_repeats)
    np.random.shuffle(X2)

    # Product of two noisy features; the draws are made in the same order as
    # the single-expression form.
    noisy_x1 = X1 + np.random.randn(10000)
    noisy_x2 = X2 + np.random.randn(10000)
    y = noisy_x1 * noisy_x2
    data = pd.DataFrame({'y': y, 'X1': X1, 'X2': X2})

    lgb_params = dict([
        ('learning_rate', 0.1),
        ('boosting', booster),
        ('objective', 'regression'),
        ('metric', 'rmse'),
        ('feature_fraction', 0.9),
        ('bagging_fraction', 0.75),
        ('num_leaves', 31),
        ('bagging_freq', 1),
        ('min_data_per_leaf', 250),
    ])
    lgb_train = lgb.Dataset(data=data[['X1', 'X2']], label=data.y)
    cv = lgb.cv(lgb_params, lgb_train, num_boost_round=100,
                early_stopping_rounds=15, stratified=False, verbose_eval=50)
Example #14
Source File: test_lightgbm.py    From h2o4gpu with Apache License 2.0 6 votes vote down vote up
def test_lightgbm_gpu(booster):
    """Run ``lgb.cv`` on synthetic regression data on GPU device 0 with the given boosting type."""
    import numpy as np
    import pandas as pd
    # Imported exactly as in the original; the names are not referenced below.
    from h2o4gpu.util.lightgbm_dynamic import got_cpu_lgb, got_gpu_lgb
    import lightgbm as lgb

    n_levels, n_repeats = 10, 1000
    X1 = np.repeat(np.arange(n_levels), n_repeats)
    X2 = np.repeat(np.arange(n_levels), n_repeats)
    np.random.shuffle(X2)

    # Product of two noisy features; the draws are made in the same order as
    # the single-expression form.
    noisy_x1 = X1 + np.random.randn(10000)
    noisy_x2 = X2 + np.random.randn(10000)
    y = noisy_x1 * noisy_x2
    data = pd.DataFrame({'y': y, 'X1': X1, 'X2': X2})

    lgb_params = dict([
        ('learning_rate', 0.1),
        ('boosting', booster),
        ('objective', 'regression'),
        ('metric', 'rmse'),
        ('feature_fraction', 0.9),
        ('bagging_fraction', 0.75),
        ('num_leaves', 31),
        ('bagging_freq', 1),
        ('min_data_per_leaf', 250),
        ('device_type', 'gpu'),
        ('gpu_device_id', 0),
    ])
    lgb_train = lgb.Dataset(data=data[['X1', 'X2']], label=data.y)
    cv = lgb.cv(lgb_params, lgb_train, num_boost_round=100,
                early_stopping_rounds=15, stratified=False, verbose_eval=50)
Example #15
Source File: lgb.py    From kaggle-plasticc with MIT License 6 votes vote down vote up
def train_and_predict(train_df, test_df, features, params):
    """Train one LightGBM model per stratified fold and collect predictions.

    Args:
        train_df: Training frame with the ``features`` columns plus
            ``"target"`` and ``"sample_weight"``.
        test_df: Test frame with the ``features`` columns.
        features: Feature column names.
        params: LightGBM parameters; must contain ``"num_class"``.

    Returns:
        Tuple ``(oof_preds, test_preds)``: out-of-fold predictions for every
        training row, and test predictions averaged over the ``NUM_FOLDS``
        models. Both have ``params["num_class"]`` columns.
    """
    oof_preds = np.zeros((len(train_df), params["num_class"]))
    test_preds = np.zeros((len(test_df), params["num_class"]))

    # No random_state: shuffle stays at its default (False), so the folds are
    # already deterministic. Recent scikit-learn releases raise ValueError when
    # random_state is passed without shuffle=True.
    skf = StratifiedKFold(n_splits=NUM_FOLDS)

    for train_index, val_index in skf.split(train_df, train_df["target"]):
        dev_df, val_df = train_df.iloc[train_index], train_df.iloc[val_index]
        lgb_train = lgb.Dataset(dev_df[features], dev_df["target"], weight=dev_df["sample_weight"])
        lgb_val = lgb.Dataset(val_df[features], val_df["target"], weight=val_df["sample_weight"])

        model = lgb.train(params, lgb_train, num_boost_round=200, valid_sets=[lgb_train, lgb_val],
                          early_stopping_rounds=10, verbose_eval=50)
        oof_preds[val_index, :] = model.predict(val_df[features])

        # Each fold contributes an equal share of the final test prediction.
        test_preds += model.predict(test_df[features]) / NUM_FOLDS

    return oof_preds, test_preds
Example #16
Source File: optimize.py    From optuna with MIT License 6 votes vote down vote up
def _create_objective(
        self,
        target_param_names: List[str],
        train_set: "lgb.Dataset",
        step_name: str,
        pbar: tqdm.tqdm,
    ) -> _OptunaObjective:
        """Build the ``_OptunaObjective`` for one tuning step.

        The objective receives the target parameter names, this tuner's
        LightGBM parameters and keyword arguments, the training set, the
        current best score, and the step name, model directory and progress
        bar.
        """
        positional = (
            target_param_names,
            self.lgbm_params,
            train_set,
            self.lgbm_kwargs,
            self.best_score,
        )
        keyword = {
            "step_name": step_name,
            "model_dir": self._model_dir,
            "pbar": pbar,
        }
        return _OptunaObjective(*positional, **keyword)
Example #17
Source File: models.py    From open-solution-mapping-challenge with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
        """Train the LightGBM estimator with early stopping on the validation set.

        Training and validation datasets share the same feature names and
        categorical features; both are evaluated during training, with
        progress logged every 10 rounds. Extra ``**kwargs`` are accepted but
        not used.

        Returns:
            self, with the trained booster stored in ``self.estimator``.
        """
        # Column metadata shared by both datasets.
        column_spec = dict(feature_name=feature_names,
                           categorical_feature=categorical_features)
        train = lgb.Dataset(X, label=y, **column_spec)
        valid = lgb.Dataset(X_valid, label=y_valid, **column_spec)

        # Filled in by lgb.train; not stored or returned here.
        evaluation_results = {}
        training_params = self.training_params
        self.estimator = lgb.train(
            self.model_params,
            train,
            valid_sets=[train, valid],
            valid_names=['train', 'valid'],
            evals_result=evaluation_results,
            num_boost_round=training_params.number_boosting_rounds,
            early_stopping_rounds=training_params.early_stopping_rounds,
            verbose_eval=10,
            feval=self.evaluation_function,
        )
        return self
Example #18
Source File: test_lightgbm.py    From optuna with MIT License 6 votes vote down vote up
def objective(
    trial, metric="binary_error", valid_name="valid_0", force_default_valid_names=False, cv=False
):
    # type: (optuna.trial.Trial, str, str, bool, bool) -> float
    """Run one boosting round (via ``lgb.cv`` or ``lgb.train``) with a pruning callback attached.

    Always returns 1.0.
    """
    dtrain = lgb.Dataset([[1.0], [2.0], [3.0]], label=[1.0, 0.0, 1.0])
    dtest = lgb.Dataset([[1.0]], label=[1.0])

    # None makes LightGBM fall back to its default validation-set names.
    valid_names = None if force_default_valid_names else [valid_name]

    pruning_callback = LightGBMPruningCallback(trial, metric, valid_name=valid_name)
    booster_params = {"objective": "binary", "metric": ["auc", "binary_error"]}

    if cv:
        lgb.cv(
            booster_params,
            dtrain,
            1,
            verbose_eval=False,
            nfold=2,
            callbacks=[pruning_callback],
        )
        return 1.0

    lgb.train(
        booster_params,
        dtrain,
        1,
        valid_sets=[dtest],
        valid_names=valid_names,
        verbose_eval=False,
        callbacks=[pruning_callback],
    )
    return 1.0
Example #19
Source File: lightgbm_simple.py    From optuna with MIT License 6 votes vote down vote up
def objective(trial):
    """Train LightGBM on the breast-cancer data with trial-suggested parameters.

    Returns:
        Accuracy of the rounded predictions on the 25% validation split.
    """
    features, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, valid_x, train_y, valid_y = train_test_split(features, labels, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)

    # Fixed settings first, then the searched ones in their original order.
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    booster = lgb.train(param, dtrain)
    # Round the predicted probabilities to the nearest class label.
    predicted_labels = np.rint(booster.predict(valid_x))
    return sklearn.metrics.accuracy_score(valid_y, predicted_labels)
Example #20
Source File: test_lightgbm_autolog.py    From mlflow with Apache License 2.0 6 votes vote down vote up
def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_params, train_set):
    """Autologging records every metric for every validation set, one value per round."""
    mlflow.lightgbm.autolog()
    evals_result = {}
    params = {'metric': ['multi_error', 'multi_logloss']}
    params.update(bst_params)
    valid_names = ['train', 'valid']
    lgb.train(params, train_set, num_boost_round=10,
              valid_sets=[train_set, lgb.Dataset(train_set.data)],
              valid_names=valid_names, evals_result=evals_result)
    run = get_latest_run()
    logged = run.data
    client = mlflow.tracking.MlflowClient()
    for valid_name in valid_names:
        for metric_name in params['metric']:
            metric_key = '%s-%s' % (valid_name, metric_name)
            history = client.get_metric_history(run.info.run_id, metric_key)
            logged_values = [entry.value for entry in history]
            assert metric_key in logged.metrics
            assert len(logged_values) == 10
            assert logged_values == evals_result[valid_name][metric_name]
Example #21
Source File: test_lightgbm_autolog.py    From mlflow with Apache License 2.0 6 votes vote down vote up
def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set):
    """Autologging records ``multi_logloss`` for every validation set, one value per round."""
    mlflow.lightgbm.autolog()
    evals_result = {}
    # Passing [train_set, train_set] would make LightGBM ignore the first
    # dataset, so the second entry is a new Dataset built from the same data.
    second_set = lgb.Dataset(train_set.data)
    valid_names = ['train', 'valid']
    lgb.train(bst_params, train_set, num_boost_round=10,
              valid_sets=[train_set, second_set],
              valid_names=valid_names, evals_result=evals_result)
    run = get_latest_run()
    logged = run.data
    client = mlflow.tracking.MlflowClient()
    for valid_name in valid_names:
        metric_key = '%s-multi_logloss' % valid_name
        history = client.get_metric_history(run.info.run_id, metric_key)
        logged_values = [entry.value for entry in history]
        assert metric_key in logged.metrics
        assert len(logged_values) == 10
        assert logged_values == evals_result[valid_name]['multi_logloss']
Example #22
Source File: model_v1.py    From Quora with MIT License 6 votes vote down vote up
def get_dataset(self, X, y, free_raw_data=True):
        """
        Wrap features and labels in a ``lightgbm.Dataset``.

        The dataset uses this object's ``feature_name`` and
        ``categorical_feature`` settings.

        Parameters
        ----------
        X: string, numpy array, pandas DataFrame, scipy.sparse or
            list of numpy arrays

        y: list, numpy 1-D array, pandas Series / one-column DataFrame
            or None

        free_raw_data: bool, optional (default=True)
            Passed through to ``lightgbm.Dataset``.

        Return
        ------
        lightgbm dataset
        """
        dataset_kwargs = {
            'data': X,
            'label': y,
            'feature_name': self.feature_name,
            'categorical_feature': self.categorical_feature,
            'free_raw_data': free_raw_data,
        }
        return lightgbm.Dataset(**dataset_kwargs)
Example #23
Source File: models.py    From open-solution-data-science-bowl-2018 with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
        """Train the LightGBM estimator with early stopping on the validation set.

        Training and validation datasets share the same feature names and
        categorical features; both are evaluated during training, with
        progress logged every 10 rounds. Extra ``**kwargs`` are accepted but
        not used.

        Returns:
            self, with the trained booster stored in ``self.estimator``.
        """
        # Column metadata shared by both datasets.
        column_spec = dict(feature_name=feature_names,
                           categorical_feature=categorical_features)
        train = lgb.Dataset(X, label=y, **column_spec)
        valid = lgb.Dataset(X_valid, label=y_valid, **column_spec)

        # Filled in by lgb.train; not stored or returned here.
        evaluation_results = {}
        training_params = self.training_params
        self.estimator = lgb.train(
            self.model_params,
            train,
            valid_sets=[train, valid],
            valid_names=['train', 'valid'],
            evals_result=evaluation_results,
            num_boost_round=training_params.number_boosting_rounds,
            early_stopping_rounds=training_params.early_stopping_rounds,
            verbose_eval=10,
            feval=self.evaluation_function,
        )
        return self
Example #24
Source File: misc.py    From open-solution-data-science-bowl-2018 with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
        """Train the LightGBM estimator with early stopping on the validation set.

        Training and validation datasets share the same feature names and
        categorical features; both are evaluated during training. Logging
        frequency comes from ``self.model_config.verbose``. Extra ``**kwargs``
        are accepted but not used.

        Returns:
            self, with the trained booster stored in ``self.estimator``.
        """
        # Column metadata shared by both datasets.
        column_spec = dict(feature_name=feature_names,
                           categorical_feature=categorical_features)
        train = lgb.Dataset(X, label=y, **column_spec)
        valid = lgb.Dataset(X_valid, label=y_valid, **column_spec)

        # Filled in by lgb.train; not stored or returned here.
        evaluation_results = {}
        training_config = self.training_config
        self.estimator = lgb.train(
            self.model_config,
            train,
            valid_sets=[train, valid],
            valid_names=['train', 'valid'],
            evals_result=evaluation_results,
            num_boost_round=training_config.number_boosting_rounds,
            early_stopping_rounds=training_config.early_stopping_rounds,
            verbose_eval=self.model_config.verbose,
            feval=self.evaluation_function,
        )
        return self
Example #25
Source File: main.py    From nni with MIT License 6 votes vote down vote up
def load_data(train_path='./data/regression.train', test_path='./data/regression.test'):
    '''
    Load the tab-separated regression data and build the LightGBM datasets.

    Column 0 of each file is the target. The first 90% of the training rows
    are used for training and the remaining rows for evaluation.

    Returns (lgb_train, lgb_eval, X_test, y_test).
    '''
    print('Load data...')
    df_train = pd.read_csv(train_path, header=None, sep='\t')
    df_test = pd.read_csv(test_path, header=None, sep='\t')

    # Row index where the evaluation part of the training file starts.
    cut = int(0.9 * len(df_train))

    all_targets = df_train[0].values
    all_features = df_train.drop(0, axis=1).values
    y_train, y_eval = all_targets[:cut], all_targets[cut:]
    X_train, X_eval = all_features[:cut, :], all_features[cut:, :]

    y_test = df_test[0].values
    X_test = df_test.drop(0, axis=1).values

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    return lgb_train, lgb_eval, X_test, y_test
Example #26
Source File: lightgbm_example.py    From ray with Apache License 2.0 6 votes vote down vote up
def train_breast_cancer(config):
    """Train LightGBM on the breast-cancer data with ``config`` and report test accuracy to Tune."""
    features, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)
    dtest = lgb.Dataset(test_x, label=test_y)
    booster = lgb.train(
        config,
        dtrain,
        valid_sets=[dtest],
        verbose_eval=False,
        callbacks=[LightGBMCallback])
    # Round the predicted probabilities to the nearest class label.
    predicted_labels = np.rint(booster.predict(test_x))
    accuracy = sklearn.metrics.accuracy_score(test_y, predicted_labels)
    tune.report(mean_accuracy=accuracy, done=True)
Example #27
Source File: misc.py    From open-solution-mapping-challenge with MIT License 6 votes vote down vote up
def fit(self, X, y, X_valid, y_valid, feature_names, categorical_features, **kwargs):
        """Train the LightGBM estimator with early stopping on the validation set.

        Training and validation datasets share the same feature names and
        categorical features; both are evaluated during training. Logging
        frequency comes from ``self.model_config.verbose``. Extra ``**kwargs``
        are accepted but not used.

        Returns:
            self, with the trained booster stored in ``self.estimator``.
        """
        # Column metadata shared by both datasets.
        column_spec = dict(feature_name=feature_names,
                           categorical_feature=categorical_features)
        train = lgb.Dataset(X, label=y, **column_spec)
        valid = lgb.Dataset(X_valid, label=y_valid, **column_spec)

        # Filled in by lgb.train; not stored or returned here.
        evaluation_results = {}
        training_config = self.training_config
        self.estimator = lgb.train(
            self.model_config,
            train,
            valid_sets=[train, valid],
            valid_names=['train', 'valid'],
            evals_result=evaluation_results,
            num_boost_round=training_config.number_boosting_rounds,
            early_stopping_rounds=training_config.early_stopping_rounds,
            verbose_eval=self.model_config.verbose,
            feval=self.evaluation_function,
        )
        return self
Example #28
Source File: test_LightGbmTreeEnsembleConverters.py    From onnxmltools with MIT License 5 votes vote down vote up
def test_lightgbm_booster_classifier(self):
        """Convert a binary LightGBM booster to ONNX and dump data and model."""
        X = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2]], dtype=numpy.float32)
        y = [0, 1, 0, 1]
        train_params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'n_estimators': 3,
            'min_child_samples': 1,
        }
        model = lightgbm.train(train_params, lightgbm.Dataset(X, label=y))
        model_onnx, prefix = convert_model(model, 'tree-based classifier',
                                           [('input', FloatTensorType([None, 2]))])
        dump_data_and_model(X, model, model_onnx,
                            allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
                            basename=prefix + "BoosterBin" + model.__class__.__name__)
Example #29
Source File: test_LightGbmTreeEnsembleConverters.py    From onnxmltools with MIT License 5 votes vote down vote up
def test_lightgbm_booster_classifier_zipmap(self):
        """Convert a binary LightGBM booster to ONNX and check the model text mentions "zipmap"."""
        X = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2]], dtype=numpy.float32)
        y = [0, 1, 0, 1]
        train_params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'n_estimators': 3,
            'min_child_samples': 1,
        }
        model = lightgbm.train(train_params, lightgbm.Dataset(X, label=y))
        model_onnx, prefix = convert_model(model, 'tree-based classifier',
                                           [('input', FloatTensorType([None, 2]))])
        onnx_text = str(model_onnx).lower()
        assert "zipmap" in onnx_text
        dump_data_and_model(X, model, model_onnx,
                            allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
                            basename=prefix + "BoosterBin" + model.__class__.__name__)
Example #30
Source File: lgb_trial.py    From autogluon with Apache License 2.0 5 votes vote down vote up
def lgb_trial(args, reporter):
    """Training script for hyperparameter evaluation of a Gradient Boosting model.

    Loads the training and validation datasets named in the prepared utility
    arguments, then fits and saves the model through ``model_trial``. On any
    ``Exception`` the reporter is terminated; the exception is logged unless
    it is a ``TimeLimitExceeded``.
    """
    try:
        model, hyperparams, util_args = model_trial.prepare_inputs(args=args)

        try_import_lightgbm()
        import lightgbm as lgb

        data_dir = util_args.directory
        dataset_train = lgb.Dataset(data_dir + util_args.dataset_train_filename)
        dataset_val = lgb.Dataset(data_dir + util_args.dataset_val_filename)
        X_val, y_val = load_pkl.load(data_dir + util_args.dataset_val_pkl_filename)

        model_trial.fit_and_save_model(
            model=model,
            params=hyperparams,
            fit_args=dict(dataset_train=dataset_train, dataset_val=dataset_val),
            predict_proba_args=dict(X=X_val),
            y_test=y_val,
            time_start=util_args.time_start,
            time_limit=util_args.get('time_limit', None),
            reporter=reporter,
        )
    except Exception as err:
        # Running out of time is not logged as an error.
        timed_out = isinstance(err, TimeLimitExceeded)
        if not timed_out:
            logger.exception(err, exc_info=True)
        reporter.terminate()

    # FIXME: If stopping metric and eval metric differ, the previously reported scores will not align: they are evaluated with stopping_metric, whereas this is evaluated with eval_metric.
    #  This should only matter if the reporter data is used.
    # FIXME: If stopping metric score > eval metric score, the stopping metric score will be recorded as best score; this is a defect!
    # FIXME: If a reporter entry has been recorded and the model crashes, AutoGluon may try to access the invalid model and fail.
    # reporter(epoch=model.params_trained['num_boost_round'] + 1, validation_performance=score)