Python sklearn.tree.DecisionTreeRegressor() Examples

The following are 30 code examples showing how to use sklearn.tree.DecisionTreeRegressor(). These examples are extracted from open-source projects; the project, author, file, and license are listed above each example.

You may also want to check out all available functions/classes of the module sklearn.tree.
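
Before the project examples, here is a minimal, self-contained sketch of the basic fit/predict workflow. The toy data and parameter values are illustrative only and not taken from any project below.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.array([[1.0], [2.0], [3.0], [4.0]])  # one feature, four samples
y = np.array([1.5, 2.5, 3.5, 4.5])          # continuous targets

reg = DecisionTreeRegressor(max_depth=2, random_state=0)
reg.fit(X, y)
print(reg.predict([[2.5]]))  # piecewise-constant prediction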

Example 1
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_bagging.py    License: MIT License
def test_regression():
    # Check regression for various parameter settings.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "max_features": [0.5, 1.0],
                          "bootstrap": [True, False],
                          "bootstrap_features": [True, False]})

    for base_estimator in [None,
                           DummyRegressor(),
                           DecisionTreeRegressor(),
                           KNeighborsRegressor(),
                           SVR(gamma='scale')]:
        for params in grid:
            BaggingRegressor(base_estimator=base_estimator,
                             random_state=rng,
                             **params).fit(X_train, y_train).predict(X_test) 
Example 2
Project: LearningX   Author: ankonzoid   File: BaggedTrees.py    License: MIT License
def fit(self, X, y):
        """
        Method:
         1) Create n_estimators tree estimators with greedy splitting
         2) For each estimator i, sample (X, y) randomly N times with replacement
            and train the estimator on this sampled dataset (X_i, y_i)
         3) Predict by taking the mean of the estimators' predictions
            (a matching predict sketch follows this example)
        """
        self.models = []
        N = len(X)
        for i in range(self.n_estimators):
            # Create tree with greedy splits
            model = DecisionTreeRegressor(max_depth=self.max_depth)
            # Bagging procedure
            idx_sample = np.random.choice(N, N)
            model.fit(X[idx_sample], y[idx_sample])
            self.models.append(model) 
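
The fit method above only builds self.models; the averaging in step 3 of its docstring happens at prediction time. A minimal predict sketch consistent with that docstring (assuming numpy is imported as np, as in the snippet) could look like:

def predict(self, X):
    # Step 3: average the per-estimator predictions
    return np.mean([model.predict(X) for model in self.models], axis=0)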
Example 3
Project: LearningX   Author: ankonzoid   File: RandomForest.py    License: MIT License
def fit(self, X, y):
        """
        Method:
         1) Create n_estimators tree estimators with random splitting
            on d/3 max features
         2) For each estimator i, sample (X, y) randomly N times with replacement
            and train the estimator on this sampled dataset (X_i, y_i)
         3) Predict by taking the mean predictions of each estimator
        """
        self.models = []
        N, d = X.shape
        for i in range(self.n_estimators):
            # Create tree with random splitting on d/3 max features
            model = DecisionTreeRegressor(max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split,
                                          min_samples_leaf=self.min_samples_leaf,
                                          splitter="random",
                                          max_features=max(1, d // 3))  # integer feature count; a float > 1.0 would be rejected
            # Bagging procedure
            idx_sample = np.random.choice(N, N)  # random sampling of length N
            model.fit(X[idx_sample], y[idx_sample])  # fit on random sampling
            self.models.append(model) 
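
Prediction for this random forest is the same estimator averaging shown in the sketch after Example 2; the only differences from plain bagging are the randomized splitter and the roughly d/3 features considered at each split.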
Example 4
def Train(data, modelcount, censhu, yanzhgdata):
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=censhu),
                              n_estimators=modelcount, learning_rate=0.8)

    model.fit(data[:, :-1], data[:, -1])
    # Predictions on the training data
    train_out = model.predict(data[:, :-1])
    # Compute the MSE
    train_mse = mse(data[:, -1], train_out)

    # Predictions on the validation data
    add_yan = model.predict(yanzhgdata[:, :-1])
    # Compute the MSE
    add_mse = mse(yanzhgdata[:, -1], add_yan)
    print(train_mse, add_mse)
    return train_mse, add_mse

# Function for determining the final combination
Example 5
Project: practicalDataAnalysisCookbook   Author: drabastomek   File: regression_cart.py    License: GNU General Public License v2.0
def regression_cart(x,y):
    '''
        Estimate a CART regressor
    '''
    # create the regressor object
    cart = sk.DecisionTreeRegressor(min_samples_split=80,
        max_features="auto", random_state=66666, 
        max_depth=5)

    # estimate the model
    cart.fit(x,y)

    # return the object
    return cart

# the file name of the dataset 
Example 6
Project: m2cgen   Author: BayesWitnesses   File: test_meta.py    License: MIT License
def test_ransac_custom_base_estimator():
    base_estimator = DecisionTreeRegressor()
    estimator = linear_model.RANSACRegressor(
        base_estimator=base_estimator,
        random_state=1)
    estimator.fit([[1], [2], [3]], [1, 2, 3])

    assembler = assemblers.RANSACModelAssembler(estimator)
    actual = assembler.assemble()

    expected = ast.IfExpr(
        ast.CompExpr(
            ast.FeatureRef(0),
            ast.NumVal(2.5),
            ast.CompOpType.LTE),
        ast.NumVal(2.0),
        ast.NumVal(3.0))

    assert utils.cmp_exprs(actual, expected) 
Example 7
Project: m2cgen   Author: BayesWitnesses   File: test_tree.py    License: MIT License
def test_single_condition():
    estimator = tree.DecisionTreeRegressor()

    estimator.fit([[1], [2]], [1, 2])

    assembler = assemblers.TreeModelAssembler(estimator)
    actual = assembler.assemble()

    expected = ast.IfExpr(
        ast.CompExpr(
            ast.FeatureRef(0),
            ast.NumVal(1.5),
            ast.CompOpType.LTE),
        ast.NumVal(1.0),
        ast.NumVal(2.0))

    assert utils.cmp_exprs(actual, expected) 
Example 8
Project: m2cgen   Author: BayesWitnesses   File: test_tree.py    License: MIT License
def test_two_conditions():
    estimator = tree.DecisionTreeRegressor()

    estimator.fit([[1], [2], [3]], [1, 2, 3])

    assembler = assemblers.TreeModelAssembler(estimator)
    actual = assembler.assemble()

    expected = ast.IfExpr(
        ast.CompExpr(
            ast.FeatureRef(0),
            ast.NumVal(1.5),
            ast.CompOpType.LTE),
        ast.NumVal(1.0),
        ast.IfExpr(
            ast.CompExpr(
                ast.FeatureRef(0),
                ast.NumVal(2.5),
                ast.CompOpType.LTE),
            ast.NumVal(2.0),
            ast.NumVal(3.0)))

    assert utils.cmp_exprs(actual, expected) 
Example 9
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_bagging.py    License: MIT License
def test_bootstrap_features():
    # Test that bootstrapping features may generate duplicate features.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=False,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_equal(boston.data.shape[1], np.unique(features).shape[0])

    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_features=1.0,
                                bootstrap_features=True,
                                random_state=rng).fit(X_train, y_train)

    for features in ensemble.estimators_features_:
        assert_greater(boston.data.shape[1], np.unique(features).shape[0]) 
Example 10
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_bagging.py    License: MIT License
def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=3,
                                random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                n_jobs=1,
                                random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3) 
Example 11
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_weight_boosting.py    License: MIT License
def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2),
                  'algorithm': ('SAMME', 'SAMME.R')}
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)
    parameters = {'n_estimators': (1, 2),
                  'base_estimator__max_depth': (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(boston.data, boston.target) 
Example 12
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_tree.py    License: MIT License
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for binary output variable

    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean squared error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict the maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples) 
Example 13
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_impute.py    License: MIT License
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    X = sparse_random_matrix(100, 100, density=0.10)
    missing_values = X.data[0]

    pipeline = Pipeline([('imputer',
                          SimpleImputer(missing_values=missing_values)),
                         ('tree',
                          tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"]
    }

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y) 
Example 14
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_multiclass.py    License: MIT License
def test_ovr_ovo_regressor():
    # test that ovr and ovo work on regressors which don't have a
    # decision_function
    ovr = OneVsRestClassifier(DecisionTreeRegressor())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovr.estimators_), n_classes)
    assert_array_equal(np.unique(pred), [0, 1, 2])
    # we are doing something sensible
    assert_greater(np.mean(pred == iris.target), .9)

    ovr = OneVsOneClassifier(DecisionTreeRegressor())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovr.estimators_), n_classes * (n_classes - 1) / 2)
    assert_array_equal(np.unique(pred), [0, 1, 2])
    # we are doing something sensible
    assert_greater(np.mean(pred == iris.target), .9) 
Example 15
Project: drifter_ml   Author: EricSchles   File: test_regression_tests.py    License: MIT License
def generate_regression_data_and_models():
    df = pd.DataFrame()
    for _ in range(1000):
        a = np.random.normal(0, 1)
        b = np.random.normal(0, 3)
        c = np.random.normal(12, 4)
        target = a + b + c
        df = df.append({
            "A": a,
            "B": b,
            "C": c,
            "target": target
        }, ignore_index=True)

    reg1 = tree.DecisionTreeRegressor()
    reg2 = ensemble.RandomForestRegressor()
    column_names = ["A", "B", "C"]
    target_name = "target"
    X = df[column_names]
    reg1.fit(X, df[target_name])
    reg2.fit(X, df[target_name])
    return df, column_names, target_name, reg1, reg2 
Example 16
Project: coremltools   Author: apple   File: test_io_types.py    License: BSD 3-Clause "New" or "Revised" License
def test_tree_regressor(self):
        for dtype in self.number_data_type.keys():
            scikit_model = DecisionTreeRegressor(random_state=1)
            data = self.scikit_data["data"].astype(dtype)
            target = self.scikit_data["target"].astype(dtype)
            scikit_model, spec = self._sklearn_setup(scikit_model, dtype, data, target)
            test_data = data[0].reshape(1, -1)
            self._check_tree_model(spec, "multiArrayType", "doubleType", 1)
            coreml_model = create_model(spec)
            try:
                self.assertEqual(
                    scikit_model.predict(test_data)[0].dtype,
                    type(coreml_model.predict({"data": test_data})["target"]),
                )
                self.assertEqual(
                    scikit_model.predict(test_data)[0],
                    coreml_model.predict({"data": test_data})["target"],
                    msg="{} != {} for Dtype: {}".format(
                        scikit_model.predict(test_data)[0],
                        coreml_model.predict({"data": test_data})["target"],
                        dtype,
                    ),
                )
            except RuntimeError:
                print("{} not supported. ".format(dtype)) 
Example 17
Project: Malware-GAN   Author: yanminglai   File: MalGAN__v3.py    License: GNU General Public License v3.0
def build_blackbox_detector(self):

        if self.blackbox == 'RF':
            blackbox_detector = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=1)
        elif self.blackbox == 'SVM':
            blackbox_detector = svm.SVC()
        elif self.blackbox == 'LR':
            blackbox_detector = linear_model.LogisticRegression()
        elif self.blackbox == 'DT':
            blackbox_detector = tree.DecisionTreeRegressor()
        elif self.blackbox == 'MLP':
            blackbox_detector = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                                              solver='sgd', verbose=0, tol=1e-4, random_state=1,
                                              learning_rate_init=.1)
        elif self.blackbox == 'VOTE':
            blackbox_detector = VOTEClassifier()

        return blackbox_detector 
Example 18
Project: tpot   Author: EpistasisLab   File: export_tests.py    License: GNU Lesser General Public License v3.0
def test_set_param_recursive_2():
    """Assert that set_param_recursive sets \"random_state\" to 42 in nested estimator in SelectFromModel."""
    pipeline_string = (
        'DecisionTreeRegressor(SelectFromModel(input_matrix, '
        'SelectFromModel__ExtraTreesRegressor__max_features=0.05, SelectFromModel__ExtraTreesRegressor__n_estimators=100, '
        'SelectFromModel__threshold=0.05), DecisionTreeRegressor__max_depth=8,'
        'DecisionTreeRegressor__min_samples_leaf=5, DecisionTreeRegressor__min_samples_split=5)'
    )
    tpot_obj = TPOTRegressor()
    tpot_obj._fit_init()
    deap_pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline)
    set_param_recursive(sklearn_pipeline.steps, 'random_state', 42)

    assert getattr(getattr(sklearn_pipeline.steps[0][1], 'estimator'), 'random_state') == 42
    assert getattr(sklearn_pipeline.steps[1][1], 'random_state') == 42 
Example 19
Project: pycobra   Author: bhargavvader   File: ewa.py    License: MIT License
def load_default(self, machine_list=['lasso', 'tree', 'ridge', 'random_forest', 'svm']):
        """
        Loads five different scikit-learn regressors by default.

        Parameters
        ----------
        machine_list: optional, list of strings
            List of default machine names to be loaded.

        """
        for machine in machine_list:
            try:
                if machine == 'lasso':
                    self.estimators_['lasso'] = linear_model.LassoCV(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'tree':
                    self.estimators_['tree'] = DecisionTreeRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'ridge':
                    self.estimators_['ridge'] = linear_model.RidgeCV().fit(self.X_k_, self.y_k_)
                if machine == 'random_forest':
                    self.estimators_['random_forest'] = RandomForestRegressor(random_state=self.random_state).fit(self.X_k_, self.y_k_)
                if machine == 'svm':
                    self.estimators_['svm'] = SVR().fit(self.X_k_, self.y_k_)
            except ValueError:
                continue 
Example 20
Project: CausalDiscoveryToolbox   Author: FenTechSolutions   File: FSRegression.py    License: MIT License
def predict_features(self, df_features, df_target, idx=0, **kwargs):
        """For one variable, predict its neighbouring nodes.

        Args:
            df_features (pandas.DataFrame):
            df_target (pandas.Series):
            idx (int): (optional) for printing purposes
            kwargs (dict): additional options for algorithms

        Returns:
            list: scores of each feature relatively to the target
        """
        X = df_features.values
        y = df_target.values
        regressor = DecisionTreeRegressor()
        regressor.fit(X, y)

        return regressor.feature_importances_ 
Example 21
Project: LearningX   Author: ankonzoid   File: DT_sklearn_regr.py    License: MIT License
def __init__(self, max_depth=20, min_samples_leaf=10):
        from sklearn.tree import DecisionTreeRegressor
        self.model = DecisionTreeRegressor(max_depth=max_depth,
                                           min_samples_leaf=min_samples_leaf,
                                           criterion="mse") 
Example 22
Project: LearningX   Author: ankonzoid   File: GradientBoostedTree.py    License: MIT License
def fit(self, X, y):
        """
        Method:
         1) Train tree with greedy splitting on dataset (X, y)
         2) Iteratively (n_estimators times) compute the residual between the
            truth and the current prediction (res = y - y_pred), and use the
            residual as the next estimator's training y (keep X the same)
         3) Each estimator's prediction is its own output plus the previous
            estimators' prediction, so the final model's full prediction is the
            sum of the predictions of all models (a matching predict sketch
            follows this example)
        """
        self.models = []
        y_i = y
        y_pred_i = np.zeros(y.shape)
        for i in range(self.n_estimators):
            # Create tree with greedy splits
            model = DecisionTreeRegressor(max_depth=self.max_depth,
                                          min_samples_split=self.min_samples_split,
                                          min_samples_leaf=self.min_samples_leaf)
            model.fit(X, y_i)
            # Boosting procedure
            y_pred = model.predict(X) + y_pred_i  # add previous prediction
            res = y - y_pred  # compute residual
            y_i = res  # set training label as residual
            y_pred_i = y_pred  # update prediction value
            self.models.append(model) 
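
Here too, fit only stores the stage models; per step 3 of the docstring, the final prediction is the sum over all stages. A minimal matching predict sketch (assuming numpy is imported as np, as in the snippet) might be:

def predict(self, X):
    # Step 3: the final prediction is the sum of every stage's prediction
    return np.sum([model.predict(X) for model in self.models], axis=0)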
Example 23
def Adaboost_First(self, data, max_depth=5, n_estimators=320):
        model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=max_depth),
                                  n_estimators=n_estimators, learning_rate=0.8)
        model.fit(data['train'][:, :-1], data['train'][:, -1])
        # Note: validation-set results and prediction-set results are stored differently
        # Predictions on the training set
        xul = model.predict(data['train'][:, :-1])
        # Predictions on the validation set
        yanre = model.predict(data['test'][:, :-1])
        # Predictions on the prediction set
        prer = model.predict(data['predict'][:, :-1])
        # Store
        self.yanzhneg_pr.append(yanre)
        self.predi.append(prer)
        # Compute the training, validation, and prediction errors separately
        # (after each fold, compute the errors on all three datasets)
        xx = self.RMSE(xul, data['train'][:, -1])
        yy = self.RMSE(yanre, data['test'][:, -1])
        pp = self.RMSE(prer, data['predict'][:, -1])
        # Store the errors
        self.error_dict['AdaBoost'] = [xx, yy, pp]
        # True outputs of the validation set
        self.yanzhneg_real = data['test'][:, -1]

        # True outputs of the prediction set
        self.preal = data['predict'][:, -1]
        return print('AdaBoost in layer 1 finished')

    # GBDT 
Example 24
def recspre(exstr, predata, datadict, zhe, count=100):
    tree, te = exstr.split('-')
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=int(te)),
                              n_estimators=int(tree), learning_rate=0.8)
    model.fit(datadict[zhe]['train'][:, :-1], datadict[zhe]['train'][:, -1])

    # Predict
    yucede = model.predict(predata[:, :-1])
    # For ease of display, select 100 samples to show
    zongleng = np.arange(len(yucede))
    randomnum = np.random.choice(zongleng, count, replace=False)

    yucede_se = list(np.array(yucede)[randomnum])

    yuce_re = list(np.array(predata[:, -1])[randomnum])

    # Comparison plot
    plt.figure(figsize=(17, 9))
    plt.subplot(2, 1, 1)
    plt.plot(list(range(len(yucede_se))), yucede_se, 'r--', label='Predicted', lw=2)
    plt.scatter(list(range(len(yuce_re))), yuce_re, c='b', marker='.', label='Actual', lw=2)
    plt.xlim(-1, count + 1)
    plt.legend()
    plt.title('Predicted vs. actual values [n_estimators=%d]' % int(tree))

    plt.subplot(2, 1, 2)
    plt.plot(list(range(len(yucede_se))), np.array(yuce_re) - np.array(yucede_se), 'k--', marker='s', label='Actual - predicted', lw=2)
    plt.legend()
    plt.title('Error between predicted and actual values')

    plt.savefig(r'C:\Users\GWT9\Desktop\duibi.jpg')
    return 'Predicted vs. actual comparison finished'

# main function
Example 25
Project: defragTrees   Author: sato9hara   File: Baselines.py    License: MIT License
def fit(self, X, y, featurename=[]):
        self.dim_ = X.shape[1]
        self.setfeaturename(featurename)
        self.setdefaultpred(y)
        param_grid = {"max_depth": self.max_depth_, "min_samples_leaf": self.min_samples_leaf_}
        if self.modeltype_ == 'regression':
            mdl = tree.DecisionTreeRegressor()
        elif self.modeltype_ == 'classification':
            mdl = tree.DecisionTreeClassifier()
        grid_search = GridSearchCV(mdl, param_grid=param_grid, cv=self.cv_)
        grid_search.fit(X, y)
        mdl = grid_search.best_estimator_
        self.__parseTree(mdl)
        self.weight_ = np.ones(len(self.rule_)) 
Example 26
Project: kaggle-HomeDepot   Author: ChenglongChen   File: skl_utils.py    License: MIT License
def __init__(self, base_estimator=None, n_estimators=50, max_features=1.0,
                max_depth=6, learning_rate=1.0, loss='linear', random_state=None):
        if base_estimator and base_estimator == 'etr':
            base_estimator = ExtraTreeRegressor(max_depth=max_depth,
                                        max_features=max_features)
        else:
            base_estimator = DecisionTreeRegressor(max_depth=max_depth,
                                        max_features=max_features)

        self.model = sklearn.ensemble.AdaBoostRegressor(
                                    base_estimator=base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    random_state=random_state,
                                    loss=loss) 
Example 27
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_bagging.py    License: MIT License
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))

    # check that each sampling corresponds to a complete bootstrap resample.
    # the size of each bootstrap should be the same as the input data but
    # the data should be different (checked using the hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash) 
Example 28
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_bagging.py    License: MIT License
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators
    assert_warns(UserWarning,
                 BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  n_estimators=1,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=rng).fit,
                 X_train,
                 y_train) 
Example 29
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_bagging.py    License: MIT License
def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, np.NINF, 6],
    ])
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array([
            [2, 1, 9],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
            [3, 6, 8],
        ])
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(
            FunctionTransformer(replace, validate=False),
            regressor
        )
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert_equal(y.shape, y_hat.shape)

        # Verify that exceptions can be raised by wrapper regressor
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(regressor)
        assert_raises(ValueError, pipeline.fit, X, y)
        bagging_regressor = BaggingRegressor(pipeline)
        assert_raises(ValueError, bagging_regressor.fit, X, y) 
Example 30
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_export.py    License: MIT License
def test_friedman_mse_in_graphviz():
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())