Python sklearn.ensemble Examples

The following are 19 code examples of the sklearn.ensemble module, drawn from open-source projects. You can go to the original project or source file noted above each example. You may also want to check out all available functions/classes of the module sklearn.
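For orientation, here is a minimal, self-contained sketch of the fit/score pattern that most sklearn.ensemble estimators share; the data is synthetic and for illustration only.

# Minimal sketch of the fit/score pattern shared by sklearn.ensemble
# estimators; the data here is synthetic and for illustration only.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 5)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print("accuracy:", clf.score(X_test, y_test))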
Example #1
Source File: bagging.py    From brew with MIT License
def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers

        # using the sklearn implementation of bagging for now
        self.sk_bagging = BaggingClassifier(base_estimator=base_classifier,
                                            n_estimators=n_classifiers,
                                            max_samples=1.0,
                                            max_features=1.0)

        self.ensemble = Ensemble()
        self.combiner = Combiner(rule=combination_rule) 
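Note that BaggingClassifier(base_estimator=...) reflects the scikit-learn API this project targeted; scikit-learn 1.2 renamed the parameter to estimator and 1.4 removed the old name. A version-tolerant construction might look like this sketch:

# Sketch: constructing BaggingClassifier across scikit-learn versions.
# sklearn 1.2 renamed base_estimator to estimator; 1.4 removed the old name.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

try:
    bagging = BaggingClassifier(estimator=DecisionTreeClassifier(),
                                n_estimators=100)
except TypeError:  # pre-1.2 releases only accept base_estimator
    bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=100)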
Example #2
Source File: transfer.py    From rmnist with MIT License
def transfer(n):
    td, vd, ts = data_loader.load_data(n, abstract=True, expanded=expanded)
    classifiers = [
        #sklearn.svm.SVC(),
        #sklearn.svm.SVC(kernel="linear", C=0.1),
        #sklearn.neighbors.KNeighborsClassifier(1),
        #sklearn.tree.DecisionTreeClassifier(),
        #sklearn.ensemble.RandomForestClassifier(max_depth=10, n_estimators=500, max_features=1),
        sklearn.neural_network.MLPClassifier(alpha=1.0, hidden_layer_sizes=(300,), max_iter=500)
    ]
    for clf in classifiers:
        clf.fit(td[0], td[1])
        print "\n{}: {}".format(type(clf).__name__, round(clf.score(vd[0], vd[1])*100, 2)) 
Example #3
Source File: bagging.py    From brew with MIT License
def fit(self, X, y):
        self.sk_bagging.fit(X, y)
        self.ensemble.add_classifiers(self.sk_bagging.estimators_)
        # self.classes_ = set(y) 
Example #4
Source File: bagging.py    From brew with MIT License
def predict(self, X):
        out = self.ensemble.output(X)
        return self.combiner.combine(out) 
Example #5
Source File: bagging.py    From brew with MIT License
def fit(self, X, y):
        self.ensemble = Ensemble()

        for _ in range(self.n_classifiers):
            # bootstrap
            idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
            data, target = X[idx, :], y[idx]

            classifier = sklearn.base.clone(self.base_classifier)
            classifier.fit(data, target)

            self.ensemble.add(classifier)

        return 
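The loop above is the essence of bagging: each base classifier is trained on a bootstrap sample drawn with replacement. Here is a self-contained sketch of the same idea without brew's Ensemble and Combiner helpers; the hand-rolled majority vote assumes non-negative integer class labels.

# Self-contained sketch of bagging via bootstrap sampling, mirroring the
# fit loop above but without brew's Ensemble/Combiner wrappers.
import numpy as np
import sklearn.base
from sklearn.tree import DecisionTreeClassifier

def bagging_fit(X, y, base_classifier=None, n_classifiers=100):
    base_classifier = base_classifier or DecisionTreeClassifier()
    ensemble = []
    for _ in range(n_classifiers):
        # bootstrap: sample rows with replacement
        idx = np.random.choice(X.shape[0], X.shape[0], replace=True)
        clf = sklearn.base.clone(base_classifier)
        clf.fit(X[idx, :], y[idx])
        ensemble.append(clf)
    return ensemble

def bagging_predict(ensemble, X):
    # stack per-classifier predictions: shape (n_classifiers, n_samples)
    votes = np.stack([clf.predict(X) for clf in ensemble]).astype(int)
    # majority vote per sample; assumes non-negative integer class labels
    return np.apply_along_axis(lambda v: np.bincount(v).argmax(), 0, votes)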
Example #6
Source File: bagging.py    From brew with MIT License
def __init__(self,
                 base_classifier=None,
                 n_classifiers=100,
                 combination_rule='majority_vote'):

        self.base_classifier = base_classifier
        self.n_classifiers = n_classifiers
        self.ensemble = None
        self.combiner = Combiner(rule=combination_rule) 
Example #7
Source File: baselines.py    From rmnist with MIT License
def baselines(n):
    td, vd, ts = data_loader.load_data(n)
    classifiers = [
        sklearn.svm.SVC(C=1000),
        sklearn.svm.SVC(kernel="linear", C=0.1),
        sklearn.neighbors.KNeighborsClassifier(1),
        sklearn.tree.DecisionTreeClassifier(),
        sklearn.ensemble.RandomForestClassifier(max_depth=10, n_estimators=500, max_features=1),
        sklearn.neural_network.MLPClassifier(alpha=1, hidden_layer_sizes=(500, 100))
    ]
    for clf in classifiers:
        clf.fit(td[0], td[1])
        print "\n{}: {}".format(type(clf).__name__, round(clf.score(vd[0], vd[1])*100, 2)) 
Example #8
Source File: test_grid_hyperparam_opt.py    From deepchem with MIT License
def setUp(self):
    """Set up common resources."""

    def rf_model_builder(**model_params):
      rf_params = {k: v for (k, v) in model_params.items() if k != 'model_dir'}
      model_dir = model_params['model_dir']
      sklearn_model = sklearn.ensemble.RandomForestRegressor(**rf_params)
      return dc.models.SklearnModel(sklearn_model, model_dir)

    self.rf_model_builder = rf_model_builder
    self.train_dataset = dc.data.NumpyDataset(
        X=np.random.rand(50, 5), y=np.random.rand(50, 1))
    self.valid_dataset = dc.data.NumpyDataset(
        X=np.random.rand(20, 5), y=np.random.rand(20, 1)) 
Example #9
Source File: GetMLPara.py    From dr_droid with Apache License 2.0
def feature_importances(X, y):
    # the output is not stable because of the randomness
    # Build a classification task using 3 informative features
    #X, y = make_classification(n_samples=1000,n_features=10,n_informative=3,n_redundant=0,n_repeated=0,n_classes=2,random_state=0,shuffle=False)
    # Build a forest and compute the feature importances
    from sklearn.ensemble import ExtraTreesClassifier
    forest = ExtraTreesClassifier(n_estimators=25, criterion='entropy', random_state=None)
    forest.fit(X, y)
    importances = forest.feature_importances_

    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]
    # print (indices)
    # Print the feature ranking (assumes X has at least 80 features)
    print("Feature ranking:")
    sum1 = 0.0
    for f in range(80):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
        sum1 = sum1 + importances[indices[f]]
    print(sum1)
    # Plot the feature importances of the forest
    #width = 0.5
    x_len = range(len(importances))
    plt.figure()
    plt.title("Feature importances")
    plt.bar(x_len, importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(x_len, indices)
    plt.xlim([-1, max(x_len)+1])
    plt.show()

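As the first comment warns, the ranking varies from run to run because the forest's random_state is None. A hypothetical usage sketch follows (synthetic data via make_classification; it assumes the module-level numpy/matplotlib imports of the source file, and at least 80 features to satisfy the loop above).

# Hypothetical usage: a synthetic task with enough features for the
# top-80 loop above. Rankings still vary because the forest's
# random_state is None inside feature_importances.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=100,
                           n_informative=10, random_state=0)
feature_importances(X, y)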
Example #10
Source File: lambdamart.py    From pyltr with BSD 3-Clause "New" or "Revised" License
def iter_y_delta(self, i, X):
        assert i >= 0 and i < self.estimators_fitted_

        X = sklearn.utils.validation.check_array(
            X, dtype=sklearn.tree._tree.DTYPE, order='C')
        score = np.zeros((X.shape[0], 1))
        sklearn.ensemble._gradient_boosting.predict_stage(
            self.estimators_, i, X, self.learning_rate, score)

        return score.ravel() 
Example #11
Source File: lambdamart.py    From pyltr with BSD 3-Clause "New" or "Revised" License
def predict(self, X):
        X = sklearn.utils.validation.check_array(
            X, dtype=sklearn.tree._tree.DTYPE, order='C')
        score = np.zeros((X.shape[0], 1))
        estimators = self.estimators_
        if self.estimators_fitted_ < len(estimators):
            estimators = estimators[:self.estimators_fitted_]
        sklearn.ensemble._gradient_boosting.predict_stages(
            estimators, X, self.learning_rate, score)

        return score.ravel() 
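Both lambdamart methods above reach into sklearn.ensemble._gradient_boosting, a private Cython module, so they are tied to specific scikit-learn releases. For ordinary gradient boosting models, the public staged_predict API exposes per-stage scores; a sketch:

# Sketch: per-stage scores through the public API instead of the
# private sklearn.ensemble._gradient_boosting module.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

X = np.random.rand(100, 4)
y = np.random.rand(100)
gbr = GradientBoostingRegressor(n_estimators=50, random_state=0).fit(X, y)

# staged_predict yields cumulative predictions after each boosting stage
stage_preds = list(gbr.staged_predict(X))
single_stage_delta = stage_preds[10] - stage_preds[9]  # one stage's contribution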
Example #12
Source File: models.py    From jh-kaggle-util with Apache License 2.0
def run_ensemble():
  MODELS = [
    'xgboost-0p576026_20190319-181720',
    'keras-0p463293_20190319-185422'
  ]
  ensemble(MODELS) 
Example #13
Source File: field_based_ml_field_detection.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def init_classifier_impl(field_code: str, init_script: str):
    if init_script is not None:
        init_script = init_script.strip()

    if not init_script:
        from sklearn import tree as sklearn_tree
        return sklearn_tree.DecisionTreeClassifier()

    from sklearn import tree as sklearn_tree
    from sklearn import neural_network as sklearn_neural_network
    from sklearn import neighbors as sklearn_neighbors
    from sklearn import svm as sklearn_svm
    from sklearn import gaussian_process as sklearn_gaussian_process
    from sklearn.gaussian_process import kernels as sklearn_gaussian_process_kernels
    from sklearn import ensemble as sklearn_ensemble
    from sklearn import naive_bayes as sklearn_naive_bayes
    from sklearn import discriminant_analysis as sklearn_discriminant_analysis
    from sklearn import linear_model as sklearn_linear_model

    eval_locals = {
        'sklearn_linear_model': sklearn_linear_model,
        'sklearn_tree': sklearn_tree,
        'sklearn_neural_network': sklearn_neural_network,
        'sklearn_neighbors': sklearn_neighbors,
        'sklearn_svm': sklearn_svm,
        'sklearn_gaussian_process': sklearn_gaussian_process,
        'sklearn_gaussian_process_kernels': sklearn_gaussian_process_kernels,
        'sklearn_ensemble': sklearn_ensemble,
        'sklearn_naive_bayes': sklearn_naive_bayes,
        'sklearn_discriminant_analysis': sklearn_discriminant_analysis
    }
    return eval_script('classifier init script of field {0}'.format(field_code), init_script, eval_locals) 
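The eval_locals mapping defines the only names an init_script may reference; eval_script is the project's own helper for evaluating the script. A hypothetical script value (illustrative, not from the project) could be:

# Hypothetical init_script value evaluated against eval_locals above;
# the exact grammar accepted is defined by the project's eval_script helper.
init_script = "sklearn_ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1)"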
Example #14
Source File: advanced_supvervised_model_trainer.py    From healthcareai-py with MIT License
def ensemble_regression(self, scoring_metric='neg_mean_squared_error', model_by_name=None):
        # TODO stub
        self.validate_regression('Ensemble Regression')
        raise HealthcareAIError('We apologize. An ensemble linear regression has not yet been implemented.') 
Example #15
Source File: Policy.py    From slates_semisynth_expts with BSD 3-Clause "New" or "Revised" License
def train(self, dataset, targets, hyper_params):
        numQueries=len(dataset.docsPerQuery)
        validDocs=numpy.minimum(dataset.docsPerQuery, self.rankingSize)
        queryDocPosTriplets=numpy.dot(dataset.docsPerQuery, validDocs)
        designMatrix=numpy.zeros((queryDocPosTriplets, self.numFeatures), dtype=numpy.float32, order='F')
        regressionTargets=numpy.zeros(queryDocPosTriplets, dtype=numpy.float64, order='C')
        sampleWeights=numpy.zeros(queryDocPosTriplets, dtype=numpy.float32)
        currID=-1
        for i in range(numQueries):
            numAllowedDocs=dataset.docsPerQuery[i]
            currValidDocs=validDocs[i]
            allFeatures=dataset.features[i].toarray()
            
            for doc in range(numAllowedDocs):
                docID=doc
                if dataset.mask is not None:
                    docID=dataset.mask[i][doc]
                    
                for j in range(currValidDocs):
                    currID+=1

                    designMatrix[currID,:]=self.createFeature(allFeatures[docID,:], j)
                    regressionTargets[currID]=targets[i][j,doc] 
                    sampleWeights[currID]=1.0/(numAllowedDocs * currValidDocs)
        
        for i in targets:
            del i
        del targets
        
        print("L2RPolicy:train [LOG] Finished creating features and targets ", 
                numpy.amin(regressionTargets), numpy.amax(regressionTargets), numpy.median(regressionTargets), flush=True)
        print("L2RPolicy:train [LOG] Histogram of targets ", numpy.histogram(regressionTargets), flush=True)
        
        if self.modelType == 'gbrt':
            tree=sklearn.ensemble.GradientBoostingRegressor(learning_rate=hyper_params['lr'],
                            n_estimators=hyper_params['ensemble'], subsample=hyper_params['subsample'], max_leaf_nodes=hyper_params['leaves'], 
                            max_features=1.0, presort=False)
            tree.fit(designMatrix, regressionTargets, sample_weight=sampleWeights)
            self.tree=tree
            print("L2RPolicy:train [INFO] %s" % self.modelType, flush=True)
                
        elif self.modelType == 'ridge':
            ridgeCV=sklearn.linear_model.RidgeCV(alphas=self.hyperParams, fit_intercept=False,
                                                            normalize=False, cv=3)
            ridgeCV.fit(designMatrix, regressionTargets, sample_weight=sampleWeights)
            self.policyParams=ridgeCV.coef_
            print("L2RPolicy:train [INFO] Done. ", flush=True)
            
        else:
            print("L2RPolicy:train [ERR] %s not supported." % self.modelType, flush = True)
            sys.exit(0)
            
        print("L2R:train [INFO] Created %s predictor using dataset %s." %
                (self.modelType, dataset.name), flush = True) 
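Two estimator arguments above have since left scikit-learn: GradientBoostingRegressor's presort was deprecated in 0.22 and removed in 0.24, and RidgeCV's normalize was removed in 1.2. A sketch of equivalent constructions on current releases, with placeholder hyperparameter values:

# Sketch: the same estimators on current scikit-learn releases, where
# presort (GradientBoostingRegressor) and normalize (RidgeCV) are gone.
# Hyperparameter values below are placeholders.
import sklearn.ensemble
import sklearn.linear_model

hyper_params = {'lr': 0.1, 'ensemble': 100, 'subsample': 0.8, 'leaves': 31}
tree = sklearn.ensemble.GradientBoostingRegressor(
    learning_rate=hyper_params['lr'],
    n_estimators=hyper_params['ensemble'],
    subsample=hyper_params['subsample'],
    max_leaf_nodes=hyper_params['leaves'],
    max_features=1.0)

ridge_cv = sklearn.linear_model.RidgeCV(
    alphas=(0.1, 1.0, 10.0), fit_intercept=False, cv=3)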
Example #16
Source File: unsupervised.py    From fklearn with Apache License 2.0
def isolation_forest_learner(df: pd.DataFrame,
                             features: List[str],
                             params: Dict[str, Any] = None,
                             prediction_column: str = "prediction",
                             encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an anomaly detection algorithm (Isolation Forest) to the dataset

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All these
        names must be columns in `df`.

    params : dict
        The IsolationForest parameters in the format {"par_name": param}. See:
        http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

    prediction_column : str
        The name of the column with the predictions from the model.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with name pattern `fklearn_feat__col==val` as feature columns.
    """

    default_params = {"n_jobs": -1, "random_state": 1729, "contamination": 0.1, "behaviour": "new"}
    params = default_params if not params else merge(default_params, params)

    features = features if not encode_extra_cols else expand_features_encoded(df, features)

    model = IsolationForest()
    model.set_params(**params)
    model.fit(df[features].values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        output_col = {prediction_column: model.decision_function(
            new_df[features])}

        return new_df.assign(**output_col)

    p.__doc__ = learner_pred_fn_docstring("isolation_forest_learner")

    log = {'isolation_forest_learner': {
        'features': features,
        'parameters': params,
        'prediction_column': prediction_column,
        'package': "sklearn",
        'package_version': sklearn.__version__,
        'training_samples': len(df)}}

    return p, p(df), log 
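The "behaviour" key in default_params targets the scikit-learn releases this code was written for; the parameter was deprecated in 0.22 and removed in 0.24, where the "new" behaviour became the only one. On current releases the equivalent defaults would simply be:

# Sketch: equivalent defaults on sklearn >= 0.24, where IsolationForest
# no longer accepts a "behaviour" parameter.
default_params = {"n_jobs": -1, "random_state": 1729, "contamination": 0.1}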
Example #17
Source File: clf_helpers.py    From ibeis with Apache License 2.0
def _get_estimator(pblm, clf_key):
        """
        Returns sklearn classifier
        """
        tup = clf_key.split('-')
        wrap_type = None if len(tup) == 1 else tup[1]
        est_type = tup[0]
        multiclass_wrapper = {
            None: ut.identity,
            'OVR': sklearn.multiclass.OneVsRestClassifier,
            'OVO': sklearn.multiclass.OneVsOneClassifier,
        }[wrap_type]
        est_class = {
            'RF': sklearn.ensemble.RandomForestClassifier,
            'SVC': sklearn.svm.SVC,
            'Logit': sklearn.linear_model.LogisticRegression,
            'MLP': sklearn.neural_network.MLPClassifier,
        }[est_type]

        est_kw1, est_kw2 = pblm._estimator_params(est_type)
        est_params = ut.merge_dicts(est_kw1, est_kw2)

        # steps = []
        # steps.append((est_type, est_class(**est_params)))
        # if wrap_type is not None:
        #     steps.append((wrap_type, multiclass_wrapper))
        if est_type == 'MLP':
            def clf_partial():
                pipe = sklearn.pipeline.Pipeline([
                    ('inputer', sklearn.preprocessing.Imputer(
                        missing_values='NaN', strategy='mean', axis=0)),
                    # ('scale', sklearn.preprocessing.StandardScaler),
                    ('est', est_class(**est_params)),
                ])
                return multiclass_wrapper(pipe)
        elif est_type == 'Logit':
            def clf_partial():
                pipe = sklearn.pipeline.Pipeline([
                    ('inputer', sklearn.preprocessing.Imputer(
                        missing_values='NaN', strategy='mean', axis=0)),
                    ('est', est_class(**est_params)),
                ])
                return multiclass_wrapper(pipe)
        else:
            def clf_partial():
                return multiclass_wrapper(est_class(**est_params))

        return clf_partial 
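The pipelines above use sklearn.preprocessing.Imputer, which was deprecated in scikit-learn 0.20 and removed in 0.22 in favor of sklearn.impute.SimpleImputer (no axis argument; missing values default to np.nan). A sketch of the MLP pipeline on current releases:

# Sketch: the MLP pipeline on sklearn >= 0.22, where preprocessing.Imputer
# was replaced by impute.SimpleImputer (missing values default to np.nan).
import sklearn.pipeline
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier

pipe = sklearn.pipeline.Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('est', MLPClassifier()),
])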
Example #18
Source File: clf_helpers.py    From ibeis with Apache License 2.0
def _estimator_params(pblm, clf_key):
        est_type = clf_key.split('-')[0]
        if est_type in {'RF', 'RandomForest'}:
            est_kw1 = {
                # 'max_depth': 4,
                'bootstrap': True,
                'class_weight': None,
                'criterion': 'entropy',
                'max_features': 'sqrt',
                # 'max_features': None,
                'min_samples_leaf': 5,
                'min_samples_split': 2,
                # 'n_estimators': 64,
                'n_estimators': 256,
            }
            # Hack to only use missing values if we have the right sklearn
            if 'missing_values' in ut.get_func_kwargs(sklearn.ensemble.RandomForestClassifier.__init__):
                est_kw1['missing_values'] = np.nan
            est_kw2 = {
                'random_state': 3915904814,
                'verbose': 0,
                'n_jobs': -1,
            }
        elif est_type in {'SVC', 'SVM'}:
            est_kw1 = dict(kernel='linear')
            est_kw2 = {}
        elif est_type in {'Logit', 'LogisticRegression'}:
            est_kw1 = {}
            est_kw2 = {}
        elif est_type in {'MLP'}:
            est_kw1 = dict(
                activation='relu', alpha=1e-05, batch_size='auto',
                beta_1=0.9, beta_2=0.999, early_stopping=False,
                epsilon=1e-08, hidden_layer_sizes=(10, 10),
                learning_rate='constant', learning_rate_init=0.001,
                max_iter=200, momentum=0.9, nesterovs_momentum=True,
                power_t=0.5, random_state=3915904814, shuffle=True,
                solver='lbfgs', tol=0.0001, validation_fraction=0.1,
                warm_start=False
            )
            est_kw2 = dict(verbose=False)
        else:
            raise KeyError('Unknown Estimator')
        return est_kw1, est_kw2 
Example #19
Source File: advanced_supvervised_model_trainer.py    From healthcareai-py with MIT License
def ensemble_classification(self, scoring_metric='roc_auc', trained_model_by_name=None):
        """
        This provides a simple way to put data in and have healthcare.ai train 
        a few models and pick the best one for your data.

        Args:
            scoring_metric (str): The metric used to rank the models. Defaults 
            to 'roc_auc'

            trained_model_by_name (dict): A dictionary of trained models to 
            compare for a custom ensemble

        Returns:
            TrainedSupervisedModel: The best TrainedSupervisedModel found.
        """
        self.validate_classification('Ensemble Classification')
        self.validate_score_metric_for_number_of_classes(scoring_metric)
        score_by_name = {}

        # Here is the default list of algorithms to try for the ensemble
        # Adding an ensemble method is as easy as adding a new key:value pair 
        # in the `model_by_name` dictionary
        if trained_model_by_name is None:
            # TODO because these now all return TSMs it will be additionally 
            # slow by all the factor models.

            # TODO Could these be trained separately then after the best is 
            # found, train the factor model and add to TSM?
            trained_model_by_name = {
                'KNN': self.knn(randomized_search=True, scoring_metric=scoring_metric),
                'Logistic Regression': self.logistic_regression(randomized_search=True),
                'Random Forest Classifier': self.random_forest_classifier(
                    trees=200,
                    randomized_search=True,
                    scoring_metric=scoring_metric)}

        for name, model in trained_model_by_name.items():
            # Unroll estimator from trained supervised model
            estimator = hcai_tsm.get_estimator_from_trained_supervised_model(model)

            # Get the score objects for the estimator
            score = self.metrics(estimator)
            self._console_log('{} algorithm: score = {}'.format(name, score))

            # TODO this may need to ferret out each classification score separately
            score_by_name[name] = score[scoring_metric]

        sorted_names_and_scores = sorted(score_by_name.items(), key=lambda x: x[1])
        best_algorithm_name, best_score = sorted_names_and_scores[-1]
        best_model = trained_model_by_name[best_algorithm_name]

        self._console_log('Based on the scoring metric {}, the best algorithm found is: {}'.format(scoring_metric,
                                                                                                   best_algorithm_name))
        self._console_log('{} {} = {}'.format(best_algorithm_name, scoring_metric, best_score))

        return best_model
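The method above picks the single best-scoring model rather than combining the candidates. scikit-learn's own sklearn.ensemble.VotingClassifier offers the combining alternative; a minimal sketch, unrelated to healthcareai's internals:

# Sketch: combining heterogeneous classifiers with VotingClassifier
# instead of selecting the single best one. Illustration only.
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

voting = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=200)),
    ('knn', KNeighborsClassifier()),
], voting='soft')  # soft voting averages predicted probabilities
# usage: voting.fit(X_train, y_train); voting.predict(X_test)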