Python sklearn.ensemble.ExtraTreesClassifier() Examples

The following are 30 code examples of sklearn.ensemble.ExtraTreesClassifier(), drawn from open-source projects. Each example notes its original project and source file. You may also want to check out all available functions and classes of the sklearn.ensemble module, or try the search function.
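
As a quick orientation, here is a minimal, self-contained sketch of the basic fit/predict workflow on synthetic data. It is not taken from any of the projects below; the dataset and parameters are illustrative only.

# Minimal sketch: train an ExtraTreesClassifier on synthetic data.
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print("Test accuracy:", clf.score(X_test, y_test))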
Example #1
Source File: ExtraTreesClassifier.py    From mltk-algo-contrib with Apache License 2.0
def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            ints=['random_state', 'n_estimators', 'max_depth',
                  'min_samples_split', 'max_leaf_nodes'],
            strs=['max_features', 'criterion'],
        )

        if 'max_depth' not in out_params:
            out_params.setdefault('max_leaf_nodes', 2000)

        if 'max_features' in out_params:
            out_params['max_features'] = handle_max_features(out_params['max_features'])

        self.estimator = _ExtraTreesClassifier(class_weight='balanced',
                                                 **out_params) 
Example #2
Source File: classification.py    From pyeo with GNU General Public License v3.0
def create_model_from_signatures(sig_csv_path, model_out, sig_datatype=np.int32):
    """
    Takes a .csv file containing class signatures - produced by extract_features_to_csv - and uses it to train
    and pickle a scikit-learn model.

    Parameters
    ----------
    sig_csv_path
        The path to the signatures file
    model_out
        The location to save the pickled model to.
    sig_datatype
        The datatype to read the csv as. Defaults to int32.

    Notes
    -----
    At present, the model is an ExtraTreesClassifier arrived at by tpot:
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                 min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    """
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                     min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    features, labels = load_signatures(sig_csv_path, sig_datatype)
    model.fit(features, labels)
    joblib.dump(model, model_out) 
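
A minimal usage sketch for the function above; the file paths are placeholders, and reloading via joblib mirrors the dump call at the end of the function.

# Hypothetical usage: train from a signature CSV, then reload for prediction.
create_model_from_signatures('signatures.csv', 'model.pkl')
model = joblib.load('model.pkl')
# model.predict(...) now classifies rows shaped like the training signatures.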
Example #3
Source File: sklearn_test.py    From nni with MIT License
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
    with open('train.svm', 'wt') as f_svm:
        f_svm.write(data.decode('utf-8'))


    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train)) 
Example #4
Source File: benchmark_test.py    From nni with MIT License
def test_time(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())

    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())

    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    
    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    starttime = datetime.datetime.now()
    test_benchmark.run_test(pipeline, name, path)
    endtime = datetime.datetime.now()
    print("Used time: ", (endtime - starttime).microseconds/1000)
    print("") 
Example #5
Source File: models_classification.py    From easyML with BSD 3-Clause "New" or "Revised" License
def __init__(
        self,data_block, predictors=[],cv_folds=10,
        scoring_metric='accuracy',additional_display_metrics=[]):

        base_classification.__init__(
            self, alg=ExtraTreesClassifier(), data_block=data_block, 
            predictors=predictors,cv_folds=cv_folds,
            scoring_metric=scoring_metric, 
            additional_display_metrics=additional_display_metrics)

        self.model_output = pd.Series(self.default_parameters)
        self.model_output['Feature_Importance'] = "-"
        self.model_output['OOB_Score'] = "-"

        #Set parameters to default values:
        self.set_parameters(set_default=True) 
Example #6
Source File: PipeTasks.py    From ProFET with GNU General Public License v3.0
def GetKFeatures(filename, method='RFE',kbest=30,alpha=0.01, reduceMatrix = True):
    '''
    Gets the best features using the chosen method
    (K-best, RFE, RFECV, 'L1' (RandomizedLogisticRegression), 'Tree' (ExtraTreesClassifier), mrmr),
    then prints the top K features' names (from featNames).
    If reduceMatrix = True, also returns X reduced to the K best features.

    Available method names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression', 'K-best', 'ExtraTreesClassifier'.
    Note that effectively any scikit-learn method could be used, if correctly imported.
    '''
    #est = method()
    '''
    Gets the K best features (filtered by FDR, then the best ranked by t-test; more advanced options can be implemented).
    Saves the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv".
    '''
    features, labels, lb_encoder,featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names=lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X,y)
    selectK_mask=selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:',X.shape)
    print("K_featnames: %s" %(K_featnames))
    if reduceMatrix:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

#WORKS! But unreadable with too many features! 
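
The docstring above lists a 'Tree' (ExtraTreesClassifier) option, but the body only implements the K-best filter. Below is a minimal sketch of what that branch could look like with sklearn's SelectFromModel, reusing X, y, featureNames and kbest from the function; this is an assumption, not part of the original source.

    # Hypothetical tree-based branch (not in the original function):
    import numpy as np
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    selector = SelectFromModel(ExtraTreesClassifier(n_estimators=50),
                               max_features=kbest, threshold=-np.inf)  # keep the top-k by importance
    selector.fit(X, y)
    tree_mask = selector.get_support()  # boolean mask over feature columns
    print("Tree-selected features: %s" % featureNames[tree_mask])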
Example #7
Source File: models.py    From jh-kaggle-util with Apache License 2.0
def run_sklearn():
  n_trees = 100
  n_folds = 3

  # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
  alg_list = [
      ['lreg',LinearRegression()],
      ['rforest',RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_depth=3)],
      ['extree',ExtraTreesClassifier(n_estimators = 1000,max_depth=2)],
      ['adaboost',AdaBoostRegressor(base_estimator=None, n_estimators=600, learning_rate=1.0)],
      ['knn', sklearn.neighbors.KNeighborsRegressor(n_neighbors=5)]
  ]

  start_time = time.time()
  for name,alg in alg_list:
      train = jhkaggle.train_sklearn.TrainSKLearn("1",name,alg,False)
      train.run()
      train = None
  elapsed_time = time.time() - start_time
  print("Elapsed time: {}".format(jhkaggle.util.hms_string(elapsed_time))) 
Example #8
Source File: extra_trees.py    From mljar-supervised with MIT License
def __init__(self, params):
        super(ExtraTreesAlgorithm, self).__init__(params)
        logger.debug("ExtraTreesAlgorithm.__init__")

        self.library_version = sklearn.__version__
        self.trees_in_step = additional.get("trees_in_step", 100)
        self.max_steps = additional.get("max_steps", 50)
        self.early_stopping_rounds = additional.get("early_stopping_rounds", 50)
        self.model = ExtraTreesClassifier(
            n_estimators=self.trees_in_step,
            criterion=params.get("criterion", "gini"),
            max_features=params.get("max_features", 0.6),
            min_samples_split=params.get("min_samples_split", 30),
            warm_start=True,
            n_jobs=-1,
            random_state=params.get("seed", 1),
        ) 
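
Because warm_start=True, every later call to fit() with a larger n_estimators grows the existing forest instead of retraining it, which is what makes the trees_in_step / early-stopping scheme above cheap. A minimal standalone sketch; X_train, y_train and the stopping rule are placeholders.

# Sketch: grow an ExtraTreesClassifier incrementally via warm_start.
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_estimators=100, warm_start=True, n_jobs=-1, random_state=1)
for step in range(1, 6):              # stand-in for max_steps
    clf.n_estimators = 100 * step     # add 100 trees (trees_in_step) per round
    clf.fit(X_train, y_train)         # only the newly added trees are fitted
    # ...evaluate on a validation set and break once the score stops improving...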
Example #9
Source File: GetMLPara.py    From dr_droid with Apache License 2.0
def feature_importances(X,y):
    # the output is not stable because of the randomness
    # Build a classification task using 3 informative features
    #X, y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, n_classes=2, random_state=0, shuffle=False)
    # Build a forest and compute the feature importances
    from sklearn.ensemble import ExtraTreesClassifier
    forest = ExtraTreesClassifier(n_estimators= 25, criterion = 'entropy' , random_state=None)
    forest.fit(X, y)
    importances = forest.feature_importances_

    std = np.std([tree.feature_importances_ for tree in forest.estimators_],axis=0)
    indices = np.argsort(importances)[::-1]
    # print (indices)
    # Print the feature ranking
    print("Feature ranking:")
    sum1 = 0.0
    for f in range(min(80, len(indices))):  # top 80 features, or all if fewer
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
        sum1 += importances[indices[f]]
    print(sum1)
    # Plot the feature importances of the forest
    #width = 0.5
    x_len = range(len(importances))
    plt.figure()
    plt.title("Feature importances")
    plt.bar(x_len, importances[indices] ,color="r", yerr=std[indices], align="center")
    plt.xticks(x_len, indices)
    plt.xlim([-1, max(x_len)+1])
    plt.show()

######################################READ DATA#################################################### 
Example #10
Source File: binary.py    From stacking with MIT License
def build_model(self):
            return ExtraTreesClassifier(**self.params) 
Example #11
Source File: multiclass.py    From stacking with MIT License
def build_model(self):
            return ExtraTreesClassifier(**self.params) 
Example #12
Source File: plot.py    From speedml with MIT License
def importance(self):
        """
        Plot importance of features based on ExtraTreesClassifier.
        """
        Base.data_n()
        X = Base.train_n
        y = X[Base.target].copy()
        X = X.drop([Base.target], axis=1)
        model = ExtraTreesClassifier()
        model.fit(X, y)
        self._plot_importance(X.columns, model.feature_importances_) 
Example #13
Source File: one-classifier.py    From quantopian-ensemble-methods with MIT License
def initialize(context):
    set_symbol_lookup_date('2012-01-01')
    
    # Parameters to be changed
    
    context.model = ExtraTreesClassifier(n_estimators=300)
    context.lookback = 14
    context.history_range = 1000
    context.beta_coefficient = 0.0
    context.percentage_change = 0.025
    context.maximum_leverage = 2.0
    context.number_of_stocks = 150
    context.maximum_pe_ratio = 8
    context.maximum_market_cap = 0.1e9
    context.starting_probability = 0.5
    
    # End of parameters

    schedule_function(create_model, date_rules.month_start(), time_rules.market_open())
    schedule_function(rebalance, date_rules.month_start(), time_rules.market_open())
    schedule_function(trade, date_rules.every_day(), time_rules.market_open())

    context.algorithm_returns = []
    context.longs = []
    context.shorts = []
    context.training_stocks = symbols('SPY')
    context.trading_stocks  = []
    context.beta = 1.0
    context.beta_list = []
    context.completed = False 
Example #14
Source File: test_forest.py    From twitter-stock-recommendation with MIT License
def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param
            assert_equal(tree.min_impurity_decrease, 0.1) 
Example #15
Source File: utilanalisis.py    From pghumor with Apache License 2.0
def tree_based_feature_selection(features, clases, nombres_features_ordenadas):
    print("Realizando tree-based feature selection")
    clf = ExtraTreesClassifier(n_estimators=1000)
    clf.fit(features, clases)

    imprimir_importancias(clf.feature_importances_, "Tree-based feature selection", nombres_features_ordenadas) 
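
imprimir_importancias ("print importances") is project-internal; here is a hedged sketch of what it presumably does, pairing each feature name with its importance and printing them in descending order.

# Hypothetical shape of imprimir_importancias (an assumption, not the pghumor source):
def print_importances(importances, title, feature_names):
    print(title)
    for name, imp in sorted(zip(feature_names, importances), key=lambda t: -t[1]):
        print("%s: %.4f" % (name, imp))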
Example #16
Source File: test_ensemble.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.ensemble.AdaBoostClassifier,
                      ensemble.AdaBoostClassifier)
        self.assertIs(df.ensemble.AdaBoostRegressor,
                      ensemble.AdaBoostRegressor)
        self.assertIs(df.ensemble.BaggingClassifier,
                      ensemble.BaggingClassifier)
        self.assertIs(df.ensemble.BaggingRegressor,
                      ensemble.BaggingRegressor)
        self.assertIs(df.ensemble.ExtraTreesClassifier,
                      ensemble.ExtraTreesClassifier)
        self.assertIs(df.ensemble.ExtraTreesRegressor,
                      ensemble.ExtraTreesRegressor)

        self.assertIs(df.ensemble.GradientBoostingClassifier,
                      ensemble.GradientBoostingClassifier)
        self.assertIs(df.ensemble.GradientBoostingRegressor,
                      ensemble.GradientBoostingRegressor)

        self.assertIs(df.ensemble.IsolationForest,
                      ensemble.IsolationForest)

        self.assertIs(df.ensemble.RandomForestClassifier,
                      ensemble.RandomForestClassifier)
        self.assertIs(df.ensemble.RandomTreesEmbedding,
                      ensemble.RandomTreesEmbedding)
        self.assertIs(df.ensemble.RandomForestRegressor,
                      ensemble.RandomForestRegressor)

        self.assertIs(df.ensemble.VotingClassifier,
                      ensemble.VotingClassifier) 
Example #17
Source File: imgPred_training.py    From python-urbanPlanning with MIT License
def __init__(self, X, label_words):
        self.le = preprocessing.LabelEncoder()
        self.clf = ExtraTreesClassifier(n_estimators=100, max_depth=16, random_state=0)  # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        y = self.encode_labels(label_words)
        self.clf.fit(np.asarray(X), y)
        with open('clf.pkl', 'wb') as f:  # save the trained image classifier model
            pickle.dump(self.clf, f)
Example #18
Source File: baselines.py    From Auto-PyTorch with Apache License 2.0
def fit(self, X_train, y_train, X_val, y_val):
        results = dict()

        self.all_nan = np.all(np.isnan(X_train), axis=0)
        X_train = X_train[:, ~self.all_nan]
        X_val = X_val[:, ~self.all_nan]

        X_train = np.nan_to_num(X_train)
        X_val = np.nan_to_num(X_val)

        self.config["warm_start"] = False
        self.num_classes = len(np.unique(y_train))
        if self.num_classes>2:
            print("==> Using warmstarting for multiclass")
            final_n_estimators = self.config["n_estimators"]
            self.config["n_estimators"] = 8
            self.config["warm_start"] = True

        self.model = ExtraTreesClassifier(**self.config)

        self.model.fit(X_train, y_train)
        if self.config["warm_start"]:
            self.model.n_estimators = final_n_estimators
            self.model.fit(X_train, y_train)


        pred_val_probas = self.model.predict_proba(X_val)

        pred_train = self.model.predict(X_train)
        pred_val = self.model.predict(X_val)

        results["train_acc"] = metrics.accuracy_score(y_train, pred_train)
        results["train_balanced_acc"] = metrics.balanced_accuracy_score(y_train, pred_train)
        results["val_acc"] = metrics.accuracy_score(y_val, pred_val)
        results["val_balanced_acc"] = metrics.balanced_accuracy_score(y_val, pred_val)
        results["val_preds"] = pred_val_probas.tolist()
        results["labels"] = y_val.tolist()

        return results 
Example #19
Source File: test_forest.py    From twitter-stock-recommendation with MIT License
def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_split=0.1)
        est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                   est.fit, X, y)
        for tree in est.estimators_:
            assert_equal(tree.min_impurity_split, 0.1) 
Example #20
Source File: model_loop.py    From fake-news-detection with MIT License
def define_clfs_params(self):
        '''
        Defines all relevant parameters and classes for classifier objects.
        Edit these if you wish to change parameters.
        '''
        # These are the classifiers
        self.clfs = {
            'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
            'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = 1), algorithm = "SAMME", n_estimators = 200),  # base-tree depth must be a single int; depth-1 stumps are the usual AdaBoost base
            'LR': LogisticRegression(penalty = 'l1', C = 1e5),
            'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
            'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
            'NB': GaussianNB(),
            'DT': DecisionTreeClassifier(),
            'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
            'KNN': KNeighborsClassifier(n_neighbors = 3)
            }
        # These are the parameters which will be run through
        self.params = {
             'RF':{'n_estimators': [1,10,100,1000], 'max_depth': [10, 15,20,30,40,50,60,70,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'LR': {'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'random_state': [1]},
             'SGD': {'loss': ['log'], 'penalty': ['l2','l1','elasticnet'], 'random_state': [1]},
             'ET': {'n_estimators': [1,10,100,1000], 'criterion' : ['gini', 'entropy'], 'max_depth': [1,3,5,10,15], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000], 'random_state': [1]},
             'GB': {'n_estimators': [1,10,100,1000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100], 'random_state': [1]},
             'NB': {},
             'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,2,15,20,30,40,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear'], 'random_state': [1]},
             'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
             } 
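
The two dictionaries pair each classifier with its search grid; below is a minimal sketch of how a model loop might consume them with sklearn's ParameterGrid. The loop itself is an assumption and is not shown in this file.

# Sketch: fit every classifier under every setting in its grid.
from sklearn.model_selection import ParameterGrid

def run_grid(clfs, params, X_train, y_train, X_test, y_test):
    for key, clf in clfs.items():
        for setting in ParameterGrid(params.get(key, {})):
            clf.set_params(**setting)  # apply this candidate configuration
            clf.fit(X_train, y_train)
            print(key, setting, clf.score(X_test, y_test))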
Example #21
Source File: test_autoai_libs.py    From lale with Apache License 2.0
def test_FS2(self):
        from sklearn.ensemble import ExtraTreesClassifier
        trainable = lale.lib.autoai_libs.FS2(
            cols_ids_must_keep=[1],
            additional_col_count_to_keep=3,
            ptype='classification',
            eval_algo=ExtraTreesClassifier,
        )
        self.doTest(trainable, **self._iris) 
Example #22
Source File: benchmark_test.py    From nni with MIT License
def test_memory(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())

    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())

    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    
    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    test_benchmark.run_test(pipeline, name, path)
    print("") 
Example #23
Source File: models.py    From jh-kaggle-util with Apache License 2.0
def run_sklearn():
  n_trees = 100
  n_folds = 3

  # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
  alg_list = [
      ['rforest',RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1, max_depth=3)],
      ['extree',ExtraTreesClassifier(n_estimators = 1000,max_depth=3,n_jobs=-1)],
      ['adaboost',AdaBoostClassifier(base_estimator=None, n_estimators=600, learning_rate=1.0)],
      ['knn', sklearn.neighbors.KNeighborsClassifier(n_neighbors=5,n_jobs=-1)]
  ]

  start_time = time.time()
  for name,alg in alg_list:
      train = jhkaggle.train_sklearn.TrainSKLearn("1",name,alg,False)
      train.run()
      train = None 
Example #24
Source File: extra_trees.py    From DataMiningCompetitionFirstPrize with MIT License
def learn(x, y, test_x):
    cw = {"0":variables.weight_0_rf, "1000":variables.weight_1000_rf, "1500":variables.weight_1500_rf, "2000":variables.weight_2000_rf}
    clf = ExtraTreesClassifier(n_jobs = -1,
                                     n_estimators=variables.n_estimators_et,
                                     max_depth=variables.max_depth_et, random_state=0,
                                     min_samples_split=variables.min_samples_split_et,
                                     min_samples_leaf=variables.min_samples_leaf_et,
                                     max_features=variables.max_feature_et,
                                     max_leaf_nodes=variables.max_leaf_nodes_et,
                                     criterion=variables.criterion_et,
                                     min_impurity_split=variables.min_impurity_split_et,
                                     class_weight=variables.cw_et).fit(x, y)

    print "n_estimators=", variables.n_estimators_et,
    print "max_depth=", variables.max_depth_et,
    print "min_samples_split=", variables.min_samples_split_et,
    print "min_samples_leaf=", variables.min_samples_leaf_et,
    print "max_features=",variables.max_feature_et,
    print "max_leaf_nodes=",variables.max_leaf_nodes_et,
    print "criterion=",variables.criterion_et,
    print "min_impurity_split=",variables.min_impurity_split_et,
    print "class_weight=", variables.cw_et

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)
    return prediction_list,prediction_list_prob 
Example #25
Source File: trainer.py    From Python-Machine-Learning-Cookbook-Second-Edition with MIT License
def __init__(self, X, label_words):
        self.le = preprocessing.LabelEncoder()  
        self.clf = ExtraTreesClassifier(n_estimators=100, 
                max_depth=16, random_state=0)

        y = self.encode_labels(label_words)
        self.clf.fit(np.asarray(X), y) 
Example #26
Source File: sentiment_analysis_ml.py    From Sentiment_Analysis_cnn_lstm_cnnlstm_textcnn_bilstm with Apache License 2.0
def extract_tree(train_vecs,y_train,test_vecs,y_test):
    clf = ExtraTreesClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf,storedpaths+'model_extracttree.pkl')
    test_scores=clf.score(test_vecs,y_test)
    return test_scores
    
# Train the GBDT classifier
Example #27
Source File: sentiment_analysis_ml.py    From Sentiment_Analysis_cnn_lstm_cnnlstm_textcnn_bilstm with Apache License 2.0
def random_forest(train_vecs,y_train,test_vecs,y_test):
    clf = RandomForestClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf,storedpaths+'model_randomforest.pkl')
    test_scores=clf.score(test_vecs,y_test)
    return test_scores
    
# Train the ExtraTreesClassifier
Example #28
Source File: extra_trees.py    From driverlessai-recipes with Apache License 2.0
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExtraTreesClassifier(**self.params)
        else:
            model = ExtraTreesRegressor(**self.params)

        # Replace missing values with a value smaller than all observed values
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        model.fit(X, y)
        importances = np.array(model.feature_importances_)
        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=self.params['n_estimators']) 
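
The loop above replaces missing values in each column with one less than the observed minimum, so a single split can isolate them; the same idea in plain numpy as a minimal sketch (the datatable calls in the original are project-specific).

# Sketch: sentinel-impute NaNs with (column minimum - 1).
import numpy as np

def sentinel_impute(X):
    X = X.astype(float).copy()
    for col in range(X.shape[1]):
        observed = X[~np.isnan(X[:, col]), col]
        fill = observed.min() - 1 if observed.size else -1e10  # fallback for all-NaN columns
        X[np.isnan(X[:, col]), col] = fill
    return X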
Example #29
Source File: test_forest.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param
            assert_equal(tree.min_impurity_decrease, 0.1) 
Example #30
Source File: test_forest.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_split=0.1)
        est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                   est.fit, X, y)
        for tree in est.estimators_:
            assert_equal(tree.min_impurity_split, 0.1)