Python sklearn.ensemble.ExtraTreesClassifier() Examples

The following are 30 code examples showing how to use sklearn.ensemble.ExtraTreesClassifier(). These examples are extracted from open source projects.

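To set the stage, here is a minimal, self-contained sketch of the basic fit/score workflow (synthetic data from make_classification; the parameter values are illustrative, not recommendations):

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# Generate a small synthetic binary classification problem
X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit an extra-trees ensemble and evaluate on the held-out split
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print("Test accuracy:", clf.score(X_test, y_test))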

Example 1
Project: pyeo   Author: clcr   File: classification.py    License: GNU General Public License v3.0
def create_model_from_signatures(sig_csv_path, model_out, sig_datatype=np.int32):
    """
    Takes a .csv file containing class signatures - produced by extract_features_to_csv - and uses it to train
    and pickle a scikit-learn model.

    Parameters
    ----------
    sig_csv_path
        The path to the signatures file
    model_out
        The location to save the pickled model to.
    sig_datatype
        The datatype to read the csv as. Defaults to int32.

    Notes
    -----
    At present, the model is an ExtraTreesClassifier arrived at by tpot:
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                 min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    """
    model = ens.ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.55, min_samples_leaf=2,
                                     min_samples_split=16, n_estimators=100, n_jobs=4, class_weight='balanced')
    features, labels = load_signatures(sig_csv_path, sig_datatype)
    model.fit(features, labels)
    joblib.dump(model, model_out) 
Example 2
Project: mljar-supervised   Author: mljar   File: extra_trees.py    License: MIT License
def __init__(self, params):
        super(ExtraTreesAlgorithm, self).__init__(params)
        logger.debug("ExtraTreesAlgorithm.__init__")

        self.library_version = sklearn.__version__
        self.trees_in_step = additional.get("trees_in_step", 100)
        self.max_steps = additional.get("max_steps", 50)
        self.early_stopping_rounds = additional.get("early_stopping_rounds", 50)
        self.model = ExtraTreesClassifier(
            n_estimators=self.trees_in_step,
            criterion=params.get("criterion", "gini"),
            max_features=params.get("max_features", 0.6),
            min_samples_split=params.get("min_samples_split", 30),
            warm_start=True,
            n_jobs=-1,
            random_state=params.get("seed", 1),
        ) 
Example 3
Project: jh-kaggle-util   Author: jeffheaton   File: models.py    License: Apache License 2.0
def run_sklearn():
  n_trees = 100
  n_folds = 3

  # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
  alg_list = [
      ['lreg',LinearRegression()],
      ['rforest',RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_depth=3)],
      ['extree',ExtraTreesClassifier(n_estimators = 1000,max_depth=2)],
      ['adaboost',AdaBoostRegressor(base_estimator=None, n_estimators=600, learning_rate=1.0)],
      ['knn', sklearn.neighbors.KNeighborsRegressor(n_neighbors=5)]
  ]

  start_time = time.time()
  for name,alg in alg_list:
      train = jhkaggle.train_sklearn.TrainSKLearn("1",name,alg,False)
      train.run()
      train = None
  elapsed_time = time.time() - start_time
  print("Elapsed time: {}".format(jhkaggle.util.hms_string(elapsed_time))) 
Example 4
Project: jh-kaggle-util   Author: jeffheaton   File: models.py    License: Apache License 2.0
def run_sklearn():
  n_trees = 100
  n_folds = 3

  # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
  alg_list = [
      ['rforest',RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1, max_depth=3)],
      ['extree',ExtraTreesClassifier(n_estimators = 1000,max_depth=3,n_jobs=-1)],
      ['adaboost',AdaBoostClassifier(base_estimator=None, n_estimators=600, learning_rate=1.0)],
      ['knn', sklearn.neighbors.KNeighborsClassifier(n_neighbors=5,n_jobs=-1)]
  ]

  start_time = time.time()
  for name,alg in alg_list:
      train = jhkaggle.train_sklearn.TrainSKLearn("1",name,alg,False)
      train.run()
      train = None 
Example 5
Project: nni   Author: microsoft   File: benchmark_test.py    License: MIT License
def test_time(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())

    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())

    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    
    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    starttime = datetime.datetime.now()
    test_benchmark.run_test(pipeline, name, path)
    endtime = datetime.datetime.now()
    print("Used time: ", (endtime - starttime).microseconds/1000)
    print("") 
Example 6
Project: nni   Author: microsoft   File: sklearn_test.py    License: MIT License
def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()


    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train)) 
Example 7
Project: mltk-algo-contrib   Author: splunk   File: ExtraTreesClassifier.py    License: Apache License 2.0
def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            ints=['random_state', 'n_estimators', 'max_depth',
                  'min_samples_split', 'max_leaf_nodes'],
            strs=['max_features', 'criterion'],
        )

        if 'max_depth' not in out_params:
            out_params.setdefault('max_leaf_nodes', 2000)

        if 'max_features' in out_params:
            out_params['max_features'] = handle_max_features(out_params['max_features'])

        self.estimator = _ExtraTreesClassifier(class_weight='balanced',
                                                 **out_params) 
Example 8
Project: easyML   Author: aarshayj   File: models_classification.py    License: BSD 3-Clause "New" or "Revised" License
def __init__(
        self,data_block, predictors=[],cv_folds=10,
        scoring_metric='accuracy',additional_display_metrics=[]):

        base_classification.__init__(
            self, alg=ExtraTreesClassifier(), data_block=data_block, 
            predictors=predictors,cv_folds=cv_folds,
            scoring_metric=scoring_metric, 
            additional_display_metrics=additional_display_metrics)

        self.model_output = pd.Series(self.default_parameters)
        self.model_output['Feature_Importance'] = "-"
        self.model_output['OOB_Score'] = "-"

        #Set parameters to default values:
        self.set_parameters(set_default=True) 
Example 9
Project: fake-news-detection   Author: aldengolab   File: model_loop.py    License: MIT License
def define_clfs_params(self):
        '''
        Defines all relevant parameters and classes for classifier objects.
        Edit these if you wish to change parameters.
        '''
        # These are the classifiers
        self.clfs = {
            'RF': RandomForestClassifier(n_estimators = 50, n_jobs = -1),
            'ET': ExtraTreesClassifier(n_estimators = 10, n_jobs = -1, criterion = 'entropy'),
            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth = 1), algorithm = "SAMME", n_estimators = 200),  # max_depth takes a single int; the depth grid belongs in self.params
            'LR': LogisticRegression(penalty = 'l1', C = 1e5),
            'SVM': svm.SVC(kernel = 'linear', probability = True, random_state = 0),
            'GB': GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5, max_depth = 6, n_estimators = 10),
            'NB': GaussianNB(),
            'DT': DecisionTreeClassifier(),
            'SGD': SGDClassifier(loss = 'log', penalty = 'l2'),
            'KNN': KNeighborsClassifier(n_neighbors = 3)
            }
        # These are the parameters which will be run through
        self.params = {
             'RF':{'n_estimators': [1,10,100,1000], 'max_depth': [10, 15,20,30,40,50,60,70,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'LR': {'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10], 'random_state': [1]},
             'SGD': {'loss': ['log'], 'penalty': ['l2','l1','elasticnet'], 'random_state': [1]},
             'ET': {'n_estimators': [1,10,100,1000], 'criterion' : ['gini', 'entropy'], 'max_depth': [1,3,5,10,15], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000], 'random_state': [1]},
             'GB': {'n_estimators': [1,10,100,1000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100], 'random_state': [1]},
             'NB': {},
             'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,2,15,20,30,40,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10], 'random_state': [1]},
             'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear'], 'random_state': [1]},
             'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
             } 
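The 'ET' grid above is intended to be swept by a search routine; a minimal sketch (with synthetic stand-in data and a shortened grid) of how such a dictionary could drive GridSearchCV:

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=300, random_state=1)  # stand-in data
et_grid = {'n_estimators': [10, 100], 'criterion': ['gini', 'entropy'],
           'max_depth': [5, 10], 'random_state': [1]}
search = GridSearchCV(ExtraTreesClassifier(n_jobs=-1), et_grid, cv=3)
search.fit(X, y)
print(search.best_params_)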
Example 10
Project: malss   Author: canard0328   File: test.py    License: MIT License
def test_change_algorithms():
    X, y = make_classification(n_samples=1000,
                               n_features=10,
                               n_classes=2,
                               n_clusters_per_class=1,
                               random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    cls = MALSS('classification')
    cls.fit(X, y, algorithm_selection_only=True)
    algorithms = cls.get_algorithms()

    assert algorithms[0][0] == 'Support Vector Machine (RBF Kernel)'
    assert algorithms[1][0] == 'Random Forest'
    assert algorithms[2][0] == 'Logistic Regression'
    assert algorithms[3][0] == 'Decision Tree'
    assert algorithms[4][0] == 'k-Nearest Neighbors'

    cls.remove_algorithm(0)
    cls.remove_algorithm()
    algorithms = cls.get_algorithms()
    assert algorithms[0][0] == 'Random Forest'
    assert algorithms[1][0] == 'Logistic Regression'
    assert algorithms[2][0] == 'Decision Tree'

    from sklearn.ensemble import ExtraTreesClassifier as ET
    cls.add_algorithm(ET(n_jobs=3),
                      [{'n_estimators': [10, 30, 50],
                        'max_depth': [3, 5, None],
                        'max_features': [0.3, 0.6, 'auto']}],
                      'Extremely Randomized Trees')
    algorithms = cls.get_algorithms()
    assert algorithms[0][0] == 'Random Forest'
    assert algorithms[1][0] == 'Logistic Regression'
    assert algorithms[2][0] == 'Decision Tree'
    assert algorithms[3][0] == 'Extremely Randomized Trees' 
Example 11
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_forest.py    License: MIT License
def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_split=0.1)
        est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                   est.fit, X, y)
        for tree in est.estimators_:
            assert_equal(tree.min_impurity_split, 0.1) 
Example 12
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_forest.py    License: MIT License
def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param
            assert_equal(tree.min_impurity_decrease, 0.1) 
Example 13
Project: driverlessai-recipes   Author: h2oai   File: extra_trees.py    License: Apache License 2.0
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExtraTreesClassifier(**self.params)
        else:
            model = ExtraTreesRegressor(**self.params)

        # Replace missing values with a value smaller than all observed values
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        model.fit(X, y)
        importances = np.array(model.feature_importances_)
        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=self.params['n_estimators']) 
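The missing-value handling above goes through the datatable API; here is the same "replace missing values with something smaller than every observed value" trick in plain numpy, as a minimal sketch (the toy matrix is illustrative):

import numpy as np

X = np.array([[1.0, np.nan], [2.0, 5.0], [np.nan, 7.0]])
for j in range(X.shape[1]):
    col = X[:, j]                       # view into X, so edits apply in place
    finite = col[~np.isnan(col)]
    # Sentinel below every observed value; fall back if the column is all NaN
    fill = finite.min() - 1 if finite.size else -1e10
    col[np.isnan(col)] = fill
print(X)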
Example 14
def random_forest(train_vecs,y_train,test_vecs,y_test):
    clf = RandomForestClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf,storedpaths+'model_randomforest.pkl')
    test_scores=clf.score(test_vecs,y_test)
    return test_scores
    
# Train the ExtraTreesClassifier classification algorithm
Example 15
def extract_tree(train_vecs,y_train,test_vecs,y_test):
    clf = ExtraTreesClassifier(n_estimators=10, max_depth=10,min_samples_split=2,n_jobs=1,random_state=0)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf,storedpaths+'model_extracttree.pkl')
    test_scores=clf.score(test_vecs,y_test)
    return test_scores
    
# Train the GBDT (gradient-boosted decision tree) classification algorithm
Example 16
Project: Python-Machine-Learning-Cookbook-Second-Edition   Author: PacktPublishing   File: trainer.py    License: MIT License
def __init__(self, X, label_words):
        self.le = preprocessing.LabelEncoder()  
        self.clf = ExtraTreesClassifier(n_estimators=100, 
                max_depth=16, random_state=0)

        y = self.encode_labels(label_words)
        self.clf.fit(np.asarray(X), y) 
Example 17
Project: DataMiningCompetitionFirstPrize   Author: lzddzh   File: extra_trees.py    License: MIT License
def learn(x, y, test_x):
    cw = {"0":variables.weight_0_rf, "1000":variables.weight_1000_rf, "1500":variables.weight_1500_rf, "2000":variables.weight_2000_rf}
    clf = ExtraTreesClassifier(n_jobs = -1,
                                     n_estimators=variables.n_estimators_et,
                                     max_depth=variables.max_depth_et, random_state=0,
                                     min_samples_split=variables.min_samples_split_et,
                                     min_samples_leaf=variables.min_samples_leaf_et,
                                     max_features=variables.max_feature_et,
                                     max_leaf_nodes=variables.max_leaf_nodes_et,
                                     criterion=variables.criterion_et,
                                     min_impurity_split=variables.min_impurity_split_et,
                                     class_weight=variables.cw_et).fit(x, y)

    print "n_estimators=", variables.n_estimators_et,
    print "max_depth=", variables.max_depth_et,
    print "min_samples_split=", variables.min_samples_split_et,
    print "min_samples_leaf=", variables.min_samples_leaf_et,
    print "max_features=",variables.max_feature_et,
    print "max_leaf_nodes=",variables.max_leaf_nodes_et,
    print "criterion=",variables.criterion_et,
    print "min_impurity_split=",variables.min_impurity_split_et,
    print "class_weight=", variables.cw_et

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)
    return prediction_list,prediction_list_prob 
Example 18
Project: nni   Author: microsoft   File: benchmark_test.py    License: MIT License
def test_memory(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())

    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())

    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    
    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    test_benchmark.run_test(pipeline, name, path)
    print("") 
Example 19
Project: lale   Author: IBM   File: test_autoai_libs.py    License: Apache License 2.0
def test_FS2(self):
        from sklearn.ensemble import ExtraTreesClassifier
        trainable = lale.lib.autoai_libs.FS2(
            cols_ids_must_keep=[1],
            additional_col_count_to_keep=3,
            ptype='classification',
            eval_algo=ExtraTreesClassifier,
        )
        self.doTest(trainable, **self._iris) 
Example 20
Project: stacking   Author: ikki407   File: binary.py    License: MIT License
def build_model(self):
            return ExtraTreesClassifier(**self.params) 
Example 21
Project: stacking   Author: ikki407   File: multiclass.py    License: MIT License
def build_model(self):
            return ExtraTreesClassifier(**self.params) 
Example 22
Project: speedml   Author: Speedml   File: plot.py    License: MIT License
def importance(self):
        """
        Plot importance of features based on ExtraTreesClassifier.
        """
        Base.data_n()
        X = Base.train_n
        y = X[Base.target].copy()
        X = X.drop([Base.target], axis=1)
        model = ExtraTreesClassifier()
        model.fit(X, y)
        self._plot_importance(X.columns, model.feature_importances_) 
Example 23
Project: quantopian-ensemble-methods   Author: KhaledSharif   File: one-classifier.py    License: MIT License
def initialize(context):
    set_symbol_lookup_date('2012-01-01')
    
    # Parameters to be changed
    
    context.model = ExtraTreesClassifier(n_estimators=300)
    context.lookback = 14
    context.history_range = 1000
    context.beta_coefficient = 0.0
    context.percentage_change = 0.025
    context.maximum_leverage = 2.0
    context.number_of_stocks = 150
    context.maximum_pe_ratio = 8
    context.maximum_market_cap = 0.1e9
    context.starting_probability = 0.5
    
    # End of parameters

    schedule_function(create_model, date_rules.month_start(), time_rules.market_open())
    schedule_function(rebalance, date_rules.month_start(), time_rules.market_open())
    schedule_function(trade, date_rules.every_day(), time_rules.market_open())

    context.algorithm_returns = []
    context.longs = []
    context.shorts = []
    context.training_stocks = symbols('SPY')
    context.trading_stocks  = []
    context.beta = 1.0
    context.beta_list = []
    context.completed = False 
Example 24
Project: dr_droid   Author: ririhedou   File: GetMLPara.py    License: Apache License 2.0
def feature_importances(X,y):
    # The output is not stable because of the randomness.
    # Build a classification task using 3 informative features:
    #X, y = make_classification(n_samples=1000,n_features=10,n_informative=3,n_redundant=0,n_repeated=0,n_classes=2,random_state=0,shuffle=False)
    # Build a forest and compute the feature importances
    from sklearn.ensemble import ExtraTreesClassifier
    forest = ExtraTreesClassifier(n_estimators= 25, criterion = 'entropy' , random_state=None)
    forest.fit(X, y)
    importances = forest.feature_importances_

    std = np.std([tree.feature_importances_ for tree in forest.estimators_],axis=0)
    indices = np.argsort(importances)[::-1]
    # print (indices)
    # Print the feature ranking
    print("Feature ranking:")
    sum1 = 0.0
    for f in range(min(80, len(indices))):  # cap at the actual number of features
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
        sum1 = sum1 +  importances[indices[f]]
    print (sum1)
    # Plot the feature importances of the forest
    #width = 0.5
    x_len = range(len(importances))
    plt.figure()
    plt.title("Feature importances")
    plt.bar(x_len, importances[indices] ,color="r", yerr=std[indices], align="center")
    plt.xticks(x_len, indices)
    plt.xlim([-1, max(x_len)+1])
    plt.show()

######################################READ DATA#################################################### 
Example 25
Project: pghumor   Author: pln-fing-udelar   File: utilanalisis.py    License: Apache License 2.0
def tree_based_feature_selection(features, clases, nombres_features_ordenadas):
    print("Realizando tree-based feature selection")
    clf = ExtraTreesClassifier(n_estimators=1000)
    clf.fit(features, clases)

    imprimir_importancias(clf.feature_importances_, "Tree-based feature selection", nombres_features_ordenadas) 
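A related pattern (used in Examples 5 and 18) wraps the fitted forest in SelectFromModel so the importances become a feature mask rather than a printed ranking; a minimal sketch with synthetic data:

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=200, n_features=25, n_informative=5, random_state=0)
selector = SelectFromModel(ExtraTreesClassifier(n_estimators=100, random_state=0))
selector.fit(X, y)
# get_support() returns the boolean mask of features above the importance threshold
print("Kept features:", selector.get_support().sum(), "of", X.shape[1])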
Example 26
Project: pandas-ml   Author: pandas-ml   File: test_ensemble.py    License: BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.ensemble.AdaBoostClassifier,
                      ensemble.AdaBoostClassifier)
        self.assertIs(df.ensemble.AdaBoostRegressor,
                      ensemble.AdaBoostRegressor)
        self.assertIs(df.ensemble.BaggingClassifier,
                      ensemble.BaggingClassifier)
        self.assertIs(df.ensemble.BaggingRegressor,
                      ensemble.BaggingRegressor)
        self.assertIs(df.ensemble.ExtraTreesClassifier,
                      ensemble.ExtraTreesClassifier)
        self.assertIs(df.ensemble.ExtraTreesRegressor,
                      ensemble.ExtraTreesRegressor)

        self.assertIs(df.ensemble.GradientBoostingClassifier,
                      ensemble.GradientBoostingClassifier)
        self.assertIs(df.ensemble.GradientBoostingRegressor,
                      ensemble.GradientBoostingRegressor)

        self.assertIs(df.ensemble.IsolationForest,
                      ensemble.IsolationForest)

        self.assertIs(df.ensemble.RandomForestClassifier,
                      ensemble.RandomForestClassifier)
        self.assertIs(df.ensemble.RandomTreesEmbedding,
                      ensemble.RandomTreesEmbedding)
        self.assertIs(df.ensemble.RandomForestRegressor,
                      ensemble.RandomForestRegressor)

        self.assertIs(df.ensemble.VotingClassifier,
                      ensemble.VotingClassifier) 
Example 27
Project: ProFET   Author: ddofer   File: PipeTasks.py    License: GNU General Public License v3.0
def GetKFeatures(filename, method='RFE',kbest=30,alpha=0.01, reduceMatrix = True):
    '''
    Gets best features using chosen method
    (K-best, RFE, RFECV,'L1' (RandomizedLogisticRegression),'Tree' (ExtraTreesClassifier), mrmr),
    then prints top K features' names (from featNames).
    If reduceMatrix =  True, then also returns X reduced to the K best features.

    Available method names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression', 'K-best', 'ExtraTreesClassifier'.
    Note that effectively any scikit-learn method could be used, if correctly imported.
    '''
    #est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test , more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv"
    '''
    features, labels, lb_encoder,featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names=lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X,y)
    selectK_mask=selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:',X.shape)
    print("K_featnames: %s" %(K_featnames))
    if reduceMatrix:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

#WORKS! But unreadable with too many features! 
Example 28
Project: python-urbanPlanning   Author: richieBao   File: imgPred_training.py    License: MIT License
def __init__(self, X, label_words):
        self.le=preprocessing.LabelEncoder()
        self.clf=ExtraTreesClassifier(n_estimators=100,max_depth=16,random_state=0) #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        y=self.encode_labels(label_words)
        self.clf.fit(np.asarray(X),y)
        with open('clf.pkl', 'wb') as f:  # persist the trained image classifier model
            pickle.dump(self.clf, f) 
Example 29
Project: Auto-PyTorch   Author: automl   File: baselines.py    License: Apache License 2.0
def fit(self, X_train, y_train, X_val, y_val):
        results = dict()

        self.all_nan = np.all(np.isnan(X_train), axis=0)
        X_train = X_train[:, ~self.all_nan]
        X_val = X_val[:, ~self.all_nan]

        X_train = np.nan_to_num(X_train)
        X_val = np.nan_to_num(X_val)

        self.config["warm_start"] = False
        self.num_classes = len(np.unique(y_train))
        if self.num_classes>2:
            print("==> Using warmstarting for multiclass")
            final_n_estimators = self.config["n_estimators"]
            self.config["n_estimators"] = 8
            self.config["warm_start"] = True

        self.model = ExtraTreesClassifier(**self.config)

        self.model.fit(X_train, y_train)
        if self.config["warm_start"]:
            self.model.n_estimators = final_n_estimators
            self.model.fit(X_train, y_train)


        pred_val_probas = self.model.predict_proba(X_val)

        pred_train = self.model.predict(X_train)
        pred_val = self.model.predict(X_val)

        results["train_acc"] = metrics.accuracy_score(y_train, pred_train)
        results["train_balanced_acc"] = metrics.balanced_accuracy_score(y_train, pred_train)
        results["val_acc"] = metrics.accuracy_score(y_val, pred_val)
        results["val_balanced_acc"] = metrics.balanced_accuracy_score(y_val, pred_val)
        results["val_preds"] = pred_val_probas.tolist()
        results["labels"] = y_val.tolist()

        return results 
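The warm-start trick above (fit a small forest first, then raise n_estimators and refit so new trees are added rather than retrained from scratch) can be isolated into a minimal sketch; the sizes here are illustrative:

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

X, y = make_classification(n_samples=400, n_classes=3, n_informative=4, random_state=0)
clf = ExtraTreesClassifier(n_estimators=8, warm_start=True, random_state=0)
clf.fit(X, y)            # first small batch of trees
clf.n_estimators = 100   # grow the forest; already-fitted trees are kept
clf.fit(X, y)
print(len(clf.estimators_))  # 100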
Example 30
Project: twitter-stock-recommendation   Author: alvarobartt   File: test_forest.py    License: MIT License
def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_split=0.1)
        est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                   est.fit, X, y)
        for tree in est.estimators_:
            assert_equal(tree.min_impurity_split, 0.1)