Python sklearn.ensemble.RandomForestClassifier() Examples

The following are 30 code examples of sklearn.ensemble.RandomForestClassifier(), drawn from open-source projects; the source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the sklearn.ensemble module.
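Before the project-specific examples, here is a minimal, self-contained sketch of the basic fit/predict workflow. The dataset and parameter values below are illustrative only and are not taken from any of the listed projects.

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Illustrative data: any (n_samples, n_features) X and 1-D y will do
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))   # mean accuracy on the held-out split
print(clf.feature_importances_)    # impurity-based feature importances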
Example #1
Source File: mmbot.py    From MaliciousMacroBot with MIT License
def mmb_evaluate_model(self):
        """
        Returns scores from cross validation evaluation on the malicious / benign classifier
        """
        predictive_features = self.features['predictive_features']
        self.clf_X = self.modeldata[predictive_features].values
        self.clf_y = np.array(self.modeldata['label'])

        X_train, X_test, y_train, y_test = train_test_split(self.clf_X, self.clf_y, test_size=0.2, random_state=0)
        lb = LabelBinarizer()
        y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
        eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)
        eval_cls.fit(X_train, y_train)

        recall = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='recall')
        precision = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='precision')
        accuracy = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='accuracy')
        f1_score = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='f1_macro')

        return {'accuracy': accuracy, 'f1': f1_score, 'precision': precision, 'recall': recall} 
Example #2
Source File: forest.py    From cgpm with Apache License 2.0
def __init__(self, outputs, inputs, k=None, hypers=None, params=None,
            distargs=None, rng=None):
        self.rng = gu.gen_rng() if rng is None else rng
        self.outputs = outputs
        self.inputs = inputs
        assert len(self.outputs) == 1
        assert len(self.inputs) >= 1
        assert self.outputs[0] not in self.inputs
        assert len(distargs['inputs']['stattypes']) == len(self.inputs)
        self.stattypes = distargs['inputs']['stattypes']
        # Number of output categories and input dimension.
        # XXX WHATTA HACK. BayesDB passes in top-level kwargs, not in distargs.
        self.k = k if k is not None else int(distargs['k'])
        self.p = len(distargs['inputs']['stattypes'])
        # Sufficient statistics.
        self.N = 0
        self.data = Data(x=OrderedDict(), Y=OrderedDict())
        self.counts = [0] * self.k
        # Outlier and random forest parameters.
        if params is None: params = {}
        self.alpha = params.get('alpha', .1)
        self.regressor = params.get('forest', None)
        if self.regressor is None:
            self.regressor = RandomForestClassifier(random_state=self.rng) 
Example #3
Source File: classifier.py    From stock-price-prediction with MIT License
def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data
    """
    features = dataset.columns[0:-1]

    if method == 'RNN':
        clf = performRNNlass(dataset[features], dataset['UpDown'])
        return clf

    elif method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)

    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()

    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)

    elif method == 'ADA':
        clf = AdaBoostClassifier()

    return clf.fit(dataset[features], dataset['UpDown']) 
Example #4
Source File: 03_fit_predict_plot_midwest_survey.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])

    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over the encoding methods, scoring each pipeline with a
# cross-validation score, as reconstructed in the sketch below:
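The loop itself is truncated in this excerpt. A plausible reconstruction follows; the encoding-method names and the X/y variables are assumptions for illustration, not part of the original script.

from sklearn.model_selection import cross_val_score

# Hypothetical method names; in the original script these are keys of
# encoder_dict. X holds the survey features, y the target column.
for method in ['one-hot', 'similarity']:
    pipeline = make_pipeline(method)
    scores = cross_val_score(pipeline, X, y, cv=5)
    print(method, scores.mean())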
Example #5
Source File: vanilla_model.py    From OpenChem with MIT License
def __init__(self, model_type='classifier', feature_type='fingerprints',
                 n_estimators=100, n_ensemble=5):
        super(RandomForestQSAR, self).__init__()
        self.n_estimators = n_estimators
        self.n_ensemble = n_ensemble
        self.model = []
        self.model_type = model_type
        if self.model_type == 'classifier':
            for i in range(n_ensemble):
                self.model.append(RFC(n_estimators=n_estimators))
        elif self.model_type == 'regressor':
            for i in range(n_ensemble):
                self.model.append(RFR(n_estimators=n_estimators))
        else:
            raise ValueError('invalid value for argument')
        self.feature_type = feature_type
        if self.feature_type == 'descriptors':
            self.calc = Calculator(descriptors, ignore_3D=True)
            self.desc_mean = [0]*self.n_ensemble 
Example #6
Source File: adult_RF_Classify.py    From Machine-Learning-for-Beginner-by-Python3 with MIT License
def Train(data, treecount, tezh, yanzhgdata):
    model = RFC(n_estimators=treecount, max_features=tezh, class_weight='balanced')
    model.fit(data[:, :-1], data[:, -1])
    # Predictions on the training data
    train_out = model.predict(data[:, :-1])
    # Compute the MSE
    train_mse = fmse(data[:, -1], train_out)[0]

    # Predictions on the validation data
    add_yan = model.predict(yanzhgdata[:, :-1])
    # Compute the F1 metric
    add_mse = fmse(yanzhgdata[:, -1], add_yan)[0]
    print(train_mse, add_mse)
    return train_mse, add_mse

# Function that finalizes the combination
Example #7
Source File: test_train_pairwise_similarity_model.py    From redshells with MIT License
def test_run(self):
        self.input_data['item2embedding'] = dict(i0=[1, 2], i1=[3, 4])
        self.input_data['similarity_data'] = pd.DataFrame(
            dict(item1=['i0', 'i0', 'i1'], item2=['i0', 'i1', 'i1'], similarity=[1, 0, 1]))

        task = TrainPairwiseSimilarityModel(
            item2embedding_task=_DummyTask(),
            similarity_data_task=_DummyTask(),
            model_name='RandomForestClassifier',
            item0_column_name='item1',
            item1_column_name='item2',
            similarity_column_name='similarity')
        task.load = MagicMock(side_effect=self._load)
        task.dump = MagicMock(side_effect=self._dump)

        task.run()
        self.assertIsInstance(self.dump_data, RandomForestClassifier) 
Example #8
Source File: function.py    From Karta with MIT License
def trainFunctionTypeClassifier(self, scs):
        """Train the type classifier, according to all known code segments.

        Args:
            scs (list): list of all known (sark) code segments

        Note:
            Training must happen *after* the calibration phase
        """
        functions = []
        for sc in scs:
            functions += list(filter(lambda func: not self._analyzer.fptr_identifier.isPointedFunction(func.start_ea), sc.functions))
        clf = RandomForestClassifier(n_estimators=100)
        eas = list(map(lambda x: x.start_ea, functions))
        data_set = list(map(self.extractFunctionTypeSample, eas))
        data_results = list(map(self._analyzer.codeType, eas))
        # classify
        clf.fit(data_set, data_results)
        # store the results
        self._type_classifier = clf 
Example #9
Source File: Stock_Prediction_Model_Random_Forrest.py    From StockRecommendSystem with MIT License
def build_model(self, X_train, y_train):
        if self.paras.load == True:
            model = self.load_training_model(self.paras.window_len)
            if model != None:
                return model

        print('build Random Forest model...')

        # range of number of trees : 5*(1 -> 10) = 5,10,...,50 trees
        t_min = self.paras.tree_min[index]
        t_max = self.paras.tree_max[index]
        # range of max of features : 1 -> 10 features
        f_min = self.paras.feature_min[index]
        f_max = self.paras.feature_max[index]
        # range of window : 1 -> 70 days 
        w_min = self.paras.window_min
        w_max = self.paras.window_max
        
        w_opt, n_opt, m_opt = self.best_window(X_train, y_train, w_min,w_max,t_min,t_max,f_min,f_max)
        model = RandomForestClassifier(n_estimators=n_opt,max_features=m_opt, n_jobs=8, verbose=self.paras.verbose)
        return model 
Example #10
Source File: test_random_forest_classifier.py    From monasca-analytics with Apache License 2.0
def test_learn_structure(self):
        data = self.get_testing_data()
        clf = self.rf_sml.learn_structure(data)
        self.assertIsInstance(clf, ensemble.RandomForestClassifier) 
Example #11
Source File: Blending_Classify_adult.py    From Machine-Learning-for-Beginner-by-Python3 with MIT License
def RF_First(self, data, n_estimators=800, max_features='sqrt'):
        # Train on the training data; return predictions for the
        # validation and prediction datasets
        model = RF(n_estimators=n_estimators, max_features=max_features)
        model.fit(data['train'][:, :-1], data['train'][:, -1])
        # Store the results for the validation and prediction datasets
        # Predictions on the training set
        xul = model.predict(data['train'][:, :-1])
        # Predictions on the validation set
        yanre = model.predict(data['test'][:, :-1])
        # Predictions on the prediction set
        prer = model.predict(data['predict'][:, :-1])

        # After each fold, compute the error on the training,
        # validation, and prediction data
        xx = self.F1(xul, data['train'][:, -1])

        yy = self.F1(yanre, data['test'][:, -1])

        pp = self.F1(prer, data['predict'][:, -1])

        # Combine the results
        self.yanzhneg_pr.append(yanre)
        self.yanzhneg_real = data['test'][:, -1]
        self.predi.append(prer)
        self.preal = data['predict'][:, -1]

        # Store the errors ('随机森林' = 'Random Forest')
        self.error_dict['随机森林'] = [xx, yy, pp]
        return print('Random forest in layer 1 finished running')

    # AdaBoost 
Example #12
Source File: classification_randomForest.py    From practicalDataAnalysisCookbook with GNU General Public License v2.0
def fitRandomForest(data):
    '''
        Build a random forest classifier
    '''
    # create the classifier object
    forest = en.RandomForestClassifier(n_jobs=-1,
        min_samples_split=100, n_estimators=10,
        class_weight="balanced")  # "auto" is no longer accepted by sklearn

    # fit the data
    return forest.fit(data[0],data[1])

# the file name of the dataset 
Example #13
Source File: mlmodel.py    From speech-emotion-recognition with MIT License
def __init__(self, **params):
        params['name'] = 'Random Forest'
        super(RF, self).__init__(**params)
        self.model = RandomForestClassifier(n_estimators=30) 
Example #14
Source File: test_pipe.py    From skutil with BSD 3-Clause "New" or "Revised" License
def test_pipeline_complex():
    pipe = Pipeline([
        ('selector',  FeatureRetainer(cols=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)'])),
        ('scaler',    SelectiveScaler()),
        ('boxcox',    BoxCoxTransformer()),
        ('pca',       SelectivePCA()),
        ('svd',       SelectiveTruncatedSVD()),
        ('model',     RandomForestClassifier())
    ])

    pipe.fit(X, iris.target) 
Example #15
Source File: test_random_forest_classifier.py    From monasca-analytics with Apache License 2.0
def setUp(self):
        super(TestRandomForestClassifier, self).setUp()
        self.rf_sml = random_forest_classifier.RandomForestClassifier(
            "fakeid", {"module": "fake", "nb_samples": 1000}) 
Example #16
Source File: random_forest_classifier.py    From monasca-analytics with Apache License 2.0
def _get_best_detector(self, train, label):
        detector = ensemble.RandomForestClassifier()
        detector.fit(train, label)
        return detector 
Example #17
Source File: random_forest_classifier.py    From monasca-analytics with Apache License 2.0
def __init__(self, _id, _config):
        super(RandomForestClassifier, self).__init__(_id, _config)
        self._nb_samples = int(_config['nb_samples']) 
Example #18
Source File: models.py    From aletheia with MIT License
def fit(self, X, y):
        
        self.selector = SelectKBest(f_classif, k=self.max_features)
        self.selector.fit(X, y)

        X_train=self.selector.transform(X)
        y_train=y

        param_list=[]
        idx = list(range(len(y_train)))  # list() so random.shuffle works on Python 3
        for i in range(self.n_estimators):
            random.shuffle(idx)
            param_list.append((X_train[idx[:self.max_samples]], 
                               y_train[idx[:self.max_samples]]))

        pool = ThreadPool(cpu_count())
        self.clf_list = pool.map(self._prepare_classifier, param_list)
        pool.close()
        pool.join()

        """
        X2=[]
        for clf in self.clf_list:
            P=clf.predict_proba(X_train)
            if len(X2)==0:
                X2=P[:, 0]
            else:
                X2=numpy.vstack((X2, P[:, 0]))
        X2=numpy.swapaxes(X2, 0, 1)
        print "X2:", X2.shape

        from sklearn.ensemble import RandomForestClassifier
        self.clf2=RandomForestClassifier(n_estimators=100)
        self.clf2.fit(X2, y_train)
        """ 
Example #19
Source File: pipline.py    From MachineLearning with Apache License 2.0
def get_rfc():
    return RandomForestClassifier(
        n_estimators=100,
        max_features=0.5,
        max_depth=None,
        max_leaf_nodes=270,
        min_impurity_decrease=0.0001,
        random_state=123,
        n_jobs=-1
    ) 
Example #20
Source File: classifier.py    From stock-price-prediction with MIT License
def performRFClass(X_train, y_train, X_test, y_test, parameters, savemodel):
    """
    Random Forest Binary Classification
    """
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)

    return accuracy 
Example #21
Source File: pu_learning.py    From LogClass with MIT License
def instatiate_pu_adapter(params, **kwargs):
    """
        Returns a random forest adapted for PU learning, wrapped in the PUAdapterWrapper.
    """
    hparms = {
        'n_estimators': 10,
        'criterion': "entropy",
        'bootstrap': True,
        'n_jobs': -1,
    }
    hparms.update(kwargs)
    estimator = RandomForestClassifier(**hparms)
    wrapped_pu_estimator = PUAdapterWrapper(PUAdapter(estimator), params)
    return wrapped_pu_estimator 
Example #22
Source File: test_pipe.py    From skutil with BSD 3-Clause "New" or "Revised" License
def test_pipeline_basic():
    pipe = Pipeline([
        ('selector', FeatureRetainer(cols=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)'])),
        ('scaler',   SelectiveScaler()),
        ('model',    RandomForestClassifier())
    ])

    pipe.fit(X, iris.target) 
Example #23
Source File: function.py    From Karta with MIT License
def calibrateFunctionTypeClassifier(self, scs):
        """Calibrate the type classifier, according to all known code segments.

        Args:
            scs (list): list of all known (sark) code segments

        Return Value:
            True iff the calibration succeeded and the classifier is more accurate than the assigned lower bound
        """
        functions = []
        for sc in scs:
            functions += list(filter(lambda func: not self._analyzer.fptr_identifier.isPointedFunction(func.start_ea), sc.functions))
        # 1st round - calibration
        # 2nd round - test
        for training_round in range(2):
            round_name = "Calibration" if training_round == 0 else "Testing"
            clf = RandomForestClassifier(n_estimators=100)
            eas = list(map(lambda x: x.start_ea, functions))
            data_set = list(map(self.extractFunctionTypeSample, eas))
            data_results = list(map(self._analyzer.codeType, eas))
            # split into train (30%) and test (70%) sets
            X_train, X_test, Y_train, Y_test = train_test_split(data_set, data_results, test_size=0.7, random_state=5)
            # classify
            clf.fit(X_train, Y_train)
            # test
            Y_pred = clf.predict(X_test)
            accuracy = metrics.accuracy_score(Y_test, Y_pred)
            self._analyzer.logger.info("%s: Function accuracy Type Accuracy: %.2f%%", round_name, accuracy * 100)
            # Pick up the best features, and use only them (only needed in the first round)
            if training_round == 0:
                type_impact = list(zip(self._classifier_type_offsets, clf.feature_importances_))
                type_impact.sort(key=lambda x: x[1], reverse=True)
                self._classifier_type_offsets = list(map(lambda x: x[0], type_impact[:self._feature_size]))
            elif accuracy < CALIBRATION_LOWER_BOUND:
                self._analyzer.logger.error("Function Prologue Type Accuracy is too low, can't continue: %.2f%% < %.2f%%", accuracy * 100, CALIBRATION_LOWER_BOUND * 100)
                return False
        # If reached this point it means that all was OK
        return True 
Example #24
Source File: annotation.py    From scVI with MIT License
def compute_accuracy_rf(
    data_train, labels_train, data_test, labels_test, param_grid=None, verbose=0
):
    if param_grid is None:
        param_grid = {"max_depth": np.arange(3, 10), "n_estimators": [10, 50, 100, 200]}
    rf = RandomForestClassifier(max_depth=2, random_state=0)
    clf = GridSearchCV(rf, param_grid, verbose=verbose, cv=3)
    return compute_accuracy_classifier(
        clf, data_train, labels_train, data_test, labels_test
    ) 
Example #25
Source File: advanced_supvervised_model_trainer.py    From healthcareai-py with MIT License
def random_forest_classifier(self,
                                 trees=200,
                                 scoring_metric='roc_auc',
                                 hyperparameter_grid=None,
                                 randomized_search=True,
                                 number_iteration_samples=5):
        """
        A light wrapper for Sklearn's random forest classifier that performs 
        randomized search over an overridable
        default hyperparameter grid.
        
        Args:
            trees (int): number of trees to use if not performing a randomized 
            grid search scoring_metric (str): Any sklearn scoring metric appropriate 
            for classification hyperparameter_grid (dict): hyperparameters by name
            randomized_search (bool): True for randomized search (default)
            number_iteration_samples (int): Number of models to train during the 
            randomized search for exploring the hyperparameter space. More may lead 
            to a better model, but will take longer.

        Returns:
            TrainedSupervisedModel: 
        """
        self.validate_classification('Random Forest Classifier')
        if hyperparameter_grid is None:
            max_features = hcai_helpers.calculate_random_forest_mtry_hyperparameter(len(self.X_test.columns),
                                                                                    self.model_type)
            hyperparameter_grid = {'n_estimators': [100, 200, 300], 'max_features': max_features}
            number_iteration_samples = 5

        algorithm = get_algorithm(RandomForestClassifier,
                                  scoring_metric,
                                  hyperparameter_grid,
                                  randomized_search,
                                  number_iteration_samples=number_iteration_samples,
                                  n_estimators=trees)

        trained_supervised_model = self._create_trained_supervised_model(algorithm)

        return trained_supervised_model 
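For context, the randomized search this wrapper drives can be reproduced with plain scikit-learn. The sketch below uses RandomizedSearchCV directly; the grid values are illustrative, and since get_algorithm's internals are not shown in the excerpt, this is an assumption about roughly equivalent behavior, not healthcareai's actual code.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Illustrative grid: vary n_estimators and max_features, score by ROC AUC,
# and sample 5 candidate models from the grid
grid = {'n_estimators': [100, 200, 300], 'max_features': [2, 4, 8]}
search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=grid,
                            n_iter=5, scoring='roc_auc', cv=3)
# search.fit(X_train, y_train); search.best_estimator_ is the tuned model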
Example #26
Source File: transpile.py    From go-ml-transpiler with Apache License 2.0
def main(export_dir):

    ## load dataset
    x, y = load_dataset(return_X_y=True)

    ## train xgb
    xgbc = xgb.XGBClassifier(n_estimators=100, max_depth=7)
    xgbc.fit(x, y)

    # transpile model
    os.mkdir(os.path.join(export_dir, "xgb"))
    transpiler = Transpiler(xgbc)
    transpiler.transpile(package_name="xgb", method_name="predict", export_method=True)
    transpiler.write(os.path.join(export_dir, "xgb"))
    print("xgb done.")


    ## train rfc
    rfc = RFC(n_estimators=100, max_depth=7)
    rfc.fit(x, y)

    # transpile model
    os.mkdir(os.path.join(export_dir, "rfc"))
    transpiler = Transpiler(rfc)
    transpiler.transpile(package_name="rfc", method_name="predict", export_method=True)
    transpiler.write(os.path.join(export_dir, "rfc"))
    print("rfc done.") 
Example #27
Source File: common_utils.py    From interpret-text with MIT License
def create_random_forest_tfidf():
    vectorizer = TfidfVectorizer(lowercase=False)
    rf = RandomForestClassifier(n_estimators=500, random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("rf", rf)]) 
Example #28
Source File: malss.py    From malss with MIT License
def select_features(self):
        if self.data is None:
            warnings.warn("'drop_col' must be used after 'fit' has been called.")
            return

        if self.task == 'regression':
            rf = RandomForestRegressor(random_state=0, oob_score=True, n_estimators=50, n_jobs=self.n_jobs)
        else:
            rf = RandomForestClassifier(random_state=0, oob_score=True, n_estimators=50, n_jobs=self.n_jobs)
        
        num_col = len(self.data.X.columns)
        self.data.drop_col(rf)
        if len(self.data.X.columns) < num_col:
            self.algorithms = self.__choose_algorithm()
            self.is_ready = True 
Example #29
Source File: rfpimp.py    From malss with MIT License
def importances_raw(rf, X_train, y_train, n_samples=5000):
    if isinstance(rf, RandomForestClassifier):
        return permutation_importances_raw(rf, X_train, y_train, oob_classifier_accuracy, n_samples)
    elif isinstance(rf, RandomForestRegressor):
        return permutation_importances_raw(rf, X_train, y_train, oob_regression_r2_score, n_samples)
    return None 
Example #30
Source File: rfpimp.py    From malss with MIT License
def oob_dropcol_importances(rf, X_train, y_train):
    """
    Compute drop-column feature importances for scikit-learn.

    Given a RandomForestClassifier or RandomForestRegressor in rf
    and training X and y data, return a data frame with columns
    Feature and Importance sorted in reverse order by importance.

    A clone of rf is trained once to get the baseline score and then
    again, once per feature to compute the drop in out of bag (OOB)
    score.

    return: A data frame with Feature, Importance columns

    SAMPLE CODE

    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    X_train, y_train = ..., ...
    rf.fit(X_train, y_train)
    imp = oob_dropcol_importances(rf, X_train, y_train)
    """
    rf_ = clone(rf)
    rf_.random_state = 999
    rf_.fit(X_train, y_train)
    baseline = rf_.oob_score_
    imp = []
    for col in X_train.columns:
        X = X_train.drop(col, axis=1)
        rf_ = clone(rf)
        rf_.random_state = 999
        rf_.fit(X, y_train)
        o = rf_.oob_score_
        imp.append(baseline - o)
    imp = np.array(imp)
    I = pd.DataFrame(data={'Feature':X_train.columns, 'Importance':imp})
    I = I.set_index('Feature')
    I = I.sort_values('Importance', ascending=False)
    return I