# Python source code of a random forest

"""
An implementation of a random forest. Uses the provided DecisionTree class.
The RF is parameterized by the following values:

    1. n_trees: The number of trees in the forest
    2. boot_percent: The bootstrap percent. For each tree, every row of the
    dataset is kept independently with probability boot_percent (so a row
    appears at most once per tree, i.e. sampling is without replacement).
    This is the expected fraction of the dataset used for each tree.
    3. feat_percent: The percentage of features to consider at each split.
    Instead of examining all features at each split, we select feat_percent
    of them to consider.

In addition, you need to specify the typical decision tree parameters,
namely max depth and the number of linear split points.

==============
Copyright Info
==============
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
bdolmail@gmail.com
"""

from interface_utils import prog_bar
from fast_decision_tree import FastDecisionTree
from scipy import stats
import numpy as np

class RandomForest:
    """
    A random forest built on top of FastDecisionTree.

    Every tree is trained on a random subsample of the training rows, and
    the forest predicts, for each test row, the mode of the predictions
    made by the individual trees.
    """

    def __init__(self, n_trees, max_depth, num_splits,
        boot_percent=0.3, feat_percent=0.3, threaded=False, debug=False):
        """
        Store the forest configuration; no trees are built here.

        n_trees:      number of trees trained by train().
        max_depth:    forwarded as the first argument of FastDecisionTree.
        num_splits:   forwarded as the second argument of FastDecisionTree.
        boot_percent: probability with which each training row is kept for
                      a given tree.
        feat_percent: forwarded to FastDecisionTree as feat_subset.
        threaded:     stored on the instance; not used by this class.
        debug:        forwarded to FastDecisionTree as debug.
        """
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.num_splits = num_splits
        self.boot_percent = boot_percent
        self.feat_percent = feat_percent
        self.threaded = threaded
        self.debug = debug

        # One entry per trained tree: whatever FastDecisionTree.train returns.
        self.roots = []

    def train(self, train_data, train_target):
        """
        Train n_trees trees and store their roots in self.roots.

        train_data is indexed as train_data[mask, :] and train_target as
        train_target[mask], so both must support numpy boolean-mask
        indexing along their first axis.

        Any roots left over from a previous call are discarded first, so
        the forest always holds exactly n_trees trees after training.
        """
        # Start from a clean slate; otherwise retraining would pile new
        # trees on top of the old ones and exceed n_trees.
        self.roots = []
        n_samples = train_data.shape[0]

        for t in range(self.n_trees):
            prog_bar(t, self.n_trees)

            # Keep each row independently with probability boot_percent.
            keep_idx = np.random.rand(n_samples) <= self.boot_percent

            boot_train_data = train_data[keep_idx, :]
            boot_train_target = train_target[keep_idx]

            dt = FastDecisionTree(self.max_depth, self.num_splits,
                                  feat_subset=self.feat_percent,
                                  debug=self.debug)

            self.roots.append(dt.train(boot_train_data, boot_train_target))

        prog_bar(self.n_trees, self.n_trees)

    def test(self, test_data, test_target):
        """
        Predict a label for every row of test_data by majority vote.

        test_target is accepted for interface compatibility but is not
        used. Returns element [0] of scipy.stats.mode(votes, axis=1), i.e.
        the most common prediction per row.
        NOTE: whether that result keeps a trailing length-1 axis depends on
        the installed SciPy version's default for `keepdims`.

        Raises ValueError if the forest holds no trained trees.
        """
        n_roots = len(self.roots)
        if n_roots == 0:
            raise ValueError("RandomForest has no trained trees; "
                             "call train() before test()")

        # TODO: refactor the RF test function to depend not on an external
        # root but on itself
        # A throwaway tree instance is only needed to reach test_preds; the
        # constructor arguments are placeholders.
        dt = FastDecisionTree(1, 1)

        # One column of votes per trained tree. Sized from the roots that
        # actually exist rather than from n_trees.
        yhat_forest = np.zeros((test_data.shape[0], n_roots))
        for i, r in enumerate(self.roots):
            prog_bar(i, n_roots)

            # Write into column i only. ravel() accepts both (n,) and
            # (n, 1) prediction arrays.
            yhat_forest[:, i] = np.ravel(dt.test_preds(r, test_data))

        prog_bar(n_roots, n_roots)

        yhat = stats.mode(yhat_forest, axis=1)[0]
        return yhat