python source code of generic

"""The generic_classifier module contains code to do validation
of the classifier predictions."""

import src.utils.python.util as _utils
import src.utils.python.math as mymath
import src.features.python.feature_utils as futils
import numpy as np
from numpy import interp
from sklearn import cross_validation
import sklearn.metrics as metrics
import pandas as pd


class GenericClassifier(object):

    def __init__(self,
                 total_iterations=5,
                 classify_oncogene=True,
                 classify_tsg=True,
                 rseed=None):
        self.min_count = 0  # min mutations for a gene

        # set the the state of the prng
        self.prng = np.random.RandomState(rseed)
        self.rseed = rseed

        # set integer codes for classes
        self.set_classes(oncogene=classify_oncogene, tsg=classify_tsg)

        self.set_total_iter(total_iterations)  # initialize total number of simulations
        self._init_metrics()  # initialize metrics

        # genes categorized as oncogenes/tsg by vogelstein's
        # science paper
        self.vogelsteins_oncogenes = _utils.oncogene_set
        self.vogelsteins_tsg = _utils.tsg_set

    def set_total_iter(self, myiterations):
        self.total_iter = myiterations

    def _filter_rows(self, df):
        """Filter out rows with counts less than the minimum."""
        row_sums = df.T.sum()
        filtered_df = df[row_sums > self.min_count]
        return filtered_df

    def train(self):
        """Train classifier on entire data set provided."""
        self.x, self.y = futils.randomize(self.x, self.prng)
        futils.check_num_classes(self.y) # warn user if not 3 classes
        self.clf.fit(self.x, self.y)

    def train_cv(self, k=10):
        """Train classifier on entire data set provided, but done in cross-validation."""
        # generate indices for kfold cross validation
        self.num_pred = 0  # number of predictions
        self.test_fold_df = pd.DataFrame({l+1: 0 for l in range(self.total_iter)}, index=self.x.index)

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y) # warn user if not 3 classes

            # set up stratified kfold iterator
            k_fold = cross_validation.StratifiedKFold(self.y,
                                                      n_folds=k)

            # obtain predictions from single round of kfold validation
            for nfold, (train_ix, test_ix) in enumerate(k_fold):
                # retreive indices from pandas dataframe using row number
                tmp_train_ix = self.x.iloc[train_ix].index

                # save which genes are in the test fold
                tmp_test_ix = self.x.iloc[test_ix].index
                self.test_fold_df.loc[tmp_test_ix, i+1] = nfold + 1

                if self.is_weighted_sample:
                    # figure out sample weights
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.onco_num)[0]
                    tsg_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.tsg_num)[0]
                    other_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training with sample weighting
                    self.clf.fit(self.x.ix[tmp_train_ix].copy(),
                                 self.y.ix[tmp_train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without weighting
                    self.clf.fit(self.x.loc[tmp_train_ix].copy(),
                                 self.y.loc[tmp_train_ix].copy())
                self.clf.append_fold_result()  # add the training result from each fold
            self.clf.append_cv_result()  # add the training result for a single CV to the R variable

            self.num_pred += 1
        self.clf.set_cv_fold(self.test_fold_df)

    def predict(self):
        """Predict after using the train method."""
        self.num_pred = 1  # only one prediction

        # initialize predicted results variables
        num_genes = len(self.y)

        # do predictions
        proba_ = self.clf.predict_proba(self.x)

        # update information
        onco_prob = proba_[:, self.onco_num] # predicted oncogenes
        tsg_prob = proba_[:, self.tsg_num] # predicted oncogenes
        other_prob = 1 - (onco_prob + tsg_prob)

        return onco_prob, tsg_prob, other_prob

    def predict_cv(self, k=10):
        """Predict after using the gene hold-out cross-validated train method."""
        # generate indices for kfold cross validation
        self.num_pred = 0  # number of predictions

        # initialize variables for prediction results
        prediction = pd.Series(index=self.x.index)  # predicted class
        onco_prob = pd.Series(index=self.x.index).fillna(0)
        tsg_prob = pd.Series(index=self.x.index).fillna(0)

        # figure out which genes may have not been trained on before
        new_genes = self.x.index.difference(self.clf.cv_folds.index)

        # perform total_iter number of cross-validations
        for i in range(self.total_iter):
            # predict on genes never found in training data, so there
            # is no worry about overfitting.
            test_feat = self.x.ix[new_genes]
            if not test_feat.empty:
                self.clf.set_model(i+1, 1)
                tmp_prob = self.clf.predict_proba(test_feat)
                onco_prob.ix[new_genes] += tmp_prob[:, self.onco_num]
                tsg_prob.ix[new_genes] += tmp_prob[:, self.tsg_num]

            # obtain predictions from single round of kfold validation
            col = 'X{0}'.format(i+1)
            for j in range(k):
                self.clf.set_model(i+1, j+1)
                # figure out which genes should be tested
                good_ix = self.clf.cv_folds[self.clf.cv_folds[col]==j+1].index
                tmp_test_ix = self.x.index.intersection(good_ix)

                # predict test data in kfold validation
                test_feat = self.x.ix[tmp_test_ix]
                if not test_feat.empty:
                    tmp_prob = self.clf.predict_proba(test_feat)
                    onco_prob.ix[tmp_test_ix] += tmp_prob[:, self.onco_num]
                    tsg_prob.ix[tmp_test_ix] += tmp_prob[:, self.tsg_num]

            self.num_pred += 1

        # convert number of trees to fraction of trees
        onco_prob /= self.num_pred
        tsg_prob /= self.num_pred
        other_prob = 1 - (onco_prob + tsg_prob)

        # return prediction.astype(int), prob
        return onco_prob, tsg_prob, other_prob

    def kfold_validation(self, k=10):
        """Records the performance in terms of ROC and PR AUC for cross-validation.

        Params
        ------
        k : int (10)
            Number of cross-validation folds
        """
        self.num_pred = 0  # number of predictions

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y) # warn user if not 3 classes

            # initialize predicted results variables
            num_genes = len(self.y)
            onco_pred = np.zeros(num_genes)
            onco_prob = np.zeros(num_genes)
            tsg_pred = np.zeros(num_genes)
            tsg_prob = np.zeros(num_genes)
            overall_pred = np.zeros(num_genes)

            # set up stratified kfold iterator
            k_fold = cross_validation.StratifiedKFold(self.y,
                                                      n_folds=k)

            # evaluate k-fold cross validation
            for train_ix, test_ix in k_fold:
                if self.is_weighted_sample:
                    # weight classes by using sample weights
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(self.y[train_ix]==self.onco_num)[0]
                    tsg_ix = np.nonzero(self.y[train_ix]==self.tsg_num)[0]
                    other_ix = np.nonzero(self.y[train_ix]==self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training
                    self.clf.fit(self.x.iloc[train_ix].copy(),
                                 self.y.iloc[train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without sample weights
                    self.clf.fit(self.x.iloc[train_ix].copy(),
                                 self.y.iloc[train_ix].copy())

                # do prediction
                y_pred = self.clf.predict(self.x.iloc[test_ix])
                proba_ = self.clf.predict_proba(self.x.iloc[test_ix])

                # update information
                overall_pred[test_ix] = y_pred  # prediction including all classes
                onco_pred[test_ix] = (y_pred==self.onco_num).astype(int)  # predicted oncogenes
                onco_prob[test_ix] = proba_[:, self.onco_num] # predicted oncogenes
                tsg_pred[test_ix] = (y_pred==self.tsg_num).astype(int)  # predicted oncogenes
                tsg_prob[test_ix] = proba_[:, self.tsg_num] # predicted oncogenes

            # update information
            true_onco = (self.y==self.onco_num).astype(int)
            self._update_onco_metrics(true_onco,
                                      onco_pred,
                                      onco_prob)
            true_tsg = (self.y==self.tsg_num).astype(int)  # true oncogenes
            self._update_tsg_metrics(true_tsg,
                                     tsg_pred,
                                     tsg_prob)
            self._update_metrics(self.y,
                                 overall_pred,
                                 onco_prob,
                                 tsg_prob)
            self.num_pred += 1

        self._on_finish()  # update info for kfold cross-validation

    def kfold_prediction(self, k=10):
        # generate indices for kfold cross validation
        self.num_pred = 0  # number of predictions

        prediction = pd.Series(index=self.y.index)  # predicted class
        onco_prob = pd.Series(index=self.y.index).fillna(0)
        tsg_prob = pd.Series(index=self.y.index).fillna(0)

        for i in range(self.total_iter):
            # randomize for another round
            self.x, self.y = futils.randomize(self.x, self.prng)
            futils.check_num_classes(self.y) # warn user if not 3 classes

            # set up stratified kfold iterator
            k_fold = cross_validation.StratifiedKFold(self.y,
                                                      n_folds=k)

            # obtain predictions from single round of kfold validation
            for train_ix, test_ix in k_fold:
                # retreive indices from pandas dataframe using row number
                tmp_train_ix = self.x.iloc[train_ix].index
                tmp_test_ix = self.x.iloc[test_ix].index

                if self.is_weighted_sample:
                    # figure out sample weights
                    num_train = len(train_ix)
                    sample_weight = np.zeros(num_train)
                    onco_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.onco_num)[0]
                    tsg_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.tsg_num)[0]
                    other_ix = np.nonzero(self.y.ix[tmp_train_ix]==self.other_num)[0]
                    sample_weight[onco_ix] = 1. / len(onco_ix)
                    sample_weight[tsg_ix] = 1. / len(tsg_ix)
                    sample_weight[other_ix] = 1. / len(other_ix)

                    # do training with sample weighting
                    self.clf.fit(self.x.ix[tmp_train_ix].copy(),
                                 self.y.ix[tmp_train_ix].copy(),
                                 sample_weight=sample_weight)
                else:
                    # do training without weighting
                    self.clf.fit(self.x.ix[tmp_train_ix].copy(),
                                 self.y.ix[tmp_train_ix].copy())

                # predict test data in kfold validation
                tmp_prob = self.clf.predict_proba(self.x.ix[tmp_test_ix])
                onco_prob.ix[tmp_test_ix] += tmp_prob[:, self.onco_num]
                tsg_prob.ix[tmp_test_ix] += tmp_prob[:, self.tsg_num]

            self.num_pred += 1

        # convert number of trees to fraction of trees
        onco_prob /= self.num_pred
        tsg_prob /= self.num_pred
        other_prob = 1 - (onco_prob + tsg_prob)

        # return prediction.astype(int), prob
        return onco_prob, tsg_prob, other_prob

    def _init_metrics(self):
        """Initialize classification diagnostric metrics."""
        self.feature_importance = []
        self.confusion_matrix = np.zeros((self.num_classes, self.num_classes))
        self.num_pred = 0  # no predictions have been made yet

        # oncogene metrics
        num_points = 100  # number of points to calculate metrics
        self.onco_f1_score = np.zeros(self.total_iter)
        self.onco_precision = np.zeros(self.total_iter)
        self.onco_recall = np.zeros(self.total_iter)
        self.onco_gene_count = np.zeros(self.total_iter)
        self.onco_tpr_array = np.zeros((self.total_iter, num_points))
        self.onco_fpr_array = np.linspace(0, 1, num_points)
        self.onco_precision_array = np.zeros((self.total_iter, num_points))
        self.onco_recall_array = np.linspace(0, 1, num_points)
        self.onco_threshold_array = np.zeros((self.total_iter, num_points))

        # tsg metrics
        self.tsg_f1_score = np.zeros(self.total_iter)
        self.tsg_precision = np.zeros(self.total_iter)
        self.tsg_recall = np.zeros(self.total_iter)
        self.tsg_gene_count = np.zeros(self.total_iter)
        self.tsg_tpr_array = np.zeros((self.total_iter, num_points))
        self.tsg_fpr_array = np.linspace(0, 1, num_points)
        self.tsg_precision_array = np.zeros((self.total_iter, num_points))
        self.tsg_recall_array = np.linspace(0, 1, num_points)

        # driver metrics
        self.driver_precision = np.zeros(self.total_iter)
        self.driver_recall = np.zeros(self.total_iter)
        self.driver_tpr_array = np.zeros((self.total_iter, num_points))
        self.driver_fpr_array = np.linspace(0, 1, num_points)
        self.driver_precision_array = np.zeros((self.total_iter, num_points))
        self.driver_recall_array = np.linspace(0, 1, num_points)
        self.driver_threshold_array = np.zeros((self.total_iter, num_points))
        self.cancer_gene_count = np.zeros(self.total_iter)

        # overall metrics
        self.f1_score = np.zeros(self.total_iter)
        self.precision = np.zeros(self.total_iter)
        self.recall = np.zeros(self.total_iter)

    def _update_metrics(self, y_true, y_pred,
                        onco_prob, tsg_prob):
        # record which genes were predicted what
        self.driver_gene_pred = pd.Series(y_pred, self.y.index)
        self.driver_gene_score = pd.Series(onco_prob+tsg_prob, self.y.index)

        # evaluate performance
        prec, recall, fscore, support = metrics.precision_recall_fscore_support(y_true, y_pred,
                                                                                average='macro')
        cancer_gene_pred = ((onco_prob + tsg_prob)>.5).astype(int)
        self.cancer_gene_count[self.num_pred] = np.sum(cancer_gene_pred)
        self.precision[self.num_pred] = prec
        self.recall[self.num_pred] = recall
        self.f1_score[self.num_pred] = fscore

        # compute Precision-Recall curve metrics
        driver_prob = onco_prob + tsg_prob
        driver_true = (y_true > 0).astype(int)
        p, r, thresh = metrics.precision_recall_curve(driver_true, driver_prob)
        p, r, thresh = p[::-1], r[::-1], thresh[::-1]  # reverse order of results
        thresh = np.insert(thresh, 0, 1.0)
        self.driver_precision_array[self.num_pred, :] = interp(self.driver_recall_array, r, p)
        self.driver_threshold_array[self.num_pred, :] = interp(self.driver_recall_array, r, thresh)

        # calculate prediction summary statistics
        prec, recall, fscore, support = metrics.precision_recall_fscore_support(driver_true, cancer_gene_pred)
        self.driver_precision[self.num_pred] = prec[1]
        self.driver_recall[self.num_pred] = recall[1]

        # save driver metrics
        fpr, tpr, thresholds = metrics.roc_curve(driver_true, driver_prob)
        self.driver_tpr_array[self.num_pred, :] = interp(self.driver_fpr_array, fpr, tpr)

    def _update_onco_metrics(self, y_true, y_pred, prob):
        self.onco_gene_pred = pd.Series(y_pred, self.y.index)
        self.onco_gene_score = pd.Series(prob, self.y.index)

        # compute metrics for classification
        self.onco_gene_count[self.num_pred] = sum(y_pred)
        prec, recall, fscore, support = metrics.precision_recall_fscore_support(y_true, y_pred)
        self.onco_precision[self.num_pred] = prec[self.onco_num]
        self.onco_recall[self.num_pred] = recall[self.onco_num]
        self.onco_f1_score[self.num_pred] = fscore[self.onco_num]
        self.logger.debug('Onco Iter %d: Precission=%s, Recall=%s, f1_score=%s' % (
                          self.num_pred + 1, str(prec), str(recall), str(fscore)))

        # compute ROC curve metrics
        fpr, tpr, thresholds = metrics.roc_curve(y_true, prob)
        self.onco_tpr_array[self.num_pred, :] = interp(self.onco_fpr_array, fpr, tpr)
        #self.onco_mean_tpr[0] = 0.0

        # compute Precision-Recall curve metrics
        p, r, thresh = metrics.precision_recall_curve(y_true, prob)
        p, r, thresh = p[::-1], r[::-1], thresh[::-1]  # reverse order of results
        thresh = np.insert(thresh, 0, 1.0)
        self.onco_precision_array[self.num_pred, :] = interp(self.onco_recall_array, r, p)
        self.onco_threshold_array[self.num_pred, :] = interp(self.onco_recall_array, r, thresh)

    def _update_tsg_metrics(self, y_true, y_pred, prob):
        self.tsg_gene_pred = pd.Series(y_pred, self.y.index)
        self.tsg_gene_score = pd.Series(prob, self.y.index)

        # compute metrics for classification
        self.tsg_gene_count[self.num_pred] = sum(y_pred)
        prec, recall, fscore, support = metrics.precision_recall_fscore_support(y_true, y_pred)
        tsg_col = 1  # column for metrics relate to tsg
        self.tsg_precision[self.num_pred] = prec[tsg_col]
        self.tsg_recall[self.num_pred] = recall[tsg_col]
        self.tsg_f1_score[self.num_pred] = fscore[tsg_col]
        self.logger.debug('Tsg Iter %d: Precission=%s, Recall=%s, f1_score=%s' % (
                          self.num_pred + 1, str(prec), str(recall), str(fscore)))

        # compute ROC curve metrics
        fpr, tpr, thresholds = metrics.roc_curve(y_true, prob)
        self.tsg_tpr_array[self.num_pred, :] = interp(self.tsg_fpr_array, fpr, tpr)
        #self.tsg_tpr_array[0] = 0.0

        # compute Precision-Recall curve metrics
        p, r, thresh = metrics.precision_recall_curve(y_true, prob)
        p, r, thresh = p[::-1], r[::-1], thresh[::-1]  # reverse order of results
        self.tsg_precision_array[self.num_pred, :] = interp(self.tsg_recall_array, r, p)

    def _on_finish(self):
        """Record the mean ROC and PR AUC after all of the cross-validation is finished
        in kfold_validation method.
        """
        # ROC curve metrics
        self.onco_mean_roc_auc = float(metrics.auc(self.onco_fpr_array,
                                                   np.mean(self.onco_tpr_array, axis=0)))
        self.tsg_mean_roc_auc = float(metrics.auc(self.tsg_fpr_array,
                                                  np.mean(self.tsg_tpr_array, axis=0)))
        self.driver_mean_roc_auc = float(metrics.auc(self.driver_fpr_array,
                                                     np.mean(self.driver_tpr_array, axis=0)))

        # Precision-Recall curve metrics
        self.onco_mean_pr_auc = float(metrics.auc(self.onco_recall_array,
                                                  np.mean(self.onco_precision_array, axis=0)))
        self.tsg_mean_pr_auc = float(metrics.auc(self.tsg_recall_array,
                                                 np.mean(self.tsg_precision_array, axis=0)))
        self.driver_mean_pr_auc = float(metrics.auc(self.driver_recall_array,
                                                    np.mean(self.driver_precision_array, axis=0)))

        # log info on classifier predictions
        self.logger.debug('TSG: Precision=%s, Recall=%s, Fscore=%s' % (
                          np.mean(self.tsg_precision), np.mean(self.tsg_recall), np.mean(self.tsg_f1_score)))
        self.logger.debug('Oncogene: Precision=%s, Recall=%s, Fscore=%s' % (
                          np.mean(self.onco_precision), np.mean(self.onco_recall), np.mean(self.onco_f1_score)))
        self.logger.debug('Driver: Precision=%s, Recall=%s' % (
                          np.mean(self.driver_precision), np.mean(self.driver_recall)))

    def get_onco_roc_metrics(self):
        """Simple get method for oncogene ROC metrics."""
        return self.onco_tpr_array, self.onco_fpr_array, self.onco_mean_roc_auc

    def get_tsg_roc_metrics(self):
        """Simple get method for tumor supressor ROC metrics."""
        return self.tsg_tpr_array, self.tsg_fpr_array, self.tsg_mean_roc_auc

    def get_onco_pr_metrics(self):
        """Simple get method for oncogene Precision-Recall metrics."""
        return self.onco_precision_array, self.onco_recall_array, self.onco_mean_pr_auc

    def get_tsg_pr_metrics(self):
        """Simple get method for tumor supressor Precision-Recall metrics"""
        return self.tsg_precision_array, self.tsg_recall_array, self.tsg_mean_pr_auc

    def get_driver_pr_metrics(self):
        """Simple get method for driver gene Precision-Recall metrics"""
        return self.driver_precision_array, self.driver_recall_array, self.driver_mean_pr_auc

    def get_driver_roc_metrics(self):
        """Simple get method for driver gene ROC metrics."""
        return self.driver_tpr_array, self.driver_fpr_array, self.driver_mean_roc_auc

    def set_classes(self, oncogene, tsg):
        """Sets the integers used to represent classes in classification."""
        if not oncogene and not tsg:
            raise ValueError('Classification needs at least two classes')
        elif oncogene and tsg:
            self.other_num = 0
            self.onco_num = 1
            self.tsg_num = 2
            self.num_classes = 3
        else:
            self.other_num = 0
            self.num_classes = 2
            self.onco_num = 1 if oncogene else 0
            self.tsg_num = 1 if tsg else 0

        # significantly mutated genes
        self.smg_num = 1

    def set_min_count(self, ct):
        """Set a minimum count threshold."""
        if ct >= 0:
            self.min_count = ct