Python sklearn.feature_selection.mutual_info_classif() Examples

The following are 5 code examples of sklearn.feature_selection.mutual_info_classif(), taken from open-source projects. The source file and license for each example are noted above its code. You may also want to check out the other functions and classes available in the sklearn.feature_selection module.
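For orientation, here is a minimal, self-contained sketch of the function on its own (the data is made up for illustration):

import numpy as np
from sklearn.feature_selection import mutual_info_classif

# Toy data: 100 samples, 3 features; only the first feature carries signal.
rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = (X[:, 0] > 0.5).astype(int)

# One non-negative score per feature; higher means stronger dependency.
scores = mutual_info_classif(X, y, random_state=0)
print(scores)  # the first feature should score highest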
Example #1
Source File: preprocessing.py    From Emotion-Recognition-from-Speech with Apache License 2.0
from sklearn.feature_selection import mutual_info_classif

def mutual_info_select(self, F, y, threshold):
    # Score every feature against the labels, then keep the indices of
    # those whose estimated mutual information exceeds the threshold.
    scores = mutual_info_classif(F, y)
    f_best = []
    for ind, rank in enumerate(scores):
        if rank > threshold:
            f_best.append(ind)
    return f_best
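The same idea as a standalone function, with made-up data and an arbitrary threshold, in case you want to try it outside the class:

import numpy as np
from sklearn.feature_selection import mutual_info_classif

def mutual_info_select(F, y, threshold):
    # Indices of features whose mutual information with y exceeds threshold.
    scores = mutual_info_classif(F, y)
    return [i for i, s in enumerate(scores) if s > threshold]

rng = np.random.RandomState(0)
F = rng.rand(200, 10)              # e.g. speech features
y = rng.randint(0, 4, 200)         # e.g. emotion class labels
kept = mutual_info_select(F, y, threshold=0.05)
F_reduced = F[:, kept]             # keep only the informative columns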
Example #2
Source File: scores.py    From SecuML with GNU General Public License v2.0
def _set_scoring_func(self):
    # Each entry pairs a scorer name with a flag telling whether it also
    # returns p-values (see compute_scoring_func below).
    self.scoring_func = [('variance', False)]
    if self.annotated_instances.num_instances() > 0:
        self.scoring_func.append(('f_classif', True))
        self.scoring_func.append(('mutual_info_classif', False))
        # chi2 requires non-negative feature values.
        if self.instances.features.all_positives():
            self.scoring_func.append(('chi2', True))
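Reading this together with compute_scoring_func below, the boolean in each pair appears to flag whether the scorer also returns p-values. A small sketch of that interpretation (the list is rebuilt by hand here):

# Inferred meaning of the (name, flag) pairs: the flag marks scorers
# that return p-values alongside the scores.
scoring_func = [
    ('variance', False),             # scores only
    ('f_classif', True),             # (F-statistics, p-values)
    ('mutual_info_classif', False),  # scores only
    ('chi2', True),                  # (chi2 statistics, p-values)
]

for name, has_pvalues in scoring_func:
    print(name, 'has p-values' if has_pvalues else 'scores only')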
Example #3
Source File: scores.py    From SecuML with GNU General Public License v2.0
def compute_scoring_func(self, func):
    if func == 'variance':
        # Variance is unsupervised, so it is computed on all instances.
        features = self.instances.features.get_values()
        if isinstance(features, spmatrix):
            variance = mean_variance_axis(features, axis=0)[1]
        else:
            variance = features.var(axis=0)
        return variance, None

    # The remaining scorers are supervised: restrict to annotated instances.
    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
                                                           self.multiclass)
    if func == 'f_classif':
        return f_classif(features, annotations)
    elif func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            # Sparse input is only supported when all features are discrete.
            discrete_indexes = True
        else:
            features_types = self.instances.features.info.types
            discrete_indexes = [i for i, t in enumerate(features_types)
                                if t == FeatureType.binary]
            if not discrete_indexes:
                discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    elif func == 'chi2':
        return chi2(features, annotations)
    else:
        raise ValueError('unknown scoring function: {}'.format(func))
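The discrete_features handling is the interesting part: sparse input is treated as entirely discrete (mutual_info_classif rejects sparse matrices with continuous features), while dense input passes the indices of the binary columns, or False when there are none. A standalone sketch of the same pattern with made-up data:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_selection import mutual_info_classif

rng = np.random.RandomState(0)
y = rng.randint(0, 2, 100)

# Dense case: flag the binary column as discrete, the rest as continuous.
X_dense = np.hstack([rng.rand(100, 2), rng.randint(0, 2, (100, 1))])
mi_dense = mutual_info_classif(X_dense, y, discrete_features=[2])

# Sparse case: only valid with discrete_features=True (or discrete indices).
X_sparse = csr_matrix(rng.randint(0, 2, (100, 5)))
mi_sparse = mutual_info_classif(X_sparse, y, discrete_features=True)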
Example #4
Source File: eda.py    From xam with MIT License
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import feature_selection

# cramers_v_corrected_stat is defined elsewhere in xam's eda.py.

def feature_importance_classification(features, target, n_neighbors=3,
                                      random_state=None):

    # Split the features by dtype: floats are treated as continuous,
    # integers and booleans as discrete.
    cont = features.select_dtypes(include=[np.floating])
    disc = features.select_dtypes(include=[np.integer, bool])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(
            cont, target, discrete_features=False,
            n_neighbors=n_neighbors, random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # Chi²-test
        chi2_tests = defaultdict(dict)

        for feature in disc.columns:
            crosstab = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(crosstab)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value

        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected)
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.items()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(
            disc, target, discrete_features=True,
            n_neighbors=n_neighbors, random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
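The dtype-based split drives the two code paths: float columns take the continuous route (k-nearest-neighbor MI estimate), integer and boolean columns the discrete one. A standalone sketch of that split on a toy frame (column names invented):

import numpy as np
import pandas as pd
from sklearn import feature_selection

rng = np.random.RandomState(0)
df = pd.DataFrame({
    'height': rng.rand(100),            # float -> continuous path
    'n_rooms': rng.randint(1, 6, 100),  # int   -> discrete path
})
target = pd.Series(rng.randint(0, 2, 100))

cont = df.select_dtypes(include=[np.floating])
disc = df.select_dtypes(include=[np.integer, bool])

mi_cont = feature_selection.mutual_info_classif(
    cont, target, discrete_features=False, random_state=0)
mi_disc = feature_selection.mutual_info_classif(
    disc, target, discrete_features=True, random_state=0)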
Example #5
Source File: shapelet_transform.py    From pyts with BSD 3-Clause "New" or "Revised" License
def _fit_one_time_series(
        self, x, X, y, n_timestamps, n_shapelets, window_sizes,
        window_steps, remove_similar, i, rng
    ):
        """Fit one time series."""
        # Extract all shapelets
        shapelets, lengths, start_idx, end_idx = _extract_all_shapelets(
            x, window_sizes, window_steps, n_timestamps)

        # Derive distances between shapelets and time series
        X_dist = _derive_all_distances(
            X, window_sizes, shapelets, lengths, fit=True)

        if self.criterion == 'mutual_info':
            scores = mutual_info_classif(X_dist, y, discrete_features=False,
                                         random_state=rng)
        else:
            scores, _ = f_classif(X_dist, y)

        # Flatten the list of 2D arrays into an array of 1D arrays
        shapelets = [list(shapelet) for shapelet in shapelets]
        shapelets = np.asarray(list(chain.from_iterable(shapelets)))

        # Concatenate the list/tuple of 1D arrays into one 1D array
        start_idx = np.concatenate(start_idx)
        end_idx = np.concatenate(end_idx)

        # Remove similar shapelets
        if remove_similar:
            idx = _remove_similar_shapelets(scores.copy(), start_idx, end_idx)
            scores = scores[idx]
            shapelets = shapelets[idx]
            start_idx = start_idx[idx]
            end_idx = end_idx[idx]
            X_dist = X_dist[:, idx]

        # Keep at most 'n_shapelets' (the guard is equivalent to
        # scores.size >= n_shapelets)
        if scores.size > n_shapelets - 1:
            idx = np.argpartition(
                scores, scores.size - n_shapelets)[-n_shapelets:]
            scores = scores[idx]
            shapelets = shapelets[idx]
            start_idx = start_idx[idx]
            end_idx = end_idx[idx]
            X_dist = X_dist[:, idx]

        time_series_idx = np.full(scores.size, i)
        return X_dist, scores, shapelets, start_idx, end_idx, time_series_idx
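The top-k step relies on np.argpartition to keep the n_shapelets best-scoring candidates without fully sorting; a tiny illustration with made-up scores:

import numpy as np

scores = np.array([0.10, 0.80, 0.30, 0.55, 0.05])
n_shapelets = 3

# Partition so the n_shapelets largest values land in the tail, then take
# their indices (order within the tail is arbitrary).
idx = np.argpartition(scores, scores.size - n_shapelets)[-n_shapelets:]
print(sorted(scores[idx]))  # [0.3, 0.55, 0.8]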