# Python source code of feature

"""Feature extraction functions."""

# Author: Jean-Baptiste Schiratti <jean.baptiste.schiratti@gmail.com>
#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD 3 clause

from inspect import getargs

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.externals import joblib
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

from .bivariate import get_bivariate_funcs
from .univariate import get_univariate_funcs
from .utils import _get_python_func


class FeatureFunctionTransformer(FunctionTransformer):
    """Construct a transformer from a given feature function.

    Similarly to :class:`~sklearn.preprocessing.FunctionTransformer`,
    :class:`FeatureFunctionTransformer` applies a feature function to a given
    array X.

    Parameters
    ----------
    func : callable or None (default: None)
        Feature function to be used in the transformer.
        If None, the identity function is used.

    validate : bool (default: True)
        If True, the array X will be checked before calling the function.
        If possible, a 2d Numpy array is returned. Otherwise, an exception
        will be raised. If False, the array X is not checked.

    params : dict or None (default: None)
        If not None, dictionary of additional keyword arguments to pass to the
        feature function.

    Attributes
    ----------
    output_shape_ : int
        First dimension of the last output of ``transform``. Only available
        after ``transform`` was called.

    feature_names_ : object
        Value returned by the ``get_feature_names`` attribute of the feature
        function. Only set by ``fit`` when the feature function has such an
        attribute.
    """

    def __init__(self, func=None, validate=True, params=None):
        """Instantiate a FeatureFunctionTransformer object."""
        self.params = params
        # ``params`` is forwarded as ``kw_args`` so that the parent class
        # passes it to ``func`` as keyword arguments.
        super(FeatureFunctionTransformer, self).__init__(func=func,
                                                         validate=validate,
                                                         kw_args=params)

    def transform(self, X):
        """Apply the given feature function to the array X.

        Parameters
        ----------
        X : ndarray, shape (n_channels, n_times)

        Returns
        -------
        X_out : ndarray, shape (n_output_func,)
            Usually, ``n_output_func`` will be equal to ``n_channels`` for most
            univariate feature functions and to
            ``(n_channels * (n_channels + 1)) // 2`` for most bivariate feature
            functions. See the doc of the given feature function for more
            details.
        """
        X_out = super(FeatureFunctionTransformer, self).transform(X)
        # Remembered so that `get_feature_names` can build default names.
        self.output_shape_ = X_out.shape[0]
        return X_out

    def fit(self, X, y=None):
        """Fit the FeatureFunctionTransformer (does not extract features).

        If the feature function exposes a ``get_feature_names`` attribute, it
        is called with ``X`` and the current parameters, and the result is
        stored in ``feature_names_``.

        Parameters
        ----------
        X : ndarray, shape (n_channels, n_times)

        y : ignored

        Returns
        -------
        self
        """
        self._check_input(X)
        _feature_func = _get_python_func(self.func)
        if hasattr(_feature_func, 'get_feature_names'):
            _params = self.get_params()
            self.feature_names_ = _feature_func.get_feature_names(X, **_params)
        return self

    def get_feature_names(self):
        """Mapping of the feature indices to feature names.

        Returns
        -------
        feature_names : array-like
            ``feature_names_`` if it was set by ``fit``. Otherwise, the
            indices ``0, ..., output_shape_ - 1`` formatted as strings.

        Raises
        ------
        ValueError
            If ``transform`` was not called before (``output_shape_`` is
            missing).
        """
        if not hasattr(self, 'output_shape_'):
            raise ValueError('Call `fit_transform` first.')
        if hasattr(self, 'feature_names_'):
            return self.feature_names_
        return np.arange(self.output_shape_).astype(str)

    def get_params(self, deep=True):
        """Get the parameters (if any) of the given feature function.

        The parameters are the arguments of the feature function which have a
        default value, updated with the content of ``params`` (if not None).

        Parameters
        ----------
        deep : bool (default: True)
            If True, the method will get the parameters of the transformer.
            (See :class:`~sklearn.preprocessing.FunctionTransformer`).
            This argument is not used by the implementation.

        Returns
        -------
        func_params : dict
            Mapping from parameter names to values. Empty if the feature
            function has no default values.
        """
        func_to_inspect = _get_python_func(self.func)
        # Get code object from the function (``func_code`` is the Python 2
        # spelling of ``__code__``).
        if hasattr(func_to_inspect, 'func_code'):
            func_code = func_to_inspect.func_code
        else:
            func_code = func_to_inspect.__code__
        args, _, _ = getargs(func_code)
        # Get defaults from the function. The attribute which is tested must
        # be the one which is read (``func_defaults``, Python 2 spelling).
        if hasattr(func_to_inspect, 'func_defaults'):
            defaults = func_to_inspect.func_defaults
        else:
            defaults = func_to_inspect.__defaults__
        if defaults is None:
            return dict()
        # Defaults are aligned with the last arguments of the function.
        n_defaults = len(defaults)
        func_params = dict(zip(args[-n_defaults:], defaults))
        if self.params is not None:
            func_params.update(self.params)
        return func_params

    def set_params(self, **new_params):
        """Set the parameters (if any) of the given feature function.

        Parameters
        ----------
        **new_params : dict
            Parameters to set. Each key should be one of the keys returned by
            ``get_params``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a key of ``new_params`` is not a valid parameter.
        """
        valid_params = self.get_params()
        for key in new_params:
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for transformer %s. '
                                 'Check the list of available parameters '
                                 'using the `get_params` method of the '
                                 'transformer.' % (key, self))
        # Work on a copy: the dict given to the constructor belongs to the
        # caller and should not be modified in place.
        updated_params = dict() if self.params is None else dict(self.params)
        updated_params.update(new_params)
        self.params = updated_params
        # Keep the keyword arguments used by the parent class in sync.
        self.kw_args = self.params
        return self


def _format_as_dataframe(X, feature_names):
    """Format to Pandas DataFrame.

    Utility function to format extracted features (X) as a Pandas
    DataFrame using names and indexes from ``feature_names``. The index of the
    columns is a MultiIndex with two levels. At level 0, the alias of the
    feature function is given. At level 1, an enumeration of the features is
    given.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_features)
        Extracted features. X should be the output of :func:`extract_features`.

    feature_names : list of str

    Returns
    -------
    output : Pandas DataFrame
    """
    n_features = X.shape[1]
    if len(feature_names) != n_features:
        raise ValueError('The length of `feature_names` should be equal to '
                         '`X.shape[1]` (`n_features`).')
    else:
        _names = [n.split('__')[0] for n in feature_names]
        _idx = [n.split('__')[1] for n in feature_names]
        columns = pd.MultiIndex.from_arrays([_names, _idx])
        return pd.DataFrame(data=X, columns=columns)


def _apply_extractor(extractor, X, return_as_df):
    """Utility function to apply features extractor to ndarray X.

    Parameters
    ----------
    extractor : Instance of :class:`~sklearn.pipeline.FeatureUnion` or
    :class:`~sklearn.pipeline.Pipeline`.

    X : ndarray, shape (n_channels, n_times)

    return_as_df : bool

    Returns
    -------
    X : ndarray, shape (n_features,)

    feature_names : list of str | None
        Not None, only if ``return_as_df`` is True.
    """
    X = extractor.fit_transform(X)
    feature_names = None
    if return_as_df:
        feature_names = extractor.get_feature_names()
    return X, feature_names


def _check_funcs(selected, feature_funcs):
    """Selection checker.

    Checks if the elements of ``selected`` are either strings (alias of a
    feature function defined in mne-features) or tuples of the form
    ``(str, callable)`` (user-defined feature function).

    Parameters
    ----------
    selected : list of str or tuples
        Names of the selected feature functions.

    feature_funcs : dict
        Dictionary of the feature functions (univariate and bivariate)
        available in mne-features.

    Returns
    -------
    valid_funcs : list of tuples
    """
    valid_funcs = list()
    _intrinsic_func_names = feature_funcs.keys()
    for s in selected:
        if isinstance(s, str):
            # Case of a MNE-feature alias
            if s in _intrinsic_func_names:
                valid_funcs.append((s, feature_funcs[s]))
            else:
                raise ValueError('The given alias (%s) is not valid. The '
                                 'valid aliases for feature functions are: %s.'
                                 % (s, _intrinsic_func_names))
        elif isinstance(s, tuple):
            if len(s) != 2:
                raise ValueError('The given tuple (%s) is not of length 2. '
                                 'Each user-defined feature function should '
                                 'be passed as a tuple of the form '
                                 '`(str, callable)`.' % str(s))
            else:
                # Case of a user-defined feature function
                if s[0] in _intrinsic_func_names:
                    raise ValueError('A user-defined feature function was '
                                     'given an alias (%s) which is already '
                                     'used by mne-features. The list of '
                                     'aliases used by mne-features is: %s.'
                                     % (s[0], _intrinsic_func_names))
                else:
                    valid_funcs.append(s)
        else:
            # Case where the element is neither a string, nor a tuple
            raise ValueError('%s is not a valid feature function and cannot '
                             'be interpreted as a user-defined feature '
                             'function.' % str(s))
    if not valid_funcs:
        raise ValueError('No valid feature function was given.')
    else:
        return valid_funcs


class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Feature extraction from epoched EEG data.

    The method ``fit_transform`` implemented in this class can be used to
    extract univariate or bivariate features from epoched data
    (see example below). The method ``fit`` does not have any effect and is
    implemented for compatibility with Scikit-learn's API. As a result, the
    class ``FeatureExtractor`` can be used as a step in a Pipeline (see
    :class:`~sklearn.pipeline.Pipeline` and MNE-features examples). The class
    also accepts a ``memory`` parameter which allows for caching the result of
    feature extraction. Therefore, if caching is used, calling
    ``fit_transform`` twice on the same data will not trigger a second call
    to :func:`extract_features`.

    Parameters
    ----------
    sfreq : float (default: 256.)
        Sampling rate of the data.

    selected_funcs : list of str or tuples
        The elements of ``selected_funcs`` are either strings or tuples of
        the form ``(str, callable)``. If an element is of type ``str``, it is
        the alias of a feature function. The aliases are built from the
        feature functions' names by removing ``compute_``. For instance, the
        alias of the feature function :func:`compute_ptp_amp` is ``ptp_amp``.
        (See the documentation of mne-features). If an element is of type
        ``tuple``, the first element of the tuple should be a string
        (name/alias given to a user-defined feature function) and the second
        element should be a callable (a user-defined feature function which
        accepts Numpy arrays with shape ``(n_channels, n_times)``). The
        names/aliases given to user-defined feature functions should not
        intersect the aliases used by mne-features. If the name given to a
        user-defined feature function is already used as an alias in
        mne-features, an error will be raised.

    params : dict or None (default: None)
        If not None, dict of optional parameters to be passed to
        :func:`extract_features`. Each key of the ``params`` dict should
        be of the form: ``[alias_feature_function]__[optional_param]``
        (for example: ``higuchi_fd__kmax``).

    n_jobs : int (default: 1)
        Number of CPU cores used when parallelizing the feature extraction.
        If given a value of -1, all cores are used.

    memory : str or None (default: None)
        If None, no caching is performed. If a string is given, the string
        should be the path to the caching directory. Caching is particularly
        advantageous when feature extraction is time consuming.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> n_epochs, n_channels, n_times = 5, 3, 32
    >>> X = rng.randn(n_epochs, n_channels, n_times)
    >>> fe = FeatureExtractor(sfreq=100., selected_funcs=['std', 'kurtosis'])
    >>> X = fe.fit_transform(X)
    >>> print(X.shape)
    (5, 6)

    See also
    --------
    :func:`extract_features`
    """

    def __init__(self, sfreq=256., selected_funcs=None, params=None, n_jobs=1,
                 memory=None):
        """Instantiate a FeatureExtractor object."""
        self.sfreq = sfreq
        self.selected_funcs = selected_funcs
        self.params = params
        self.n_jobs = n_jobs
        self.memory = memory

    def fit(self, X, y=None):
        """Do not have any effect.

        Parameters
        ----------
        X : ignored

        y : ignored

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Extract features from the array X.

        Parameters
        ----------
        X : ndarray, shape (n_epochs, n_channels, n_times)

        Returns
        -------
        Xnew : ndarray, shape (n_epochs, n_features)
            Extracted features.
        """
        # `extract_features` is wrapped by the cache located at `self.memory`.
        mem = joblib.Memory(location=self.memory)
        _extractor = mem.cache(extract_features)
        return _extractor(X, self.sfreq, self.selected_funcs,
                          funcs_params=self.params, n_jobs=self.n_jobs)

    def get_params(self, deep=True):
        """Get the parameters of the transformer.

        Parameters
        ----------
        deep : bool (default: True)
            Forwarded to the ``get_params`` method of the parent class.

        Returns
        -------
        params : dict
        """
        return super(FeatureExtractor, self).get_params(deep=deep)

    def set_params(self, **params):
        """Set the parameters of the transformer.

        Keys which are names of constructor arguments (``sfreq``,
        ``selected_funcs``, ``params``, ``n_jobs``, ``memory``) are set as
        attributes of the transformer. Any other key is considered as a
        parameter of a feature function (for example ``higuchi_fd__kmax``)
        and is merged into ``params``.

        Parameters
        ----------
        **params : dict
            Parameters to set.

        Returns
        -------
        self
        """
        if not params:
            return self
        init_params = super(FeatureExtractor, self).get_params(deep=False)
        funcs_params = dict()
        for key, value in params.items():
            if key in init_params:
                setattr(self, key, value)
            else:
                funcs_params[key] = value
        if funcs_params:
            # Merge into a copy (after a possible replacement of `params`
            # above), so that previously set entries are kept and the dict
            # given by the caller is not modified in place.
            merged = dict() if self.params is None else dict(self.params)
            merged.update(funcs_params)
            self.params = merged
        return self


def extract_features(X, sfreq, selected_funcs, funcs_params=None, n_jobs=1,
                     return_as_df=False):
    """Extraction of temporal or spectral features from epoched EEG signals.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)
        Array of epoched EEG data.

    sfreq : float
        Sampling rate of the data.

    selected_funcs : list of str or tuples
        The elements of ``selected_funcs`` are either strings or tuples of
        the form ``(str, callable)``. If an element is of type ``str``, it is
        the alias of a feature function. The aliases are built from the
        feature functions' names by removing ``compute_``. For instance, the
        alias of the feature function :func:`compute_ptp_amp` is ``ptp_amp``.
        (See the documentation of mne-features). If an element is of type
        ``tuple``, the first element of the tuple should be a string
        (name/alias given to a user-defined feature function) and the second
        element should be a callable (a user-defined feature function which
        accepts Numpy arrays with shape ``(n_channels, n_times)``). The
        names/aliases given to user-defined feature functions should not
        intersect the aliases used by mne-features. If the name given to a
        user-defined feature function is already used as an alias in
        mne-features, an error will be raised.

    funcs_params : dict or None (default: None)
        If not None, dict of optional parameters to be passed to the feature
        functions. Each key of the ``funcs_params`` dict should be of the form:
        ``[alias_feature_function]__[optional_param]`` (for example:
        ``higuchi_fd__kmax``).

    n_jobs : int (default: 1)
        Number of CPU cores used when parallelizing the feature extraction.
        If given a value of -1, all cores are used.

    return_as_df : bool (default: False)
        If True, the extracted features will be returned as a Pandas DataFrame.
        The column index is a MultiIndex (see :class:`~pandas.MultiIndex`)
        which contains the alias of each feature function which was used.
        If False, the features are returned as a 2d Numpy array.

    Returns
    -------
    array-like, shape (n_epochs, n_features)

    Raises
    ------
    ValueError
        If ``sfreq`` is not positive, if ``selected_funcs`` is not valid or
        if ``X`` is empty or is not a 3d array.
    """
    if sfreq <= 0:
        raise ValueError('Sampling rate `sfreq` must be positive.')
    univariate_funcs = get_univariate_funcs(sfreq)
    bivariate_funcs = get_bivariate_funcs(sfreq)
    feature_funcs = univariate_funcs.copy()
    feature_funcs.update(bivariate_funcs)
    sel_funcs = _check_funcs(selected_funcs, feature_funcs)

    # Without at least one epoch, there is no result from which the feature
    # names and the number of features could be read; a non-3d array cannot
    # be sliced into `(n_channels, n_times)` epochs.
    if len(X.shape) != 3 or X.shape[0] == 0:
        raise ValueError('`X` should be a non-empty array with shape '
                         '`(n_epochs, n_channels, n_times)`. Got an array '
                         'with shape %s.' % (X.shape,))

    # Feature extraction
    n_epochs = X.shape[0]
    _tr = [(n, FeatureFunctionTransformer(func=func)) for n, func in sel_funcs]
    extractor = FeatureUnion(transformer_list=_tr)
    if funcs_params is not None:
        extractor.set_params(**funcs_params)
    # One task per epoch; each task returns `(features, feature_names)`.
    res = joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(_apply_extractor)(
        extractor, X[j, :, :], return_as_df) for j in range(n_epochs))
    # The feature names are read from the first epoch's result.
    feature_names = res[0][1]
    Xnew = np.vstack([features for features, _ in res])
    if return_as_df:
        return _format_as_dataframe(Xnew, feature_names)
    return Xnew