Python sklearn.utils.indexable() Examples

The following are 5 code examples of sklearn.utils.indexable(), drawn from open-source projects; the source file and license are noted above each example.
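sklearn.utils.indexable() checks that its arguments have consistent lengths and returns versions of them that support indexing; None values pass through unchanged, which is why the cross-validation helpers below can call it on an optional groups argument. A minimal sketch of the behaviour these examples rely on:

import numpy as np
from sklearn.utils import indexable

X = [[0, 1], [2, 3], [4, 5]]   # plain Python list
y = np.array([0, 1, 0])
groups = None                  # None passes through untouched

# Raises ValueError if X and y have inconsistent lengths; otherwise
# returns indexable versions of the inputs.
X, y, groups = indexable(X, y, groups)
print(len(X), y.shape, groups)  # 3 (3,) None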
Example #1
Source File: _split.py    From mriqc with BSD 3-Clause "New" or "Revised" License
def split(self, X, y, groups=None):
        if groups is None:
            groups = self._groups

        X, y, groups = indexable(X, y, groups)

        # groups acts as a boolean mask: True marks the samples that will be
        # K-folded, False marks samples that always stay in training
        msk = np.array(groups, dtype=bool)
        train_idx = np.arange(len(X))[~msk]
        test_idx = np.arange(len(X))[msk]

        # pandas DataFrames used to expose as_matrix(); fall back to plain
        # indexing for array-like inputs
        try:
            test_x = X.as_matrix()[test_idx, :]
        except AttributeError:
            test_x = X[test_idx, :]

        test_y = np.array(y)[test_idx]
        split = super(PartiallyHeldOutKFold, self).split(test_x, test_y)

        # K-fold only the flagged block, then prepend the always-train indices
        # to each training fold (the offset assumes the flagged samples are
        # contiguous)
        offset = test_idx[0]
        for test_train, test_test in split:
            test_train = np.concatenate((train_idx, test_train + offset))
            yield test_train, test_test
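The splitter above keeps every sample whose group flag is falsy permanently in the training set and K-folds only the flagged block. A hypothetical usage sketch (the constructor signature is an assumption; the snippet only shows that the class stores its groups on self._groups and inherits a KFold-style split):

import numpy as np

X = np.random.rand(10, 3)
y = np.array([0, 1] * 5)
groups = [0] * 6 + [1] * 4   # last 4 samples form the partially held-out block

# Hypothetical constructor; n_splits would come from the KFold parent.
cv = PartiallyHeldOutKFold(n_splits=2, groups=groups)
for train_idx, test_idx in cv.split(X, y):
    # train_idx mixes the always-train indices with part of the flagged
    # block; note that, as written above, the offset is added only to the
    # training side, so test_idx stays relative to the flagged block.
    print(train_idx, test_idx)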
Example #2
Source File: _validation.py    From mriqc with BSD 3-Clause "New" or "Revised" License
def cross_val_score(
    estimator,
    X,
    y=None,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=1,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
):
    """
    Evaluate a score by cross-validation
    """
    if not isinstance(scoring, (list, tuple)):
        scoring = [scoring]

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorer = [check_scoring(estimator, scoring=s) for s in scoring]
    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    scores = parallel(
        delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train, test, verbose, None, fit_params
        )
        for train, test in splits
    )

    group_order = []
    if hasattr(cv, "groups"):
        group_order = [np.array(cv.groups)[test].tolist()[0] for _, test in splits]
    return np.squeeze(np.array(scores)), group_order 
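Compared with sklearn's own cross_val_score, this variant accepts a list of scorers and additionally returns the group label of each test fold whenever the CV splitter exposes a groups attribute. A hypothetical call, assuming the function above is in scope and that the project's _fit_and_score helper accepts a list of scorers:

from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=40, random_state=0)

# group_order stays empty here because an integer cv expands to a splitter
# without a "groups" attribute.
scores, group_order = cross_val_score(
    SVC(), X, y, scoring=["accuracy"], cv=5
)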
Example #3
Source File: split.py    From TSCV with BSD 3-Clause "New" or "Revised" License
def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like, with shape (n_samples,)
            Always ignored, exists for compatibility.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        gap_size = self.gap_size
        test_size = self.test_size if self.test_size else n_samples // n_folds

        # Make sure we have enough samples for the given split parameters
        if n_folds > n_samples:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of samples: {1}.").format(n_folds,
                                                             n_samples))
        if n_samples - gap_size - (test_size * n_splits) <= 0:
            raise ValueError(
                ("Too many splits ={0} for number of samples"
                 " ={1} with test_size ={2} and gap_size ={3}."
                 "").format(n_splits, n_samples, test_size, gap_size))

        indices = np.arange(n_samples)
        test_starts = range(n_samples - n_splits * test_size,
                            n_samples, test_size)

        for test_start in test_starts:
            train_end = test_start - gap_size
            if self.max_train_size and self.max_train_size < train_end:
                yield (indices[train_end - self.max_train_size:train_end],
                       indices[test_start:test_start + test_size])
            else:
                yield (indices[:train_end],
                       indices[test_start:test_start + test_size]) 
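This is a walk-forward time-series splitter that leaves gap_size samples between the end of each training window and the start of its test window. A hypothetical instantiation (the class name is an assumption; only the attributes n_splits, gap_size, test_size and max_train_size are visible in the snippet):

import numpy as np

X = np.arange(12).reshape(12, 1)

# Assumed constructor mirroring the attributes used in split() above.
cv = GapWalkForward(n_splits=3, gap_size=2, test_size=2)
for train_idx, test_idx in cv.split(X):
    # two samples are always skipped between the training and test windows
    print(train_idx, test_idx)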
Example #4
Source File: _validation.py    From mriqc with BSD 3-Clause "New" or "Revised" License
def permutation_test_score(
    estimator,
    X,
    y,
    groups=None,
    cv=None,
    n_permutations=100,
    n_jobs=1,
    random_state=0,
    verbose=0,
    scoring=None,
):
    """
    Evaluate the significance of a cross-validated score with permutations,
    as in test 1 of [Ojala2010]_.

    A modification of sklearn's original permutation_test_score that
    returns only the permutation scores, so that the p-value can be
    evaluated outside this function and the scores reused elsewhere.


    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying Classifier
                   Performance.  The Journal of Machine Learning Research (2010)
                   vol. 11

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    return permutation_scores 
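Because this variant returns only the permutation scores, the caller computes the reference score and the p-value itself. A minimal sketch using the standard permutation-test formula, assuming the function above is in scope and yields one score per permutation:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score as sk_cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, random_state=0)
estimator = SVC()

perm_scores = permutation_test_score(
    estimator, X, y, cv=5, n_permutations=100, scoring="accuracy"
)

# Reference score on the unpermuted labels, computed separately.
score = sk_cross_val_score(estimator, X, y, cv=5, scoring="accuracy").mean()

# p-value: fraction of permutations scoring at least as well as the
# reference score.
pvalue = (np.sum(perm_scores >= score) + 1.0) / (len(perm_scores) + 1)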
Example #5
Source File: base.py    From skoot with MIT License
def _validate_X_y_ratio_classes(X, y, ratio):
    # validate the cheap stuff before copying arrays around...
    validate_float(ratio, 'balance_ratio')

    # validate arrays
    X, y = indexable(X, y)  # want to allow pd.DataFrame
    y = column_or_1d(y, warn=False)  # type: np.ndarray

    # get n classes in y, ensure they are <= MAX_N_CLASSES, but first
    # ensure these are actually class labels and not floats or anything...
    y_type = type_of_target(y)
    supported_types = {'multiclass', 'binary'}
    if y_type not in supported_types:
        raise ValueError('balancers only support %r, but got %r'
                         % ("(" + ', '.join(supported_types) + ")", y_type))

    present_classes, counts = np.unique(y, return_counts=True)
    n_classes = len(present_classes)

    # ensure <= MAX_N_CLASSES
    if n_classes > MAX_N_CLASSES:
        raise ValueError('balancers currently only support a maximum of %i '
                         'unique class labels, but %i were identified.'
                         % (MAX_N_CLASSES, n_classes))

    # get the majority class label, and its count:
    majority_count_idx = np.argmax(counts, axis=0)
    majority_label, majority_count = (present_classes[majority_count_idx],
                                      counts[majority_count_idx])
    target_count = max(int(ratio * majority_count), 1)

    # define a min_n_samples based on the sample ratio to max_class
    # required = {target_count - counts[i]
    #             for i, v in enumerate(present_classes)
    #             if v != majority_label}

    # THIS WAS OUR ORIGINAL LOGIC:
    #   * If there were any instances where the number of synthetic examples
    #     required for a class outweighed the number that existed in the class
    #     to begin with, we would end up having to potentially sample from the
    #     synthetic examples. We didn't want to have to do that.
    #
    # But it seems like a totally valid use-case. If we're detecting breast
    # cancer, it might be a rare event that needs lots of bolstering. We
    # should allow that, even though we may discourage it.

    # if any counts < MIN_N_SAMPLES, raise:
    if any(i < MIN_N_SAMPLES for i in counts):
        raise ValueError('All label counts must be >= %i' % MIN_N_SAMPLES)

    return (X, y, n_classes, present_classes, counts,
            majority_label, target_count)
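A hypothetical call to this helper, assuming skoot's module-level constants (MAX_N_CLASSES, MIN_N_SAMPLES) and its validate_float utility are in scope:

import numpy as np

X = np.random.rand(20, 4)
y = np.array([0] * 15 + [1] * 5)   # imbalanced binary labels

# ratio=0.5 targets the minority class at half the majority-class size:
# target_count = max(int(0.5 * 15), 1) = 7
(X, y, n_classes, present_classes, counts,
 majority_label, target_count) = _validate_X_y_ratio_classes(X, y, ratio=0.5)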