Python sklearn.model_selection.KFold() Examples

The following are 30 code examples for showing how to use sklearn.model_selection.KFold(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module sklearn.model_selection , or try the search function .

Example 1
Project: Kaggler   Author: jeongyoonlee   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_TargetEncoder(generate_data):
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    te = TargetEncoder()
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('Without CV:\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = TargetEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols], df[TARGET_COL])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    te = TargetEncoder(cv=cv)
    te.fit(df[cat_cols], df[TARGET_COL])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols) 
Example 2
Project: Kaggler   Author: jeongyoonlee   File: test_encoders.py    License: MIT License 6 votes vote down vote up
def test_FrequencyEncoder(generate_data):
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]

    te = FrequencyEncoder()
    X_cat = te.fit_transform(df[cat_cols])
    print('Without CV:\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    te = FrequencyEncoder(cv=cv)
    X_cat = te.fit_transform(df[cat_cols])
    print('With CV (fit_transform()):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols)

    te = FrequencyEncoder(cv=cv)
    te.fit(df[cat_cols])
    X_cat = te.transform(df[cat_cols])
    print('With CV (fit() and transform() separately):\n{}'.format(X_cat.head()))

    assert X_cat.shape[1] == len(cat_cols) 
Example 3
Project: ibllib   Author: int-brain-lab   File: cca.py    License: MIT License 6 votes vote down vote up
def split_trials(trial_ids, n_splits=5, rng_seed=0):
    """
    Assign each trial to testing or training fold

    :param trial_ids:
    :type trial_ids: array-like
    :param n_splits: one split used for testing; remaining splits used for training
    :type n_splits: int
    :param rng_seed: set random state for shuffling trials
    :type rng_seed: int
    :return: list of dicts of indices with keys `train` and `test`
    """
    from sklearn.model_selection import KFold
    shuffle = True if rng_seed is not None else False
    kf = KFold(n_splits=n_splits, random_state=rng_seed, shuffle=shuffle)
    kf.get_n_splits(trial_ids)
    idxs = [None for _ in range(n_splits)]
    for i, t0 in enumerate(kf.split(trial_ids)):
        idxs[i] = {'train': t0[0], 'test': t0[1]}
    return idxs 
Example 4
Project: heamy   Author: rushter   File: dataset.py    License: MIT License 6 votes vote down vote up
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
        """K-Folds cross validation iterator.

        Parameters
        ----------
        k : int, default 5
        stratify : bool, default False
        shuffle : bool, default True
        seed : int, default 33

        Yields
        -------
        X_train, y_train, X_test, y_test, train_index, test_index
        """
        if stratify:
            kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
        else:
            kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
            X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
            yield X_train, y_train, X_test, y_test, train_index, test_index 
Example 5
Project: nyaggle   Author: nyanp   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_skip():
    df = pd.DataFrame()
    df['id'] = np.arange(10)

    kf = split.Skip(2, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 3

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([4, 5]))

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([6, 7]))

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([8, 9]))

    with pytest.raises(StopIteration):
        next(folds) 
Example 6
Project: nyaggle   Author: nyanp   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_nth():
    df = pd.DataFrame()
    df['id'] = np.arange(10)

    kf = split.Nth(3, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 1

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([4, 5]))

    with pytest.raises(StopIteration):
        next(folds)

    kf = split.Nth(1, KFold(5))
    folds = kf.split(df)

    assert kf.get_n_splits() == 1

    train_index, test_index = next(folds)
    assert np.array_equal(test_index, np.array([0, 1]))

    with pytest.raises(StopIteration):
        next(folds) 
Example 7
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_2d_y():
    # smoke test for 2d y and multi-label
    n_samples = 30
    rng = np.random.RandomState(1)
    X = rng.randint(0, 3, size=(n_samples, 2))
    y = rng.randint(0, 3, size=(n_samples,))
    y_2d = y.reshape(-1, 1)
    y_multilabel = rng.randint(0, 2, size=(n_samples, 3))
    groups = rng.randint(0, 3, size=(n_samples,))
    splitters = [LeaveOneOut(), LeavePOut(p=2), KFold(), StratifiedKFold(),
                 RepeatedKFold(), RepeatedStratifiedKFold(),
                 ShuffleSplit(), StratifiedShuffleSplit(test_size=.5),
                 GroupShuffleSplit(), LeaveOneGroupOut(),
                 LeavePGroupsOut(n_groups=2), GroupKFold(), TimeSeriesSplit(),
                 PredefinedSplit(test_fold=groups)]
    for splitter in splitters:
        list(splitter.split(X, y, groups))
        list(splitter.split(X, y_2d, groups))
        try:
            list(splitter.split(X, y_multilabel, groups))
        except ValueError as e:
            allowed_target_types = ('binary', 'multiclass')
            msg = "Supported target types are: {}. Got 'multilabel".format(
                allowed_target_types)
            assert msg in str(e) 
Example 8
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_kfold_no_shuffle():
    # Manually check that KFold preserves the data ordering on toy datasets
    X2 = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]

    splits = KFold(2).split(X2[:-1])
    train, test = next(splits)
    assert_array_equal(test, [0, 1])
    assert_array_equal(train, [2, 3])

    train, test = next(splits)
    assert_array_equal(test, [2, 3])
    assert_array_equal(train, [0, 1])

    splits = KFold(2).split(X2)
    train, test = next(splits)
    assert_array_equal(test, [0, 1, 2])
    assert_array_equal(train, [3, 4])

    train, test = next(splits)
    assert_array_equal(test, [3, 4])
    assert_array_equal(train, [0, 1, 2]) 
Example 9
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_shuffle_kfold():
    # Check the indices are shuffled properly
    kf = KFold(3)
    kf2 = KFold(3, shuffle=True, random_state=0)
    kf3 = KFold(3, shuffle=True, random_state=1)

    X = np.ones(300)

    all_folds = np.zeros(300)
    for (tr1, te1), (tr2, te2), (tr3, te3) in zip(
            kf.split(X), kf2.split(X), kf3.split(X)):
        for tr_a, tr_b in combinations((tr1, tr2, tr3), 2):
            # Assert that there is no complete overlap
            assert_not_equal(len(np.intersect1d(tr_a, tr_b)), len(tr1))

        # Set all test indices in successive iterations of kf2 to 1
        all_folds[te2] = 1

    # Check that all indices are returned in the different test folds
    assert_equal(sum(all_folds), 300) 
Example 10
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_split.py    License: MIT License 6 votes vote down vote up
def test_cv_iterable_wrapper():
    kf_iter = KFold(n_splits=5).split(X, y)
    kf_iter_wrapped = check_cv(kf_iter)
    # Since the wrapped iterable is enlisted and stored,
    # split can be called any number of times to produce
    # consistent results.
    np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                            list(kf_iter_wrapped.split(X, y)))
    # If the splits are randomized, successive calls to split yields different
    # results
    kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y)
    kf_randomized_iter_wrapped = check_cv(kf_randomized_iter)
    # numpy's assert_array_equal properly compares nested lists
    np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)),
                            list(kf_randomized_iter_wrapped.split(X, y)))

    try:
        np.testing.assert_equal(list(kf_iter_wrapped.split(X, y)),
                                list(kf_randomized_iter_wrapped.split(X, y)))
        splits_are_equal = True
    except AssertionError:
        splits_are_equal = False
    assert not splits_are_equal, (
        "If the splits are randomized, "
        "successive calls to split should yield different results") 
Example 11
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_validation.py    License: MIT License 6 votes vote down vote up
def test_cross_val_score_mask():
    # test that cross_val_score works with boolean masks
    svm = SVC(kernel="linear")
    iris = load_iris()
    X, y = iris.data, iris.target
    kfold = KFold(5)
    scores_indices = cross_val_score(svm, X, y, cv=kfold)
    kfold = KFold(5)
    cv_masks = []
    for train, test in kfold.split(X, y):
        mask_train = np.zeros(len(y), dtype=np.bool)
        mask_test = np.zeros(len(y), dtype=np.bool)
        mask_train[train] = 1
        mask_test[test] = 1
        cv_masks.append((train, test))
    scores_masks = cross_val_score(svm, X, y, cv=cv_masks)
    assert_array_equal(scores_indices, scores_masks) 
Example 12
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_validation.py    License: MIT License 6 votes vote down vote up
def test_learning_curve_with_boolean_indices():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)
    cv = KFold(n_splits=3)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))


# 0.23. warning about tol not having its correct default value. 
Example 13
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_validation.py    License: MIT License 6 votes vote down vote up
def check_cross_val_predict_binary(est, X, y, method):
    """Helper for tests of cross_val_predict with binary classification"""
    cv = KFold(n_splits=3, shuffle=False)

    # Generate expected outputs
    if y.ndim == 1:
        exp_shape = (len(X),) if method == 'decision_function' else (len(X), 2)
    else:
        exp_shape = y.shape
    expected_predictions = np.zeros(exp_shape)
    for train, test in cv.split(X, y):
        est = clone(est).fit(X[train], y[train])
        expected_predictions[test] = getattr(est, method)(X[test])

    # Check actual outputs for several representations of y
    for tg in [y, y + 1, y - 2, y.astype('str')]:
        assert_allclose(cross_val_predict(est, X, tg, method=method, cv=cv),
                        expected_predictions) 
Example 14
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_search.py    License: MIT License 6 votes vote down vote up
def test_deprecated_grid_search_iid():
    depr_message = ("The default of the `iid` parameter will change from True "
                    "to False in version 0.22")
    X, y = make_blobs(n_samples=54, random_state=0, centers=2)
    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=3)
    # no warning with equally sized test sets
    assert_no_warnings(grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=5)
    # warning because 54 % 5 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=2)
    # warning because stratification into two classes and 27 % 2 != 0
    assert_warns_message(DeprecationWarning, depr_message, grid.fit, X, y)

    grid = GridSearchCV(SVC(gamma='scale', random_state=0),
                        param_grid={'C': [10]}, cv=KFold(2))
    # no warning because no stratification and 54 % 2 == 0
    assert_no_warnings(grid.fit, X, y) 
Example 15
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_search.py    License: MIT License 6 votes vote down vote up
def test_empty_cv_iterator_error():
    # Use global X, y

    # create cv
    cv = KFold(n_splits=3).split(X)

    # pop all of it, this should cause the expected ValueError
    [u for u in cv]
    # cv is empty now

    train_size = 100
    ridge = RandomizedSearchCV(Ridge(), {'alpha': [1e-3, 1e-2, 1e-1]},
                               cv=cv, n_jobs=-1)

    # assert that this raises an error
    with pytest.raises(ValueError,
                       match='No fits were performed. '
                             'Was the CV iterator empty\\? '
                             'Were there no candidates\\?'):
        ridge.fit(X[:train_size], y[:train_size]) 
Example 16
Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_search.py    License: MIT License 6 votes vote down vote up
def test_random_search_bad_cv():
    # Use global X, y

    class BrokenKFold(KFold):
        def get_n_splits(self, *args, **kw):
            return 1

    # create bad cv
    cv = BrokenKFold(n_splits=3)

    train_size = 100
    ridge = RandomizedSearchCV(Ridge(), {'alpha': [1e-3, 1e-2, 1e-1]},
                               cv=cv, n_jobs=-1)

    # assert that this raises an error
    with pytest.raises(ValueError,
                       match='cv.split and cv.get_n_splits returned '
                             'inconsistent results. Expected \\d+ '
                             'splits, got \\d+'):
        ridge.fit(X[:train_size], y[:train_size]) 
Example 17
Project: models   Author: kipoi   File: bio_utils.py    License: MIT License 5 votes vote down vote up
def split_dataset(nb_samples, nb_folds):
    indicies = KFold(n=nb_samples, n_folds=nb_folds, shuffle=True)
    indicies = list(indicies)  # list in the order of (train, test)

    return (indicies)

# functions to get statistics from a confusion matrix 
Example 18
Project: Kaggler   Author: jeongyoonlee   File: categorical.py    License: MIT License 5 votes vote down vote up
def __init__(self, smoothing=1, min_samples=10, cv=kfold):
        """Initialize the TargetEncoder class object.

        Args:
            smoothing (int): smoothing effect to balance between the categorical average vs global mean
            min_samples (int): minimum samples to take category average into account
            cv (sklearn.model_selection._BaseKFold, optional): sklearn CV object. default=KFold(5, True, 42)
        """
        assert (min_samples >= 0) and (smoothing >= 0), 'min_samples and smoothing should be positive'
        self.smoothing = smoothing
        self.min_samples = min_samples
        self.cv = cv 
Example 19
Project: Kaggler   Author: jeongyoonlee   File: test_encoders.py    License: MIT License 5 votes vote down vote up
def test_EmbeddingEncoder(generate_data):
    df = generate_data()
    feature_cols = [x for x in df.columns if x != TARGET_COL]
    cat_cols = [x for x in feature_cols if df[x].nunique() < 100]
    num_cols = [x for x in feature_cols if x not in cat_cols]

    print('Test with the regression target')
    ee = EmbeddingEncoder(cat_cols=cat_cols,
                          num_cols=num_cols,
                          random_state=RANDOM_SEED)

    X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
    assert X_emb.shape[1] == sum(ee.n_emb)

    print('Test with the binary classification target')
    df[TARGET_COL] = (df[TARGET_COL] > df[TARGET_COL].mean()).astype(int)

    ee = EmbeddingEncoder(cat_cols=cat_cols,
                          num_cols=num_cols,
                          random_state=RANDOM_SEED)

    X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
    assert X_emb.shape[1] == sum(ee.n_emb)

    print('Test with the binary classification target with cross validation')
    cv = KFold(n_splits=N_FOLD, shuffle=True, random_state=RANDOM_SEED)
    ee = EmbeddingEncoder(cat_cols=cat_cols,
                          num_cols=num_cols,
                          cv=cv,
                          random_state=RANDOM_SEED)

    X_emb = ee.fit_transform(X=df[feature_cols], y=df[TARGET_COL])
    assert X_emb.shape[1] == sum(ee.n_emb) 
Example 20
def __init__(self, n_splits=2, shuffle=False):
        self.n_splits = n_splits
        if self.n_splits > 1:
            self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle) 
Example 21
Project: Python-ELM   Author: masaponto   File: elm.py    License: MIT License 5 votes vote down vote up
def main():
    from sklearn import preprocessing
    from sklearn.datasets import fetch_openml as fetch_mldata
    from sklearn.model_selection import ShuffleSplit, KFold, cross_val_score

    db_name = 'australian'
    hid_nums = [100, 200, 300]

    data_set = fetch_mldata(db_name)
    data_set.data = preprocessing.normalize(data_set.data)
    data_set.target = [1 if i == 1 else -1
                       for i in  data_set.target.astype(int)]

    for hid_num in hid_nums:
        print(hid_num, end=' ')
        e = ELM(hid_num)

        ave = 0
        for i in range(10):
            cv = KFold(n_splits=5, shuffle=True)
            scores = cross_val_score(
                e, data_set.data, data_set.target,
                cv=cv, scoring='accuracy', n_jobs=-1)
            ave += scores.mean()

        ave /= 10

        print("Accuracy: %0.3f " % (ave)) 
Example 22
Project: insightface   Author: deepinsight   File: verification.py    License: MIT License 5 votes vote down vote up
def __init__(self, n_splits = 2, shuffle = False):
    self.n_splits = n_splits
    if self.n_splits>1:
      self.k_fold = KFold(n_splits = n_splits, shuffle = shuffle) 
Example 23
Project: insightface   Author: deepinsight   File: lfw.py    License: MIT License 5 votes vote down vote up
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
      
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
    
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
  
    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean 
Example 24
Project: insightface   Author: deepinsight   File: verification.py    License: MIT License 5 votes vote down vote up
def __init__(self, n_splits = 2, shuffle = False):
    self.n_splits = n_splits
    if self.n_splits>1:
      self.k_fold = KFold(n_splits = n_splits, shuffle = shuffle) 
Example 25
Project: xcessiv   Author: reiinakano   File: functions.py    License: Apache License 2.0 5 votes vote down vote up
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits 
Example 26
Project: ludwig   Author: uber   File: data_utils.py    License: Apache License 2.0 5 votes vote down vote up
def generate_kfold_splits(data_df, num_folds, random_state):
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    fold_num = 0
    for train_indices, test_indices in kf.split(data_df):
        fold_num += 1
        yield train_indices, test_indices, fold_num 
Example 27
Project: basenji   Author: calico   File: basenji_bench_classify.py    License: Apache License 2.0 5 votes vote down vote up
def fold_roc(X, y, folds=8, random_state=44):
    """Compute ROC for a single value, sans model."""
    aurocs = []
    fpr_folds = []
    tpr_folds = []

    fpr_mean = np.linspace(0, 1, 256)
    tpr_mean = []

    # preds_full = np.zeros(y.shape)

    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)

    for train_index, test_index in kf.split(X):
        # predict test set (as is)
        preds = X[test_index,:]

        # save
        # preds_full[test_index] = preds.squeeze()

        # compute ROC curve
        fpr, tpr, _ = roc_curve(y[test_index], preds)
        fpr_folds.append(fpr)
        tpr_folds.append(tpr)

        interp_tpr = np.interp(fpr_mean, fpr, tpr)
        interp_tpr[0] = 0.0
        tpr_mean.append(interp_tpr)

        # compute AUROC
        aurocs.append(roc_auc_score(y[test_index], preds))

    # fpr_full, tpr_full, _ = roc_curve(y, preds_full)
    tpr_mean = np.array(tpr_mean).mean(axis=0)

    return np.array(aurocs), np.array(fpr_folds), np.array(tpr_folds), fpr_mean, tpr_mean 
Example 28
Project: TNT   Author: GaoangW   File: facenet.py    License: GNU General Public License v3.0 5 votes vote down vote up
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
          mean = 0.0
        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
        
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
          
        tpr = np.mean(tprs,0)
        fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy 
Example 29
Project: TNT   Author: GaoangW   File: facenet.py    License: GNU General Public License v3.0 5 votes vote down vote up
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)
    
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
          mean = 0.0
        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
      
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0
    
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
  
    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean 
Example 30
Project: TNT   Author: GaoangW   File: facenet.py    License: GNU General Public License v3.0 5 votes vote down vote up
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
    
    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    
    indices = np.arange(nrof_pairs)
    
    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        if subtract_mean:
            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
        else:
          mean = 0.0
        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
        
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
          
        tpr = np.mean(tprs,0)
        fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy