Python sklearn.model_selection._split._validate_shuffle_split() Examples
The following are 7
code examples of sklearn.model_selection._split._validate_shuffle_split().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
sklearn.model_selection._split
, or try the search function
.
Example #1
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_stratified_shuffle_split_iter(): ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), np.array([-1] * 800 + [1] * 50), np.concatenate([[i] * (100 + i) for i in range(11)]), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], ['1', '1', '1', '1', '2', '2', '2', '3', '3', '3', '3', '3'], ] for y in ys: sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split(np.ones(len(y)), y) y = np.asanyarray(y) # To make it indexable for y[train] # this is how test-size is computed internally # in _validate_shuffle_split test_size = np.ceil(0.33 * len(y)) train_size = len(y) - test_size for train, test in sss: assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1]) / float(len(y[train]))) p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert_equal(len(train) + len(test), y.size) assert_equal(len(train), train_size) assert_equal(len(test), test_size) assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
Example #2
Source File: tabular.py From pvae with MIT License | 5 votes |
def getDataLoaders(self, batch_size, shuffle, device, *args): kwargs = {'num_workers': 1, 'pin_memory': True} if device == "cuda" else {} print('Load training data...') dataset = SyntheticDataset(*self.data_size, *map(lambda x: float(x), args)) n_train, n_test = _validate_shuffle_split(len(dataset), test_size=None, train_size=0.7) train_dataset, test_dataset = torch.utils.data.random_split(dataset, [n_train, n_test]) train_loader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True, shuffle=shuffle, **kwargs) test_loader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True, shuffle=False, **kwargs) return train_loader, test_loader
Example #3
Source File: tabular.py From pvae with MIT License | 5 votes |
def getDataLoaders(self, batch_size, shuffle, device, *args): kwargs = {'num_workers': 1, 'pin_memory': True} if device == "cuda" else {} print('Load training data...') dataset = CSVDataset(*args) n_train, n_test = _validate_shuffle_split(len(dataset), test_size=None, train_size=0.7) train_dataset, test_dataset = torch.utils.data.random_split(dataset, [n_train, n_test]) train_loader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True, shuffle=shuffle, **kwargs) test_loader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True, shuffle=False, **kwargs) return train_loader, test_loader
Example #4
Source File: _split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _split_blockwise(self, X, seeds): chunks = X.chunks[0] train_pct, test_pct = _maybe_normalize_split_sizes( self.train_size, self.test_size ) sizes = [_validate_shuffle_split(c, test_pct, train_pct) for c in chunks] objs = [ dask.delayed(_generate_idx, nout=2)(chunksize, seed, n_train, n_test) for chunksize, seed, (n_train, n_test) in zip(chunks, seeds, sizes) ] train_objs, test_objs = zip(*objs) offsets = np.hstack([0, np.cumsum(chunks)]) train_idx = da.concatenate( [ da.from_delayed(x + offset, (train_size,), np.dtype("int")) for x, chunksize, (train_size, _), offset in zip( train_objs, chunks, sizes, offsets ) ] ) test_idx = da.concatenate( [ da.from_delayed(x + offset, (test_size,), np.dtype("int")) for x, chunksize, (_, test_size), offset in zip( test_objs, chunks, sizes, offsets ) ] ) return train_idx, test_idx
Example #5
Source File: test_split.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_stratified_shuffle_split_iter(): ys = [np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2] * 2), np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]), np.array([-1] * 800 + [1] * 50), np.concatenate([[i] * (100 + i) for i in range(11)]), [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3], ['1', '1', '1', '1', '2', '2', '2', '3', '3', '3', '3', '3'], ] for y in ys: sss = StratifiedShuffleSplit(6, test_size=0.33, random_state=0).split(np.ones(len(y)), y) y = np.asanyarray(y) # To make it indexable for y[train] # this is how test-size is computed internally # in _validate_shuffle_split test_size = np.ceil(0.33 * len(y)) train_size = len(y) - test_size for train, test in sss: assert_array_equal(np.unique(y[train]), np.unique(y[test])) # Checks if folds keep classes proportions p_train = (np.bincount(np.unique(y[train], return_inverse=True)[1]) / float(len(y[train]))) p_test = (np.bincount(np.unique(y[test], return_inverse=True)[1]) / float(len(y[test]))) assert_array_almost_equal(p_train, p_test, 1) assert_equal(len(train) + len(test), y.size) assert_equal(len(train), train_size) assert_equal(len(test), test_size) assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
Example #6
Source File: test_split.py From Mastering-Elasticsearch-7.0 with MIT License | 4 votes |
def test_stratified_shuffle_split_even(): # Test the StratifiedShuffleSplit, indices are drawn with a # equal chance n_folds = 5 n_splits = 1000 def assert_counts_are_ok(idx_counts, p): # Here we test that the distribution of the counts # per index is close enough to a binomial threshold = 0.05 / n_splits bf = stats.binom(n_splits, p) for count in idx_counts: prob = bf.pmf(count) assert prob > threshold, \ "An index is not drawn with chance corresponding to even draws" for n_samples in (6, 22): groups = np.array((n_samples // 2) * [0, 1]) splits = StratifiedShuffleSplit(n_splits=n_splits, test_size=1. / n_folds, random_state=0) train_counts = [0] * n_samples test_counts = [0] * n_samples n_splits_actual = 0 for train, test in splits.split(X=np.ones(n_samples), y=groups): n_splits_actual += 1 for counter, ids in [(train_counts, train), (test_counts, test)]: for id in ids: counter[id] += 1 assert_equal(n_splits_actual, n_splits) n_train, n_test = _validate_shuffle_split( n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds)) assert_equal(len(train), n_train) assert_equal(len(test), n_test) assert_equal(len(set(train).intersection(test)), 0) group_counts = np.unique(groups) assert_equal(splits.test_size, 1.0 / n_folds) assert_equal(n_train + n_test, len(groups)) assert_equal(len(group_counts), 2) ex_test_p = float(n_test) / n_samples ex_train_p = float(n_train) / n_samples assert_counts_are_ok(train_counts, ex_train_p) assert_counts_are_ok(test_counts, ex_test_p)
Example #7
Source File: test_split.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_stratified_shuffle_split_even(): # Test the StratifiedShuffleSplit, indices are drawn with a # equal chance n_folds = 5 n_splits = 1000 def assert_counts_are_ok(idx_counts, p): # Here we test that the distribution of the counts # per index is close enough to a binomial threshold = 0.05 / n_splits bf = stats.binom(n_splits, p) for count in idx_counts: prob = bf.pmf(count) assert_true(prob > threshold, "An index is not drawn with chance corresponding " "to even draws") for n_samples in (6, 22): groups = np.array((n_samples // 2) * [0, 1]) splits = StratifiedShuffleSplit(n_splits=n_splits, test_size=1. / n_folds, random_state=0) train_counts = [0] * n_samples test_counts = [0] * n_samples n_splits_actual = 0 for train, test in splits.split(X=np.ones(n_samples), y=groups): n_splits_actual += 1 for counter, ids in [(train_counts, train), (test_counts, test)]: for id in ids: counter[id] += 1 assert_equal(n_splits_actual, n_splits) n_train, n_test = _validate_shuffle_split( n_samples, test_size=1. / n_folds, train_size=1. - (1. / n_folds)) assert_equal(len(train), n_train) assert_equal(len(test), n_test) assert_equal(len(set(train).intersection(test)), 0) group_counts = np.unique(groups) assert_equal(splits.test_size, 1.0 / n_folds) assert_equal(n_train + n_test, len(groups)) assert_equal(len(group_counts), 2) ex_test_p = float(n_test) / n_samples ex_train_p = float(n_train) / n_samples assert_counts_are_ok(train_counts, ex_train_p) assert_counts_are_ok(test_counts, ex_test_p)