"""Tests for the splitters and ``train_test_split`` in ``dask_ml.model_selection``."""
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import fetch_20newsgroups, make_regression

import dask_ml.model_selection
from dask_ml._compat import DASK_2130

# Shared inputs: 110 rows wrapped in dask arrays with a chunk size of 50.
X, y = make_regression(n_samples=110, n_features=5)
dX = da.from_array(X, 50)
dy = da.from_array(y, 50)


def test_20_newsgroups():
    """Splitting a list of documents and a NumPy target keeps both container types."""
    data = fetch_20newsgroups()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(X, y)

    for docs in (X_train, X_test):
        assert isinstance(docs, list)
        assert isinstance(docs[0], str)

    for target in (y_train, y_test):
        assert isinstance(target, np.ndarray)
        # Compare against the abstract integer kind rather than ``int`` so the
        # check does not depend on the platform's default integer width.
        assert np.issubdtype(target.dtype, np.integer)


def test_blockwise_shufflesplit():
    """ShuffleSplit yields dask index arrays that partition every row per block."""
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10

    gen = splitter.split(dX)
    train_idx, test_idx = next(gen)

    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)

    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)

    # No index may be selected twice for the training set.
    counts = pd.Series(train_idx.compute()).value_counts()
    assert counts.max() == 1

    # Train and test together must cover each row index.
    N = len(X)
    combined = da.concatenate([train_idx, test_idx]).compute()
    np.testing.assert_array_equal(np.unique(combined), np.arange(N))


def test_blockwise_shufflesplit_rng():
    """Successive splits differ, while identically seeded splitters agree."""
    # Regression test for issue #380
    n_splits = 2
    splitter = dask_ml.model_selection.ShuffleSplit(n_splits=n_splits, random_state=0)
    train_indices, test_indices = zip(*splitter.split(dX))

    assert not np.array_equal(train_indices[0].compute(), train_indices[1].compute())
    assert not np.array_equal(test_indices[0].compute(), test_indices[1].compute())

    # Test that splitting is reproducible
    split1 = dask_ml.model_selection.ShuffleSplit(n_splits=n_splits, random_state=0)
    split2 = dask_ml.model_selection.ShuffleSplit(n_splits=n_splits, random_state=0)

    for (train_1, test_1), (train_2, test_2) in zip(
        split1.split(dX), split2.split(dX)
    ):
        da.utils.assert_eq(train_1, train_2)
        da.utils.assert_eq(test_1, test_2)


@pytest.mark.parametrize("shuffle", [False, True])
def test_kfold(shuffle):
    """KFold yields dask index arrays with the expected shapes and chunking."""
    splitter = dask_ml.model_selection.KFold(
        n_splits=5, random_state=0, shuffle=shuffle
    )
    assert splitter.get_n_splits() == 5

    gen = splitter.split(dX)
    train_idx, test_idx = next(gen)

    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (88,)  # 80% of 110
    assert test_idx.shape == (22,)

    assert train_idx.chunks == ((28, 50, 10),)
    assert test_idx.chunks == ((22,),)

    # No index may be selected twice for the training set.
    counts = pd.Series(train_idx.compute()).value_counts()
    assert counts.max() == 1

    # Train and test together must cover each row index.
    N = len(X)
    combined = da.concatenate([train_idx, test_idx]).compute()
    np.testing.assert_array_equal(np.unique(combined), np.arange(N))

    # Chunk layout of the four folds that remain in the generator.
    expected_chunks = [
        (((22, 6, 50, 10),), ((22,),)),
        (((44, 34, 10),), ((6, 16),)),
        (((50, 16, 12, 10),), ((22,),)),
        (((50, 38),), ((12, 10),)),
    ]

    for (exp_train_idx, exp_test_idx), (train_idx, test_idx) in zip(
        expected_chunks, gen
    ):
        assert train_idx.chunks == exp_train_idx
        assert test_idx.chunks == exp_test_idx


def test_train_test_split():
    """Default split of dask arrays is 99/11 rows with X and y chunked alike."""
    X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(dX, dy)

    assert len(X_train) == 99
    assert len(X_test) == 11

    assert X_train.chunks[0] == y_train.chunks[0]
    assert X_test.chunks[0] == y_test.chunks[0]


def test_train_test_split_test_size():
    """An explicit fractional ``test_size`` is accepted and keeps X and y aligned."""
    X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(
        dX, dy, random_state=10, test_size=0.8
    )

    # Arrays split together must stay aligned chunk-for-chunk.
    assert X_train.chunks[0] == y_train.chunks[0]
    assert X_test.chunks[0] == y_test.chunks[0]

    # With 80% requested for testing, the test part must be the larger one.
    assert len(X_test) > len(X_train)


def test_train_test_split_shuffle_array():
    """``shuffle=False`` is rejected for dask arrays."""
    with pytest.raises(NotImplementedError):
        dask_ml.model_selection.train_test_split(dX, dy, shuffle=False)


@pytest.mark.xfail(
    not DASK_2130, reason="DataFrame blockwise shuffling implemented in dask2.13.0."
)
def test_train_test_split_shuffle_dataframe(xy_classification_pandas):
    """``shuffle`` controls whether the training index comes back sorted."""
    X, y = xy_classification_pandas

    X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(
        X, y, random_state=42, shuffle=True
    )
    # A shuffled split must NOT have a sorted index, so the equality check fails.
    with pytest.raises(AssertionError):
        np.testing.assert_array_equal(X_train.index, sorted(X_train.index))

    X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(
        X, y, random_state=42, shuffle=False
    )
    np.testing.assert_array_equal(X_train.index, sorted(X_train.index))


def test_train_test_split_blockwise_dataframe(xy_classification_pandas):
    """``blockwise=False`` together with ``shuffle=False`` is not implemented."""
    X, y = xy_classification_pandas

    with pytest.raises(NotImplementedError):
        dask_ml.model_selection.train_test_split(
            X, y, random_state=42, shuffle=False, blockwise=False
        )


@pytest.mark.parametrize(
    "kwargs",
    [{"train_size": 10}, {"test_size": 10}, {"test_size": 10, "train_size": 0.1}],
)
def test_absolute_raises(kwargs):
    """Absolute (integer) split sizes raise a descriptive ``ValueError``."""
    with pytest.raises(
        ValueError, match="Dask-ML does not support absolute sizes"
    ):
        dask_ml.model_selection.train_test_split(dX, **kwargs)


def test_non_complement_raises():
    """Train and test fractions of 0.1 and 0.2 are rejected with a ``ValueError``."""
    with pytest.raises(ValueError, match="The sum of "):
        dask_ml.model_selection._split._maybe_normalize_split_sizes(0.1, 0.2)


def test_complement():
    """A missing size is filled in as the complement of the one provided."""
    normalize = dask_ml.model_selection._split._maybe_normalize_split_sizes

    train_size, test_size = normalize(0.1, None)
    assert train_size == pytest.approx(0.1)
    assert test_size == pytest.approx(0.9)

    train_size, test_size = normalize(None, 0.2)
    assert train_size == pytest.approx(0.8)
    assert test_size == pytest.approx(0.2)


@pytest.mark.parametrize(
    "train_size, test_size", [(None, None), (0.9, None), (None, 0.1), (0.9, 0.1)]
)
def test_train_test_split_dask_dataframe(
    xy_classification_pandas, train_size, test_size
):
    """Splitting the fixture data returns dask collections covering every row."""
    X, y = xy_classification_pandas

    # ``shuffle`` is only passed explicitly when DASK_2130 is truthy.
    kwargs = {"shuffle": True} if DASK_2130 else {}
    X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(
        X, y, train_size=train_size, test_size=test_size, **kwargs
    )
    assert isinstance(X_train, dd.DataFrame)
    assert isinstance(y_train, dd.Series)

    assert (y_train.size + y_test.size).compute() == len(y)


def test_train_test_split_dask_dataframe_rng(xy_classification_pandas):
    """Two splits with the same ``random_state`` produce equal results."""
    X, y = xy_classification_pandas

    # ``shuffle`` is only passed explicitly when DASK_2130 is truthy.
    kwargs = {"shuffle": True} if DASK_2130 else {}
    split1 = dask_ml.model_selection.train_test_split(
        X, y, train_size=0.25, test_size=0.75, random_state=0, **kwargs
    )
    split2 = dask_ml.model_selection.train_test_split(
        X, y, train_size=0.25, test_size=0.75, random_state=0, **kwargs
    )
    for a, b in zip(split1, split2):
        dd.utils.assert_eq(a, b)


def test_split_mixed():
    """Mixed array/series inputs need ``convert_mixed_types=True`` to be split."""
    y_series = dd.from_dask_array(dy)

    with pytest.raises(TypeError, match="convert_mixed_types"):
        dask_ml.model_selection.train_test_split(dX, y_series)

    expected = dask_ml.model_selection.train_test_split(dX, dy, random_state=0)
    results = dask_ml.model_selection.train_test_split(
        dX, y_series, random_state=0, convert_mixed_types=True
    )

    assert len(expected) == len(results)
    for a, b in zip(expected, results):
        da.utils.assert_eq(a, b)