"""Tests for the ``MissForest`` imputer from ``missingpy``.

Most tests re-implement the imputation by hand with scikit-learn random
forests (same seeds and estimator counts) and check that ``MissForest``
produces exactly the same matrix and fitted statistics.
"""

import numpy as np
from scipy.stats import mode

from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_equal
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from missingpy import MissForest


def gen_array(n_rows=20, n_cols=5, missingness=0.2, min_val=0, max_val=10,
              missing_values=np.nan, rand_seed=1337):
    """Generate a float array of random integers with missing entries.

    Parameters
    ----------
    n_rows, n_cols : int
        Shape of the generated array.
    missingness : float or int
        If in (0, 1), the approximate fraction of cells set to
        ``missing_values``; if >= 1, the approximate number of such cells.
        Values <= 0 leave the array untouched.
    min_val, max_val : int
        Bounds passed to ``RandomState.randint`` (``max_val`` exclusive).
    missing_values : scalar
        Placeholder written into the selected cells.
    rand_seed : int
        Seed for the ``RandomState`` used for values and positions.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_cols)`` and dtype ``float64``.
    """
    rand_gen = np.random.RandomState(seed=rand_seed)
    # np.float64 instead of the removed ``np.float`` alias (same dtype).
    X = rand_gen.randint(
        min_val, max_val, n_rows * n_cols
    ).reshape(n_rows, n_cols).astype(np.float64)

    # Introduce missing values only if missingness > 0
    if missingness > 0:
        if missingness >= 1:
            # Treat as an approximate count; cast so that float counts
            # (e.g. 5.0) are accepted by ``randint`` below.
            n_missing = int(missingness)
        else:
            # Between (0, 1): approximate fraction of all cells
            n_missing = int(np.ceil(missingness * n_rows * n_cols))

        # Generate row, col index pairs and introduce missing values.
        # NOTE: Repeated index pairs are not de-duplicated, so the actual
        # count/percentage might be less than requested.
        nan_row_idx = rand_gen.randint(0, n_rows, n_missing)
        nan_col_idx = rand_gen.randint(0, n_cols, n_missing)
        X[nan_row_idx, nan_col_idx] = missing_values

    return X


def test_missforest_imputation_shape():
    """The imputed matrix has the same shape as the input."""
    n_rows = 10
    n_cols = 2
    X = gen_array(n_rows, n_cols)
    imputer = MissForest()
    X_imputed = imputer.fit_transform(X)
    assert_equal(X_imputed.shape, (n_rows, n_cols))


def test_missforest_zero():
    """Error cases when ``missing_values == 0``."""
    missing_values = 0
    imputer = MissForest(missing_values=missing_values,
                         random_state=0)

    # With missing_values=0, NaNs in the input must be rejected
    X = gen_array(min_val=0)
    msg = "Input contains NaN, infinity or a value too large for %r." \
          % X.dtype
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # A column consisting only of zeroes is entirely "missing"
    X = np.array([
        [1, 0, 0, 0, 5],
        [2, 1, 0, 2, 3],
        [3, 2, 0, 0, 0],
        [4, 6, 0, 5, 13],
    ])
    msg = "One or more columns have all rows missing."
    assert_raise_message(ValueError, msg, imputer.fit, X)


def test_missforest_zero_part2():
    """Imputing with 0 as the placeholder matches imputing with NaN."""
    # Same seed, so both arrays differ only in the placeholder value
    X_zero = gen_array(min_val=1, missing_values=0)
    X_nan = gen_array(min_val=1, missing_values=np.nan)
    statistics_mean = np.nanmean(X_nan, axis=0)

    imputer_zero = MissForest(missing_values=0, random_state=1337)
    imputer_nan = MissForest(missing_values=np.nan, random_state=1337)

    assert_array_equal(imputer_zero.fit_transform(X_zero),
                       imputer_nan.fit_transform(X_nan))
    assert_array_equal(imputer_zero.statistics_.get("col_means"),
                       statistics_mean)


def test_missforest_numerical_single():
    """A single missing numerical value matches a hand-fitted regressor."""
    df = np.array([
        [1,      0,  0,  1],
        [2,      1,  2,  2],
        [3,      2,  3,  2],
        [np.nan, 4,  5,  5],
        [6,      7,  6,  7],
        [8,      8,  8,  8],
        [16,     15, 18, 19],
    ])
    statistics_mean = np.nanmean(df, axis=0)

    # Reference: regress column 0 on the remaining columns
    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestRegressor(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # pred_val has shape (1,); take the scalar so the literal below is a
    # regular (non-ragged) 2-D array.
    df_imputed = np.array([
        [1,           0,  0,  1],
        [2,           1,  2,  2],
        [3,           2,  3,  2],
        [pred_val[0], 4,  5,  5],
        [6,           7,  6,  7],
        [8,           8,  8,  8],
        [16,          15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    assert_array_equal(imputer.fit_transform(df), df_imputed)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)


def test_missforest_numerical_multiple():
    """Two missing numerical values, replayed over multiple iterations."""
    df = np.array([
        [1,      0,  np.nan, 1],
        [2,      1,  2,      2],
        [3,      2,  3,      2],
        [np.nan, 4,  5,      5],
        [6,      7,  6,      7],
        [8,      8,  8,      8],
        [16,     15, 18,     19],
    ])
    statistics_mean = np.nanmean(df, axis=0)
    n_cols = df.shape[1]

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values (column means)
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mean, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            # NOTE(review): n_estimators=100 presumably mirrors the
            # MissForest default - confirm against the estimator.
            rf = RandomForestRegressor(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)


def test_missforest_categorical_single():
    """A single missing categorical value matches a hand-fitted classifier."""
    df = np.array([
        [0,      0,  0,  1],
        [0,      1,  2,  2],
        [0,      2,  3,  2],
        [np.nan, 4,  5,  5],
        [1,      7,  6,  7],
        [1,      8,  8,  8],
        [1,      15, 18, 19],
    ])

    # Reference: classify column 0 from the remaining columns
    y = df[:, 0]
    X = df[:, 1:]
    good_rows = np.where(~np.isnan(y))[0]
    bad_rows = np.where(np.isnan(y))[0]

    rf = RandomForestClassifier(n_estimators=10, random_state=1337)
    rf.fit(X=X[good_rows], y=y[good_rows])
    pred_val = rf.predict(X[bad_rows])

    # pred_val has shape (1,); take the scalar so the literal below is a
    # regular (non-ragged) 2-D array.
    df_imputed = np.array([
        [0,           0,  0,  1],
        [0,           1,  2,  2],
        [0,           2,  3,  2],
        [pred_val[0], 4,  5,  5],
        [1,           7,  6,  7],
        [1,           8,  8,  8],
        [1,           15, 18, 19],
    ])

    imputer = MissForest(n_estimators=10, random_state=1337)
    # cat_vars is exercised both as a scalar and as a list
    assert_array_equal(imputer.fit_transform(df, cat_vars=0), df_imputed)
    assert_array_equal(imputer.fit_transform(df, cat_vars=[0]), df_imputed)


def test_missforest_categorical_multiple():
    """Two missing categorical values, replayed over multiple iterations."""
    df = np.array([
        [0,      0,  np.nan, 1],
        [0,      1,  1,      2],
        [0,      2,  1,      2],
        [np.nan, 4,  1,      5],
        [1,      7,  0,      7],
        [1,      8,  0,      8],
        [1,      15, 0,      19],
        [1,      18, 0,      17],
    ])
    cat_vars = [0, 2]
    # ravel() yields the per-column modes as a 1-D array whether SciPy
    # returns shape (1, n_cols) (older releases) or (n_cols,) (newer ones).
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    n_cols = df.shape[1]

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask
    nan_mask = np.isnan(df)
    nan_rows, nan_cols = np.where(nan_mask)

    # Make initial guess for missing values (column modes)
    df_imp2 = df.copy()
    df_imp2[nan_rows, nan_cols] = np.take(statistics_mode, nan_cols)

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict
            rf = RandomForestClassifier(n_estimators=100, random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_modes')[0],
                       statistics_mode[cat_vars])


def test_missforest_mixed_multiple():
    """Mixed data: one categorical and one numerical column with NaNs."""
    df = np.array([
        [np.nan, 0,  0,  1],
        [0,      1,  2,  2],
        [0,      2,  3,  2],
        [1,      4,  5,  5],
        [1,      7,  6,  7],
        [1,      8,  8,  8],
        [1,      15, 18, np.nan],
    ])

    n_cols = df.shape[1]
    cat_vars = [0]
    num_vars = np.setdiff1d(range(n_cols), cat_vars)
    # ravel() yields the per-column modes as a 1-D array whether SciPy
    # returns shape (1, n_cols) (older releases) or (n_cols,) (newer ones).
    statistics_mode = np.ravel(mode(df, axis=0, nan_policy='omit').mode)
    statistics_mean = np.nanmean(df, axis=0)

    # Fit missforest and transform
    imputer = MissForest(random_state=1337)
    df_imp1 = imputer.fit_transform(df, cat_vars=cat_vars)

    # Get iterations used by missforest above
    max_iter = imputer.iter_count_

    # Get NaN mask and the columns that contain NaNs
    nan_mask = np.isnan(df)
    nan_cols = np.where(nan_mask)[1]

    # Make initial guess for missing values:
    # mode for the categorical cell, mean for the numerical cell
    df_imp2 = df.copy()
    df_imp2[0, 0] = statistics_mode[0]
    df_imp2[6, 3] = statistics_mean[3]

    # Loop for max_iter count over the columns with NaNs
    for _ in range(max_iter):
        for c in nan_cols:
            # Identify all other columns (i.e. predictors)
            not_c = np.setdiff1d(np.arange(n_cols), c)
            # Identify rows with NaN and those without in 'c'
            y = df_imp2[:, c]
            X = df_imp2[:, not_c]
            good_rows = np.where(~nan_mask[:, c])[0]
            bad_rows = np.where(nan_mask[:, c])[0]

            # Fit model and predict: classifier for categorical columns,
            # regressor otherwise
            if c in cat_vars:
                rf = RandomForestClassifier(n_estimators=100,
                                            random_state=1337)
            else:
                rf = RandomForestRegressor(n_estimators=100,
                                           random_state=1337)
            rf.fit(X=X[good_rows], y=y[good_rows])
            pred_val = rf.predict(X[bad_rows])

            # Fill in values
            df_imp2[bad_rows, c] = pred_val

    assert_array_equal(df_imp1, df_imp2)
    assert_array_equal(imputer.statistics_.get('col_means'),
                       statistics_mean[num_vars])
    assert_array_equal(imputer.statistics_.get('col_modes')[0],
                       statistics_mode[cat_vars])


def test_statstics_fit_transform():
    """``statistics_`` reflects the data passed to ``fit``, not ``transform``."""
    X = np.array([
        [1,      0,  0,  1],
        [2,      1,  2,  2],
        [3,      2,  3,  2],
        [np.nan, 4,  5,  5],
        [6,      7,  6,  7],
        [8,      8,  8,  8],
        [16,     15, 18, 19],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    Y = np.array([
        [0,      0,  0,  0],
        [2,      2,  2,  1],
        [3,      2,  3,  2],
        [np.nan, 4,  5,  5],
        [6,      7,  6,  7],
        [9,      9,  8,  8],
        [16,     15, 18, 19],
    ])

    imputer = MissForest()
    imputer.fit(X).transform(Y)
    assert_array_equal(imputer.statistics_.get('col_means'), statistics_mean)


def test_default_with_invalid_input():
    """Default-configured imputer raises ``ValueError`` on invalid input."""
    # Test with all rows missing in a column
    X = np.array([
        [np.nan, 0, 0, 1],
        [np.nan, 1, 2, np.nan],
        [np.nan, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
    ])
    imputer = MissForest(random_state=1337)
    msg = "One or more columns have all rows missing."
    assert_raise_message(ValueError, msg, imputer.fit, X)

    # Test with inf present
    X = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2,      1, 2, 2, 3],
        [3,      2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6,      6, 2, 5, 7],
    ])
    msg = "+/- inf values are not supported."
    assert_raise_message(ValueError, msg, MissForest().fit, X)

    # Test with inf present in matrix passed in transform()
    X = np.array([
        [np.inf, 1, 1, 2, np.nan],
        [2,      1, 2, 2, 3],
        [3,      2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6,      6, 2, 5, 7],
    ])

    X_fit = np.array([
        [0,      1, 1, 2, np.nan],
        [2,      1, 2, 2, 3],
        [3,      2, 3, 3, 8],
        [np.nan, 6, 0, 5, 13],
        [np.nan, 7, 0, 7, 8],
        [6,      6, 2, 5, 7],
    ])
    msg = "+/- inf values are not supported."
    assert_raise_message(ValueError, msg, MissForest().fit(X_fit).transform, X)