# coding:utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris

from heamy.feature import onehot_features, factorize, woe, mean_target


def test_onehot():
    """Check the column counts produced by ``onehot_features``.

    The Boston data is split 85/15 with a fixed seed, columns 8, 1 and 12
    are one-hot encoded under two configurations, and for each one the
    train and test outputs must have the same number of columns, equal to
    the expected total.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2 -- confirm
    # the scikit-learn version this suite is pinned to.
    boston = load_boston()
    features = boston['data']
    target = boston['target']
    X_train, X_test, _, _ = train_test_split(
        features, target, test_size=0.15, random_state=333
    )
    train = pd.DataFrame(X_train)
    test = pd.DataFrame(X_test)

    # (keyword options, expected number of output columns)
    scenarios = (
        ({'full': False, 'dummy_na': True}, 441),
        ({'full': True, 'dummy_na': False}, 500),
    )
    for options, expected_width in scenarios:
        # Deep copies keep the source frames untouched between scenarios.
        enc_train, enc_test = onehot_features(
            train.copy(deep=True), test.copy(deep=True), [8, 1, 12], **options
        )
        assert enc_train.shape[1] == enc_test.shape[1]
        assert enc_train.shape[1] == expected_width


def test_factorize():
    """Check ``factorize`` with ``full=True`` and with ``full=False``.

    Column ``b`` of the test frame contains the value ``'z'``, which does
    not occur in the train frame.
    """
    train = pd.DataFrame({'a': ['a', 'b', 'c'], 'b': ['a', 'b', 'c']})
    test = pd.DataFrame({'a': ['a', 'b', 'c'], 'b': ['z', 'b', 'c']})

    enc_train, enc_test = factorize(
        train.copy(deep=True), test.copy(deep=True), ['a', 'b'], full=True
    )
    # Both encoded columns must come back with an integer dtype.
    for frame in (enc_train, enc_test):
        int_columns = frame.select_dtypes(include=[np.int64, np.int32]).columns
        assert len(int_columns) == 2
    assert enc_test['b'].nunique() == 3

    enc_train, enc_test = factorize(
        train.copy(deep=True),
        test.copy(deep=True),
        ['a', 'b'],
        full=False,
        na_value=np.nan,
    )
    # After dropping NaN, only two distinct codes remain in test['b'].
    assert enc_test['b'].dropna().nunique() == 2


def test_target_transformations():
    """Check that ``woe`` and ``mean_target`` return one value per input row."""
    n_rows = 100
    frame = pd.DataFrame(
        np.random.randint(2, size=(n_rows, 2)), columns=('x', 'target')
    )

    encoded = woe(frame, 'x', 'target')
    assert encoded.shape[0] == 100

    # mean_target is exercised with a numeric C and with C=None.
    for c_value in (10, None):
        encoded = mean_target(frame, 'x', 'target', C=c_value)
        assert encoded.shape[0] == 100