""" Semantic variation in the "Midwest" =================================== Here's some survey data with one dirty column, consisting of an open-ended question, on which one-hot encoding does not work well. The other columns are more traditional categorical or numerical variables. Let's see how different encoding for the dirty column impact on the score of a classification problem. """ ################################################################################ # Loading the data # ---------------- from dirty_cat.datasets import fetch_midwest_survey import pandas as pd dataset = fetch_midwest_survey() df = pd.read_csv(dataset['path']).astype(str) ################################################################################ # The challenge with this data is that it contains a free-form input # column, where people put whatever they want: dirty_column = 'In your own words, what would you call the part of the country you live in now?' print(df[dirty_column].value_counts()[-10:]) ################################################################################ # Separating clean, and dirty columns as well a a column we will try to predict # ------------------------------------------------------------------------------ target_column = 'Location (Census Region)' clean_columns = [ 'Personally identification as a Midwesterner?', 'Illinois in MW?', 'Indiana in MW?', 'Kansas in MW?', 'Iowa in MW?', 'Michigan in MW?', 'Minnesota in MW?', 'Missouri in MW?', 'Nebraska in MW?', 'North Dakota in MW?', 'Ohio in MW?', 'South Dakota in MW?', 'Wisconsin in MW?', 'Arkansas in MW?', 'Colorado in MW?', 'Kentucky in MW?', 'Oklahoma in MW?', 'Pennsylvania in MW?', 'West Virginia in MW?', 'Montana in MW?', 'Wyoming in MW?', 'Gender', 'Age', 'Household Income', 'Education'] y = df[target_column].values.ravel() ############################################################################## # A pipeline for data fitting and prediction # ------------------------------------------- # We first import the right encoders to transform our clean/dirty data: from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from dirty_cat import SimilarityEncoder, MinHashEncoder encoder_dict = { 'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False), 'similarity': SimilarityEncoder(similarity='ngram'), 'minhash': MinHashEncoder(n_components=10, ngram_range=(2, 4), hashing='fast', minmax_hash=False), 'num': FunctionTransformer(None) } ############################################################################## # All the clean columns are encoded once and for all, but since we # benchmark different categorical encodings for the dirty variable, # we create a function that takes an encoding as an input, and returns a \ # scikit-learn pipeline for our problem. from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier def make_pipeline(encoding_method): # static transformers from the other columns transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)] # adding the encoded column transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method], [dirty_column])] pipeline = Pipeline([ # Use ColumnTransformer to combine the features ('union', ColumnTransformer( transformers=transformers, remainder='drop')), ('scaler', StandardScaler(with_mean=False)), ('classifier', RandomForestClassifier(random_state=5)) ]) return pipeline ############################################################################### # Evaluation of different encoding methods # ----------------------------------------- # We then loop over encoding methods, scoring the different pipeline predictions # using a cross validation score: from sklearn.model_selection import cross_val_score from sklearn.model_selection import StratifiedKFold cv = StratifiedKFold(n_splits=3, random_state=12, shuffle=True) all_scores = {} for method in ['one-hot', 'similarity', 'minhash']: pipeline = make_pipeline(method) # Now predict the census region of each participant scores = cross_val_score(pipeline, df, y, cv=cv) all_scores[method] = scores print('%s encoding' % method) print('Accuracy score: mean: %.3f; std: %.3f\n' % (scores.mean(), scores.std())) ############################################################################### # Plot the results # ------------------ import seaborn import matplotlib.pyplot as plt plt.figure(figsize=(4, 3)) ax = seaborn.boxplot(data=pd.DataFrame(all_scores), orient='h') plt.ylabel('Encoding', size=20) plt.xlabel('Prediction accuracy ', size=20) plt.yticks(size=20) plt.tight_layout() ############################################################################### # We can see that encoding the data using a SimilarityEncoder instead of # OneHotEncoder helps a lot in improving the cross validation score!