import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.base import TransformerMixin
from sklearn.feature_extraction import FeatureHasher


# Custom transformer for preprocessing data based on feature definitions
class Preprocessor(TransformerMixin):
    """
    A class that preprocesses a given dataset based on feature definitions.
    This class automates One Hot Encoding, Hashing and Scaling.
    """

    def __init__(self, features, return_type='np', scale_hashed=True, missing="zeros", scaler="standard", **kwargs):
        """
        Initialize the Preprocessor object based on the features dataframe.
        The features dataframe must include these columns: Name, Variable_Type, Feature_Strategy.
        If Feature_Strategy includes hashing, the Hash_Features column must also be included.
        The dataframe must be indexed by Name.
        For further information on the columns refer to the project documentation:
        https://github.com/nabeel-qlik/qlik-py-tools
        """
        self.features = features
        self.return_type = return_type
        self.scale_hashed = scale_hashed
        self.missing = missing
        self.scaler = scaler
        self.kwargs = kwargs
        self.ohe = False
        self.hash = False
        self.scale = False
        self.no_prep = False

        # Collect features for one hot encoding
        self.ohe_meta = features.loc[features["feature_strategy"] == "one hot encoding"].copy()

        # Set a flag if one hot encoding will be required
        if len(self.ohe_meta) > 0:
            self.ohe = True

        # Collect features for hashing
        self.hash_meta = features.loc[features["feature_strategy"] == "hashing"].copy()

        # Set a flag if feature hashing will be required
        if len(self.hash_meta) > 0:
            self.hash = True

            # Convert the Hash_Features column to integers
            self.hash_meta.loc[:, "hash_features"] = self.hash_meta.loc[:, "hash_features"].astype(np.int64)

        # Collect features for scaling
        self.scale_meta = features.loc[features["feature_strategy"] == "scaling"].copy()

        # Set a flag if scaling will be required
        if len(self.scale_meta) > 0:
            self.scale = True

        # Collect other features
        self.none_meta = features.loc[features["feature_strategy"] == "none"].copy()

        # Set a flag if there are features that don't require preprocessing
        if len(self.none_meta) > 0:
            self.no_prep = True
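    # A minimal sketch of the expected feature definitions, assuming the
    # lowercase column names used throughout this class; the field names and
    # values below are illustrative only:
    #
    #   features = pd.DataFrame(
    #       {"variable_type": ["feature", "feature", "feature"],
    #        "feature_strategy": ["one hot encoding", "hashing", "scaling"],
    #        "hash_features": [None, 4, None]},
    #       index=pd.Index(["colour", "city", "age"], name="name"))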
    def fit(self, X, y=None, features=None, retrain=False):
        """
        Fit to the training dataset, storing information that will be needed for the transform dataset.
        Return the Preprocessor object.
        Optionally re-initialize the object by passing retrain=True and resending the features dataframe.
        """
        # Reinitialize this Preprocessor instance if required.
        # Note that __init__ mutates the instance in place and returns None,
        # so its result must not be assigned back to self.
        if retrain:
            if features is None:
                features = self.features
            self.__init__(features)

        # Get a subset of the data that requires one hot encoding
        self.ohe_df = X[self.ohe_meta.index.tolist()]

        # Apply one hot encoding to relevant columns
        self.ohe_df = pd.get_dummies(self.ohe_df, columns=self.ohe_df.columns)

        # Keep a copy of the OHE dataframe structure so we can align the transform dataset
        self.ohe_df_structure = pd.DataFrame().reindex_like(self.ohe_df)

        # Scaling needs to be fit exclusively on the training data so as not to influence the results
        if self.scale:
            # Get a subset of the data that requires scaling
            self.scale_df = X[self.scale_meta.index.tolist()]

        # If hashed columns need to be scaled, they need to be considered when setting up the scaler as well
        if self.hash and self.scale_hashed:
            # Get a subset of the data that requires feature hashing
            self.hash_df = X[self.hash_meta.index.tolist()]
            hash_cols = self.hash_df.columns

            # Hash unique values for each relevant column and then join to a dataframe for hashed data
            for c in hash_cols:
                unique = self.hasher(self.hash_df, c, self.hash_meta["hash_features"].loc[c])
                self.hash_df = self.hash_df.join(unique, on=c)
                self.hash_df = self.hash_df.drop(c, axis=1)

            if self.scale:
                self.scale_df = self.scale_df.join(self.hash_df)
            else:
                self.scale_df = self.hash_df

        # self.scale_df only exists if scaling or scaled hashing is required,
        # so guard the attribute access accordingly
        if (self.scale or (self.hash and self.scale_hashed)) and len(self.scale_df) > 0:
            # Get an instance of the sklearn scaler fit to X
            self.scaler_instance = self.get_scaler(self.scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs)

        return self
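    # A hypothetical retraining call, assuming new_features is an updated
    # feature definitions dataframe (illustrative only):
    #
    #   prep = prep.fit(X_train, features=new_features, retrain=True)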
    def transform(self, X, y=None):
        """
        Transform X with the encoding and scaling requirements set by fit().
        This function will perform One Hot Encoding, Feature Hashing and Scaling on X.
        Returns X_transform as a numpy array or a pandas dataframe based on the return_type set in the constructor.
        """
        self.X_transform = None

        if self.ohe:
            # Get a subset of the data that requires one hot encoding
            self.ohe_df = X[self.ohe_meta.index.tolist()]

            # Apply one hot encoding to relevant columns
            self.ohe_df = pd.get_dummies(self.ohe_df, columns=self.ohe_df.columns)

            # Align the columns with the original dataset.
            # This prevents a different number or order of features between training and test datasets.
            self.ohe_df = self.ohe_df.align(self.ohe_df_structure, join='right', axis=1)[0]

            # Fill missing values in the OHE dataframe, which may appear after alignment, with zeros
            self.ohe_df = self.fillna(self.ohe_df, missing="zeros")

            # Add the encoded columns to the result dataset
            self.X_transform = self.ohe_df

        if self.hash:
            # Get a subset of the data that requires feature hashing
            self.hash_df = X[self.hash_meta.index.tolist()]
            hash_cols = self.hash_df.columns

            # Hash unique values for each relevant column and then join to a dataframe for hashed data
            for c in hash_cols:
                unique = self.hasher(self.hash_df, c, self.hash_meta["hash_features"].loc[c])
                self.hash_df = self.hash_df.join(unique, on=c)
                self.hash_df = self.hash_df.drop(c, axis=1)

        if self.scale:
            # Get a subset of the data that requires scaling
            self.scale_df = X[self.scale_meta.index.tolist()]

        # If scale_hashed is True, join the hashed columns to the scaling dataframe
        if self.hash and self.scale_hashed:
            if self.scale:
                self.scale_df = self.scale_df.join(self.hash_df)
            else:
                self.scale_df = self.hash_df
                # If only hashed columns are being scaled, the scaler needs to be instantiated here
                self.scaler_instance = self.get_scaler(self.scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs)
        elif self.hash:
            # Add the hashed columns to the result dataset
            if self.X_transform is None:
                self.X_transform = self.hash_df
            else:
                self.X_transform = self.X_transform.join(self.hash_df)

        # Perform scaling on the relevant data.
        # self.scale_df only exists if scaling or scaled hashing is required, so guard the access.
        if (self.scale or (self.hash and self.scale_hashed)) and len(self.scale_df) > 0:
            self.scale_df = self.fillna(self.scale_df, missing=self.missing)
            self.scale_df = pd.DataFrame(self.scaler_instance.transform(self.scale_df),
                                         index=self.scale_df.index, columns=self.scale_df.columns)

            # Add the scaled columns to the result dataset
            if self.X_transform is None:
                self.X_transform = self.scale_df
            else:
                self.X_transform = self.X_transform.join(self.scale_df)

        if self.no_prep:
            # Get a subset of the data that doesn't require preprocessing
            self.no_prep_df = X[self.none_meta.index.tolist()]

            # Finally, join the columns that do not require preprocessing to the result dataset
            if self.X_transform is None:
                self.X_transform = self.no_prep_df
            else:
                self.X_transform = self.X_transform.join(self.no_prep_df)

        if self.return_type == 'np':
            return self.X_transform.values

        return self.X_transform

    def fit_transform(self, X, y=None, features=None, retrain=False):
        """
        Apply fit() then transform().
        """
        if features is None:
            features = self.features

        return self.fit(X, y, features, retrain).transform(X, y)

    @staticmethod
    def hasher(df, col, n_features):
        """
        Hash the unique values in the specified column of the given dataframe, creating n_features columns.
        """
        unique = pd.DataFrame(df[col].unique(), columns=[col])
        fh = FeatureHasher(n_features=n_features, input_type="string")
        hashed = fh.fit_transform(unique.loc[:, col])
        unique = unique.join(pd.DataFrame(hashed.toarray()).add_prefix(col))

        return unique.set_index(col)
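    # For example, hasher(df, "city", 4) returns a dataframe indexed by the
    # unique values of "city" with hashed columns city0 to city3, which the
    # callers above join back onto the data via df.join(unique, on="city").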
    @staticmethod
    def fillna(df, missing="zeros"):
        """
        Fill empty values in a Data Frame with the chosen method.
        Valid options for missing are: zeros, mean, median, mode, none.
        """
        if missing == "mean":
            return df.fillna(df.mean())
        elif missing == "median":
            return df.fillna(df.median())
        elif missing == "mode":
            return df.fillna(df.mode().iloc[0])
        elif missing == "none":
            return df
        else:
            return df.fillna(0)

    @staticmethod
    def get_scaler(df, missing="zeros", scaler="standard", **kwargs):
        """
        Fit a sklearn scaler on a Data Frame and return the scaler.
        Valid options for the scaler are: standard, minmax, maxabs, robust, quantile.
        Missing values must be dealt with before the scaling is applied.
        Valid options specified through the missing parameter are: zeros, mean, median, mode, none.
        """
        scalers = {'standard': 'StandardScaler', 'minmax': 'MinMaxScaler', 'maxabs': 'MaxAbsScaler',
                   'robust': 'RobustScaler', 'quantile': 'QuantileTransformer'}

        s = getattr(preprocessing, scalers[scaler])
        s = s(**kwargs)

        df = Preprocessor.fillna(df, missing=missing)

        return s.fit(df)
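
# A minimal end-to-end usage sketch. The feature definitions and sample data
# below are illustrative assumptions, not part of the class; they use the
# lowercase metadata column names expected above.
if __name__ == "__main__":
    # Feature definitions, indexed by field name
    features = pd.DataFrame(
        {"variable_type": ["feature", "feature", "feature", "feature"],
         "feature_strategy": ["one hot encoding", "hashing", "scaling", "none"],
         "hash_features": [None, 4, None, None]},
        index=pd.Index(["colour", "city", "age", "flag"], name="name"))

    X_train = pd.DataFrame({"colour": ["red", "green", "red"],
                            "city": ["London", "Paris", "Rome"],
                            "age": [25.0, 32.0, 47.0],
                            "flag": [1, 0, 1]})

    X_test = pd.DataFrame({"colour": ["green", "blue"],
                           "city": ["Rome", "Madrid"],
                           "age": [29.0, 51.0],
                           "flag": [0, 1]})

    prep = Preprocessor(features, return_type='df')

    # Fit on the training data, then transform both datasets. The unseen
    # category "blue" is dropped by the OHE alignment in transform() and the
    # resulting gaps are filled with zeros, so both outputs share one schema.
    X_train_t = prep.fit_transform(X_train)
    X_test_t = prep.transform(X_test)

    print(X_train_t)
    print(X_test_t)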