import os
import gc
import sys
import time
import copy
import joblib
import numpy as np
import pandas as pd
import warnings

# Suppress warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

from pathlib import Path
from sklearn import preprocessing
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.metaestimators import if_delegate_has_method

# Workaround for Keras issue #1406
# "Using X backend." always printed to stdout #1406
# https://github.com/keras-team/keras/issues/1406
stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
import keras
from keras import backend as kerasbackend
from keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor
sys.stderr = stderr

import _utils as utils


class PersistentModel:
    """
    A general class to manage persistent models.

    Instances are serialized to '<path><name>.joblib' via joblib; a companion
    '<name>.h5' HDF5 file holds the Keras model weights when using_keras is True.
    A '<name>.joblib.lock' file acts as a crude write lock between processes.
    """

    def __init__(self):
        """
        Basic constructor
        """
        # Model identifier; set by save()
        self.name = None
        # Lifecycle state string, e.g. 'saved'; set by save()
        self.state = None
        # Epoch seconds of the last state change
        self.state_timestamp = None
        # When True, the Keras estimator's model is persisted to a separate HDF5 file
        self.using_keras = False

    def save(self, name, path, overwrite=True, compress=3, locked_timeout=2):
        """
        Save the model to disk at the specified path.
        If the model already exists and overwrite=False, throw an exception.
        If overwrite=True, replace any existing file with the same name at the path.
        If the model is found to be locked, wait 'locked_timeout' seconds and try again before quitting.

        :param name: model name; used as the file stem for the .joblib and .h5 files
        :param path: directory to save into (created if missing)
        :param overwrite: replace an existing model file of the same name
        :param compress: joblib compression level
        :param locked_timeout: seconds to wait before re-checking the lock file
        :returns: self
        :raises FileExistsError: model exists and overwrite is False
        :raises TimeoutError: lock file still present after the wait
        """

        # Create string for path and file name
        f = path + name + '.joblib'
        # Create a path for the lock file
        f_lock = f + '.lock'

        # Create the directory if required
        try:
            Path(path).mkdir(parents=True, exist_ok=False)
        except FileExistsError:
            pass

        # If the file exists and overwriting is not allowed, raise an exception
        if Path(f).exists() and not overwrite:
            raise FileExistsError("The specified model name already exists: {0}.".format(name + '.joblib')\
                +"\nPass overwrite=True if it is ok to overwrite.")
        # Check if the file is currently locked
        elif Path(f_lock).exists():
            # Wait a few seconds and check again
            time.sleep(locked_timeout)

            # If the file is still locked raise an exception
            if Path(f_lock).exists():
                raise TimeoutError("The specified model is locked. If you believe this to be wrong, please delete file {0}".format(f_lock))
        else:
            # NOTE(review): if the lock file existed but was released during the wait above,
            # this else branch is skipped and the model is NOT saved (only `return self` runs).
            # Confirm whether a retry was intended in that case.

            # Update properties
            self.name = name
            self.state = 'saved'
            self.state_timestamp = time.time()

            # Keras models are excluded from the joblib file as they are saved to a special HDF5 file in _sklearn.py
            try:
                if self.using_keras:
                    # Get the trained keras model from the pipeline's estimator
                    keras_model = self.pipe.named_steps['estimator'].model

                    # Save the keras model architecture and weights to disk
                    keras_model.save(path + name + '.h5', overwrite=overwrite)

                    # The Keras estimator is excluded from the model saved to the joblib file
                    self.pipe.named_steps['estimator'].model = None
            except AttributeError:
                # self.pipe may not exist if fit() has never been called; best-effort save
                pass

            # Create the lock file
            joblib.dump(f_lock, filename=Path(f_lock), compress=compress)

            try:
                # Store this instance to file
                joblib.dump(self, filename=Path(f), compress=compress)
            finally:
                # Delete the lock file
                Path(f_lock).unlink()

        return self

    def load(self, name, path):
        """
        Check if the model exists at the specified path and return it to the caller.
        If the model is not found throw an exception.

        :param name: model name (file stem) to load
        :param path: directory containing the .joblib (and optional .h5) files
        :returns: the deserialized PersistentModel instance
        """

        with open(Path(path + name + '.joblib'), 'rb') as f:
            # Rebind self to the stored instance; the original receiver is discarded
            self = joblib.load(f)

        # If using Keras we need to load the HDF5 file as well
        # The model will only be available if the fit method has been called previously
        if self.using_keras and hasattr(self, 'pipe'):
            # Avoid tensorflow error for keras models
            # https://github.com/tensorflow/tensorflow/issues/14356
            # https://stackoverflow.com/questions/40785224/tensorflow-cannot-interpret-feed-dict-key-as-tensor
            kerasbackend.clear_session()

            # Load the keras model architecture and weights from disk
            keras_model = keras.models.load_model(path + name + '.h5')
            # NOTE(review): _make_predict_function() is private Keras API (pre-builds the predict
            # function for cross-thread use); verify against the pinned Keras version.
            keras_model._make_predict_function()

            # Point the estimator in the sklearn pipeline to the keras model architecture and weights
            self.pipe.named_steps['estimator'].model = keras_model

        return self


class Preprocessor(TransformerMixin):
    """
    A class that preprocesses a given dataset based on feature definitions passed as a dataframe.
    This class automates One Hot Encoding, Hashing, Text Vectorizing and Scaling.
    """

    def __init__(self, features, return_type='np', scale_hashed=True, scale_vectors=True, missing="zeros", scaler="StandardScaler", logfile=None, **kwargs):
        """
        Initialize the Preprocessor object based on the features dataframe.
        **kwargs are keyword arguments passed to the sklearn scaler instance.
        The features dataframe must include these columns: name, variable_type, feature_strategy.
        If Feature_Strategy includes hashing or text vectorizing, the strategy_args column must also be included.
        The dataframe must be indexed by name.
        For further information on the columns refer to the project documentation:
        https://github.com/nabeel-oz/qlik-py-tools
        """

        self.features = features
        self.return_type = return_type
        self.scale_hashed = scale_hashed
        self.scale_vectors = scale_vectors
        self.missing = missing
        self.scaler = scaler
        self.kwargs = kwargs

        # One flag per preprocessing strategy; set below when matching features exist
        self.ohe = False
        self.hash = False
        self.cv = False
        self.tfidf = False
        self.text = False
        self.scale = False
        self.no_prep = False

        # Optional path to a log file for debug output
        self.log = logfile

        # Collect features for one hot encoding
        self.ohe_meta = features.loc[features["feature_strategy"] == "one hot encoding"].copy()

        # Set a flag if one hot encoding will be required
        if len(self.ohe_meta) > 0:
            self.ohe = True

        # Collect features for hashing
        self.hash_meta = features.loc[features["feature_strategy"] == "hashing"].copy()

        # Set a flag if feature hashing will be required
        if len(self.hash_meta) > 0:
            self.hash = True

            # Convert strategy_args column to integers (number of hash features per column)
            self.hash_meta.loc[:,"strategy_args"] = self.hash_meta.loc[:,"strategy_args"].astype(np.int64, errors="ignore")

        # Collect features for count vectorizing
        self.cv_meta = features.loc[features["feature_strategy"] == "count_vectorizing"].copy()

        # Set a flag if count vectorizing will be required
        if len(self.cv_meta) > 0:
            self.cv = True

            # Convert strategy_args column to key word arguments for the sklearn CountVectorizer class
            self.cv_meta.loc[:,"strategy_args"] = self.cv_meta.loc[:,"strategy_args"].apply(utils.get_kwargs).\
            apply(utils.get_kwargs_by_type)

        # Collect features for term frequency inverse document frequency (TF-IDF) vectorizing
        self.tfidf_meta = features.loc[features["feature_strategy"] == "tf_idf"].copy()

        # Set a flag if tfidf vectorizing will be required
        if len(self.tfidf_meta) > 0:
            self.tfidf = True

            # Convert strategy_args column to key word arguments for the sklearn TfidfVectorizer class
            self.tfidf_meta.loc[:,"strategy_args"] = self.tfidf_meta.loc[:,"strategy_args"].apply(utils.get_kwargs).\
            apply(utils.get_kwargs_by_type)

        # Collect features for text similarity one hot encoding
        self.text_meta = features.loc[features["feature_strategy"] == "text_similarity"].copy()

        # Set a flag if text similarity OHE will be required
        if len(self.text_meta) > 0:
            self.text = True

        # Collect features for scaling
        self.scale_meta = features.loc[features["feature_strategy"] == "scaling"].copy()

        # Set a flag if scaling will be required
        if len(self.scale_meta) > 0:
            self.scale = True

        # Collect other features
        self.none_meta = features.loc[features["feature_strategy"] == "none"].copy()

        # Set a flag if there are features that don't require preprocessing
        if len(self.none_meta) > 0:
            self.no_prep = True

        # Output information to the terminal and log file if required
        if self.log is not None:
            self._print_log(1)

    def fit(self, X, y=None, features=None, retrain=False):
        """
        Fit to the training dataset, storing information that will be needed for the transform dataset.
        Return the Preprocessor object.
        Optionally re-initialise the object by passing retrain=True, and resending the features dataframe.

        :param X: training dataframe, columns matching the index of the features dataframe
        :param y: unused; kept for sklearn API compatibility
        :param features: replacement features dataframe, used only when retrain=True
        :param retrain: when True, re-run __init__ before fitting
        :returns: self
        """
        # Reinitialize this Preprocessor instance if required
        if retrain:
            if features is None:
                features = self.features

            self.__init__(features)

        # Set up an empty data frame for data to be scaled
        scale_df = pd.DataFrame()
        ohe_df = None
        hash_df = None
        cv_df = None
        tfidf_df = None
        text_df = None

        if self.ohe:
            # Get a subset of the data that requires one hot encoding
            ohe_df = X[self.ohe_meta.index.tolist()]

            # Apply one hot encoding to relevant columns
            ohe_df = pd.get_dummies(ohe_df, columns=ohe_df.columns)

            # Keep a copy of the OHE dataframe structure so we can align the transform dataset
            self.ohe_df_structure = pd.DataFrame().reindex_like(ohe_df)

        # Scaling needs to be fit exclusively on the training data so as not to influence the results
        if self.scale:
            # Get a subset of the data that requires scaling
            scale_df = X[self.scale_meta.index.tolist()]

        if self.hash:
            # Get a subset of the data that requires feature hashing
            hash_df = X[self.hash_meta.index.tolist()]
            hash_cols = hash_df.columns

            # Hash unique values for each relevant column and then join to a dataframe for hashed data
            for c in hash_cols:
                unique = self.hasher(hash_df, c, self.hash_meta["strategy_args"].loc[c])
                hash_df = hash_df.join(unique, on=c)
                hash_df = hash_df.drop(c, axis=1)

            # If hashed columns need to be scaled, these need to be considered when setting up the scaler as well
            if self.scale_hashed:
                if self.scale:
                    scale_df = scale_df.join(hash_df)
                else:
                    scale_df = hash_df

        if self.cv:
            # Get a subset of the data that requires count vectorizing
            cv_df = X[self.cv_meta.index.tolist()]
            cv_cols = cv_df.columns

            # Get count vectors for each relevant column and then join to a dataframe for count vectorized data
            for c in cv_cols:
                unique = self.text_vectorizer(cv_df, c, type="count", **self.cv_meta["strategy_args"].loc[c])
                cv_df = cv_df.join(unique, on=c)
                cv_df = cv_df.drop(c, axis=1)

            # Keep a copy of the count vectorized dataframe structure so we can align the transform dataset
            self.cv_df_structure = pd.DataFrame().reindex_like(cv_df)

            # If text vector columns need to be scaled, these need to be considered when setting up the scaler as well
            if self.scale_vectors:
                if self.scale or (self.scale_hashed and self.hash):
                    scale_df = scale_df.join(cv_df)
                else:
                    scale_df = cv_df

        if self.tfidf:
            # Get a subset of the data that requires tfidf vectorizing
            tfidf_df = X[self.tfidf_meta.index.tolist()]
            tfidf_cols = tfidf_df.columns

            # Get tfidf vectors for each relevant column and then join to a dataframe for tfidf vectorized data
            for c in tfidf_cols:
                unique = self.text_vectorizer(tfidf_df, c, type="tfidf", **self.tfidf_meta["strategy_args"].loc[c])
                tfidf_df = tfidf_df.join(unique, on=c)
                tfidf_df = tfidf_df.drop(c, axis=1)

            # Keep a copy of the tfidf vectorized dataframe structure so we can align the transform dataset
            self.tfidf_df_structure = pd.DataFrame().reindex_like(tfidf_df)

            # If text vector columns need to be scaled, these need to be considered when setting up the scaler as well
            if self.scale_vectors:
                if self.scale or (self.scale_hashed and self.hash) or self.cv:
                    scale_df = scale_df.join(tfidf_df)
                else:
                    scale_df = tfidf_df

        if self.text:
            # Get a subset of the data that requires text similarity OHE
            text_df = X[self.text_meta.index.tolist()]
            text_cols = text_df.columns

            # Get text similarity OHE for each relevant column and then join to a dataframe for text similarity OHE data
            for c in text_cols:
                unique = self.text_similarity(text_df, c)
                text_df = text_df.join(unique, on=c)
                text_df = text_df.drop(c, axis=1)

            # Keep a copy of the text similarity OHE dataframe structure so we can align the transform dataset
            self.text_df_structure = pd.DataFrame().reindex_like(text_df)

        try:
            if len(scale_df) > 0:
                # Get an instance of the sklearn scaler fit to X
                self.scaler_instance = utils.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs)

                # Keep a copy of the scaling dataframe structure so we can align the transform dataset
                self.scale_df_structure = pd.DataFrame().reindex_like(scale_df)
        except AttributeError:
            pass

        # Output information to the terminal and log file if required
        if self.log is not None:
            self._print_log(2, ohe_df=ohe_df, scale_df=scale_df, hash_df=hash_df, cv_df=cv_df, tfidf_df=tfidf_df, text_df=text_df)

        return self

    def transform(self, X, y=None):
        """
        Transform X with the encoding and scaling requirements set by fit().
        This function will perform One Hot Encoding, Feature Hashing and Scaling on X.
        Returns X_transform as a numpy array or a pandas dataframe based on return_type set in constructor.

        :param X: dataframe with the same feature columns that fit() was called with
        :param y: unused; kept for sklearn API compatibility
        :returns: transformed data (numpy array when return_type == 'np', else DataFrame)
        """
        X_transform = None
        scale_df = pd.DataFrame()  # Initialize as empty Data Frame for convenience of concat operations below
        ohe_df = None
        hash_df = None
        cv_df = None
        tfidf_df = None
        text_df = None

        if self.ohe:
            # Get a subset of the data that requires one hot encoding
            ohe_df = X[self.ohe_meta.index.tolist()]

            # Apply one hot encoding to relevant columns
            ohe_df = pd.get_dummies(ohe_df, columns=ohe_df.columns)

            # Align the columns with the original dataset.
            # This is to prevent different number or order of features between training and test datasets.
            ohe_df = ohe_df.align(self.ohe_df_structure, join='right', axis=1)[0]

            # Fill missing values in the OHE dataframe, that may appear after alignment, with zeros.
            ohe_df = utils.fillna(ohe_df, method="zeros")

            # Add the encoded columns to the result dataset
            X_transform = ohe_df

        if self.hash:
            # Get a subset of the data that requires feature hashing
            hash_df = X[self.hash_meta.index.tolist()]
            hash_cols = hash_df.columns

            # Hash unique values for each relevant column and then join to a dataframe for hashed data
            for c in hash_cols:
                unique = self.hasher(hash_df, c, self.hash_meta["strategy_args"].loc[c])
                hash_df = hash_df.join(unique, on=c)
                hash_df = hash_df.drop(c, axis=1)

            # Fill any missing values in the hash dataframe
            hash_df = utils.fillna(hash_df, method="zeros")

        if self.cv:
            # Get a subset of the data that requires count vectorizing
            cv_df = X[self.cv_meta.index.tolist()]
            cv_cols = cv_df.columns

            # Get count vectors for each relevant column and then join to a dataframe for count vectorized data
            for c in cv_cols:
                unique = self.text_vectorizer(cv_df, c, type="count", **self.cv_meta["strategy_args"].loc[c])
                cv_df = cv_df.join(unique, on=c)
                cv_df = cv_df.drop(c, axis=1)

            # Align the columns with the original dataset.
            # This is to prevent different number or order of features between training and test datasets.
            cv_df = cv_df.align(self.cv_df_structure, join='right', axis=1)[0]

            # Fill missing values in the dataframe that may appear after alignment with zeros.
            cv_df = utils.fillna(cv_df, method="zeros")

        if self.tfidf:
            # Get a subset of the data that requires tfidf vectorizing
            tfidf_df = X[self.tfidf_meta.index.tolist()]
            tfidf_cols = tfidf_df.columns

            # Get tfidf vectors for each relevant column and then join to a dataframe for tfidf vectorized data
            for c in tfidf_cols:
                unique = self.text_vectorizer(tfidf_df, c, type="tfidf", **self.tfidf_meta["strategy_args"].loc[c])
                tfidf_df = tfidf_df.join(unique, on=c)
                tfidf_df = tfidf_df.drop(c, axis=1)

            # Align the columns with the original dataset.
            # This is to prevent different number or order of features between training and test datasets.
            tfidf_df = tfidf_df.align(self.tfidf_df_structure, join='right', axis=1)[0]

            # Fill missing values in the dataframe that may appear after alignment with zeros.
            tfidf_df = utils.fillna(tfidf_df, method="zeros")

        if self.text:
            # Get a subset of the data that requires text similarity OHE
            text_df = X[self.text_meta.index.tolist()]
            text_cols = text_df.columns

            # Get text similarity OHE for each relevant column and then join to a dataframe for text similarity OHE data
            for c in text_cols:
                unique = self.text_similarity(text_df, c)
                text_df = text_df.join(unique, on=c)
                text_df = text_df.drop(c, axis=1)

            # Align the columns with the original dataset.
            # This is to prevent different number or order of features between training and test datasets.
            text_df = text_df.align(self.text_df_structure, join='right', axis=1)[0]

            # Fill missing values in the dataframe that may appear after alignment with zeros.
            text_df = utils.fillna(text_df, method="zeros")

            # Add the text similarity OHE data to the result dataset
            if X_transform is None:
                X_transform = text_df
            else:
                X_transform = pd.concat([X_transform, text_df], join='outer', axis=1, sort=False)

        if self.scale:
            # Get a subset of the data that requires scaling
            scale_df = X[self.scale_meta.index.tolist()]

        # If scale_hashed = True join the hashed columns to the scaling dataframe
        if self.hash and self.scale_hashed:
            if self.scale:
                scale_df = pd.concat([scale_df, hash_df], join='outer', axis=1, sort=False)
            else:
                scale_df = hash_df
                # If only hashed columns are being scaled, the scaler needs to be instantiated
                # NOTE(review): this re-fits the scaler on the transform data rather than reusing
                # the instance fit in fit() - confirm whether this refit is intended.
                self.scaler_instance = utils.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs)
        elif self.hash:
            # Add the hashed columns to the result dataset
            if X_transform is None:
                X_transform = hash_df
            else:
                X_transform = pd.concat([X_transform, hash_df], join='outer', axis=1, sort=False)

        # If scale_vectors = True join the count vectorized columns to the scaling dataframe
        if self.cv and self.scale_vectors:
            if self.scale or (self.hash and self.scale_hashed):
                scale_df = pd.concat([scale_df, cv_df], join='outer', axis=1, sort=False)
            else:
                scale_df = cv_df
                # If only count vectorized columns are being scaled, the scaler needs to be instantiated
                # NOTE(review): re-fits the scaler on transform data; see note above.
                self.scaler_instance = utils.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs)
        elif self.cv:
            # Add the count vectorized columns to the result dataset
            if X_transform is None:
                X_transform = cv_df
            else:
                X_transform = pd.concat([X_transform, cv_df], join='outer', axis=1, sort=False)

        # If scale_vectors = True join the tfidf vectorized columns to the scaling dataframe
        if self.tfidf and self.scale_vectors:
            if self.scale or (self.hash and self.scale_hashed) or self.cv:
                scale_df = pd.concat([scale_df, tfidf_df], join='outer', axis=1, sort=False)
            else:
                scale_df = tfidf_df
                # If only tfidf vectorized columns are being scaled, the scaler needs to be instantiated
                # NOTE(review): re-fits the scaler on transform data; see note above.
                self.scaler_instance = utils.get_scaler(scale_df, missing=self.missing, scaler=self.scaler, **self.kwargs)
        elif self.tfidf:
            # Add the tfidf vectorized columns to the result dataset
            if X_transform is None:
                X_transform = tfidf_df
            else:
                X_transform = pd.concat([X_transform, tfidf_df], join='outer', axis=1, sort=False)

        try:
            # Perform scaling on the relevant data
            if len(scale_df) > 0:
                # Align the columns with the original dataset.
                # This is to prevent different number or order of features between training and test datasets.
                scale_df = scale_df.align(self.scale_df_structure, join='right', axis=1)[0]

                scale_df = utils.fillna(scale_df, method=self.missing)

                scale_df = pd.DataFrame(self.scaler_instance.transform(scale_df), index=scale_df.index, columns=scale_df.columns)

                # Add the scaled columns to the result dataset
                if X_transform is None:
                    X_transform = scale_df
                else:
                    X_transform = pd.concat([X_transform, scale_df], join='outer', axis=1, sort=False)
        except AttributeError:
            pass

        if self.no_prep:
            # Get a subset of the data that doesn't require preprocessing
            no_prep_df = X[self.none_meta.index.tolist()]

            # Fill any missing values in the no prep dataframe
            no_prep_df = utils.fillna(no_prep_df, method="zeros")

            # Finally join the columns that do not require preprocessing to the result dataset
            if X_transform is None:
                X_transform = no_prep_df
            else:
                X_transform = pd.concat([X_transform, no_prep_df], join='outer', axis=1, sort=False)

        # Output information to the terminal and log file if required
        if self.log is not None:
            self._print_log(3, ohe_df=ohe_df, scale_df=scale_df, hash_df=hash_df, cv_df=cv_df, tfidf_df=tfidf_df, text_df=text_df, X_transform=X_transform)

        if self.return_type == 'np':
            return X_transform.values

        return X_transform

    def fit_transform(self, X, y=None, features=None, retrain=False):
        """
        Apply fit() then transform()
        """
        if features is None:
            features = self.features

        return self.fit(X, y, features, retrain).transform(X, y)

    def _print_log(self, step, **kwargs):
        """
        Output useful information to stdout and the log file if debugging is required.

        step: Print the corresponding step in the log
        kwargs: dictionary of dataframes to be used in the log
        """
        if step == 1:
            # Log the feature definitions collected in __init__
            if self.ohe:
                sys.stdout.write("Features for one hot encoding: \n{0}\n\n".format(self.ohe_meta))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Features for one hot encoding: \n{0}\n\n".format(self.ohe_meta))
            if self.hash:
                sys.stdout.write("Features for hashing: \n{0}\n\n".format(self.hash_meta))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Features for hashing: \n{0}\n\n".format(self.hash_meta))
            if self.cv:
                sys.stdout.write("Features for count vectorization: \n{0}\n\n".format(self.cv_meta))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Features for count vectorization: \n{0}\n\n".format(self.cv_meta))
            if self.tfidf:
                sys.stdout.write("Features for tfidf vectorization: \n{0}\n\n".format(self.tfidf_meta))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Features for tfidf vectorization: \n{0}\n\n".format(self.tfidf_meta))
            if self.scale:
                sys.stdout.write("Features for scaling: \n{0}\n\n".format(self.scale_meta))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Features for scaling: \n{0}\n\n".format(self.scale_meta))
        elif step == 2:
            # Log the intermediate dataframes produced by fit()
            if self.ohe:
                sys.stdout.write("Fit ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Fit ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head()))
            if self.hash:
                sys.stdout.write("Fit hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Fit hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head()))
            if self.cv:
                sys.stdout.write("Fit cv_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Fit cv_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head()))
            if self.tfidf:
                sys.stdout.write("Fit tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Fit tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head()))
            try:
                if len(kwargs['scale_df']) > 0:
                    sys.stdout.write("Fit scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head()))
                    with open(self.log,'a', encoding='utf-8') as f:
                        f.write("Fit scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head()))
            except AttributeError:
                pass
        elif step == 3:
            # Log the intermediate dataframes produced by transform()
            if self.ohe:
                sys.stdout.write("Transform ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Transform ohe_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['ohe_df'].shape, kwargs['ohe_df'].head()))
            if self.hash:
                sys.stdout.write("Transform hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Transform hash_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['hash_df'].shape, kwargs['hash_df'].head()))
            if self.cv:
                sys.stdout.write("Transform cv_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Transform cv_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['cv_df'].shape, kwargs['cv_df'].head()))
            if self.tfidf:
                sys.stdout.write("Transform tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("Transform tfidf_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['tfidf_df'].shape, kwargs['tfidf_df'].head()))
            try:
                if len(kwargs['scale_df']) > 0:
                    sys.stdout.write("Transform scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head()))
                    with open(self.log,'a', encoding='utf-8') as f:
                        f.write("Transform scale_df shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['scale_df'].shape, kwargs['scale_df'].head()))
            except AttributeError:
                pass
            try:
                sys.stdout.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['X_transform'].shape, kwargs['X_transform'].head()))
                with open(self.log,'a', encoding='utf-8') as f:
                    f.write("X_transform shape:{0}\nSample Data:\n{1}\n\n".format(kwargs['X_transform'].shape, kwargs['X_transform'].head()))
            except AttributeError:
                pass

    @staticmethod
    def hasher(df, col, n_features):
        """
        Hash the unique values in the specified column in the given dataframe, creating n_features.

        Returns a dataframe indexed by the original values, one row per unique value,
        suitable for joining back onto df via join(..., on=col).
        """
        unique = pd.DataFrame(df[col].unique(), columns=[col])
        fh = FeatureHasher(n_features=n_features, input_type="string")
        hashed = fh.fit_transform(unique.loc[:, col])
        unique = unique.join(pd.DataFrame(hashed.toarray()).add_prefix(col))
        return unique.set_index(col)

    @staticmethod
    def text_vectorizer(df, col, type="count", **kwargs):
        """
        Create count vectors using the sklearn TfidfVectorizer or CountVectorizer for the specified column in the given dataframe.
        The type argument can be "tfidf" referring to TfidfVectorizer, anything else defaults to CountVectorizer.

        Returns a dataframe indexed by the original values, one row per unique value.
        """
        unique = pd.DataFrame(df[col].unique(), columns=[col])

        if type == "tfidf":
            v = TfidfVectorizer(**kwargs)
        else:
            v = CountVectorizer(**kwargs)

        vectorized = v.fit_transform(unique.loc[:, col])

        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
        # (replaced by get_feature_names_out()); valid for the pinned sklearn version.
        feature_names = v.get_feature_names()
        col_names = []

        # Prefix each vocabulary term with its index to guarantee unique column names
        for i,j in enumerate(feature_names):
            col_names.append("{}_{}".format(i,j))

        unique = unique.join(pd.DataFrame(vectorized.toarray(), columns=col_names).add_prefix(col+"_"))
        return unique.set_index(col)

    @staticmethod
    def text_similarity(df, col):
        """
        Convert strings to their unicode representation and then apply one hot encoding, creating one feature for each unique character in the column.
        This can be useful when similarity between strings is significant.

        Returns a dataframe indexed by the original values, one row per unique value.
        """
        unique = pd.DataFrame(df[col].unique(), columns=[col])

        # Map each string to the list of its characters' unicode code points
        encoded = pd.DataFrame(unique.loc[:,col].apply(lambda s: [ord(a) for a in s]), index=unique.index)

        # One hot encode the code points; one column per unique character
        mlb = preprocessing.MultiLabelBinarizer()
        encoded = pd.DataFrame(mlb.fit_transform(encoded[col]),columns=mlb.classes_, index=encoded.index).add_prefix(col+"_")

        unique = unique.join(encoded)
        return unique.set_index(col)


class TargetTransformer:
    """
    A class to transform the target variable.
    This class can scale the target using the specified sklearn scaler.
    It can also make the series stationary by differencing the values. Note that this is only valid when predictions will include multiple samples.
    An inverse transform method allows for reversing the transformations.
    """

    def __init__(self, scale=True, make_stationary=None, stationarity_lags=[1], missing="zeros", scaler="StandardScaler", logfile=None, **kwargs):
        """
        Initialize the TargetTransformer instance.
        scale is a boolean parameter to determine if the target will be scaled.
        make_stationary is a parameter to determine if the target will be made stationary. This should only be used for sequential data.
        Passing make_stationary='log' will apply a logarithm to the target and use an exponential for the inverse transform.
        Passing make_stationary='difference' will difference the values to make the target series stationary.
        By default the difference will be done with lag = 1. Alternate lags can be provided by passing a list using the stationarity_lags parameter.
        e.g. stationarity_lags=[1, 12]
        missing determines how missing values are dealt with before the scaling is applied.
        Valid options specified through the missing parameter are: zeros, mean, median, mode
        Valid options for scaler are the scaler classes in sklearn.preprocessing
        Other kwargs are keyword arguments passed to the sklearn scaler instance.
        """
        self.scale = scale
        self.make_stationary = make_stationary
        self.missing = missing
        self.scaler = scaler
        self.logfile = logfile
        # List of lags used for differencing (read-only after construction)
        self.lags = stationarity_lags
        self.kwargs = kwargs

    def fit(self, y):
        """
        Make the data stationary if required.
        Fit the scaler to target values from the training set.

        :param y: training targets; a pandas Series/DataFrame or array-like
        :returns: self
        """
        # Apply a logarithm to make the array stationary
        if self.make_stationary == 'log':
            y = np.log(y)
        # Apply stationarity lags by differencing the array
        elif self.make_stationary == 'difference':
            y_diff = y.copy()
            len_y = len(y_diff)

            for i in range(max(self.lags), len_y):
                for lag in self.lags:
                    if isinstance(y_diff, (pd.Series, pd.DataFrame)):
                        y_diff.iloc[i] = y_diff.iloc[i] - y.iloc[i - lag]
                    else:
                        y_diff[i] = y_diff[i] - y[i - lag]

            # Remove targets with insufficient lag periods
            if isinstance(y_diff, (pd.Series, pd.DataFrame)):
                y = y_diff.iloc[max(self.lags):]
            else:
                y = y_diff[max(self.lags):]

        # Fit the scaler
        if self.scale:
            # Get an instance of the sklearn scaler fit to y
            self.scaler_instance = utils.get_scaler(y, missing=self.missing, scaler=self.scaler, **self.kwargs)

        return self

    def transform(self, y, array_like=True):
        """
        Transform new targets using the previously fit scaler.
        Also apply a logarithm or differencing if required for making the series stationary.
        array_like determines if y is expected to be multiple values or a single value.
        Note that the differencing won't be done if array_like=False.
        """
        y_transform = y

        # Apply a logarithm to make the array stationary
        if self.make_stationary == 'log':
            y_transform = np.log(y)
        # Apply stationarity lags by differencing the array
        elif self.make_stationary == 'difference' and array_like:
            y_diff = y_transform.copy()
            len_y = len(y_diff)

            for i in range(max(self.lags), len_y):
                for lag in self.lags:
                    if isinstance(y_diff, (pd.Series, pd.DataFrame)):
                        y_diff.iloc[i] = y_diff.iloc[i] - y_transform.iloc[i - lag]
                    else:
                        y_diff[i] = y_diff[i] - y_transform[i - lag]

            # Remove targets with insufficient lag periods
            # NOTE: The corresponding samples will need to be dropped at this function call's origin
            if isinstance(y_diff, (pd.Series, pd.DataFrame)):
                y_transform = y_diff.iloc[max(self.lags):]
            else:
                y_transform = y_diff[max(self.lags):]

        # Scale the targets using the previously fit scaler
        if self.scale:
            y_transform_scaled = self.scaler_instance.transform(y_transform)

            # NOTE(review): the scaled values are only assigned back when y_transform is a
            # DataFrame; for Series/ndarray inputs the unscaled values are returned.
            # Confirm callers always pass a DataFrame here.
            if isinstance(y_transform, pd.DataFrame):
                # The scaler returns a numpy array which needs to be converted back to a data frame
                y_transform = pd.DataFrame(y_transform_scaled, columns=y_transform.columns, index=y_transform.index)

        if self.logfile is not None:
            self._print_log(1, data=y_transform, array_like=array_like)

        return y_transform

    def fit_transform(self, y):
        """
        Apply fit then transform
        """
        return self.fit(y).transform(y)

    def inverse_transform(self, y_transform, array_like=True):
        """
        Reverse the transformations and return the target in it's original form.
        array_like determines if y_transform is expected to be multiple values or a single value.
        Note that the differencing won't be done if array_like=False.
        For an inverse of differencing, y_transform needs to include sufficient actual lag values.
        """
        y = y_transform

        # If inversing differencing, we assume sufficient lag values.
        # These should be actual values that have not been scaled previously.
        if self.make_stationary == 'difference' and array_like:
            if isinstance(y_transform, pd.DataFrame):
                y_lags = y_transform.iloc[:max(self.lags)]
                y = y_transform.iloc[max(self.lags):]
            else:
                y_lags = y_transform[:max(self.lags)]
                y = y_transform[max(self.lags):]

        # Inverse the scaling for non-lag values
        if self.scale:
            if isinstance(y, pd.DataFrame):
                y = self.scaler_instance.inverse_transform(np.reshape(y.values, (-1, 1)))
                # The scaler returns a numpy array which needs to be converted back to a data frame
                # NOTE(review): at this point y is a numpy array, so y.columns / y.index below
                # would raise AttributeError - the original frame's columns/index appear to be
                # intended here. Confirm whether this branch is ever exercised.
                y = pd.DataFrame(y, columns=y.columns, index=y.index)
            else:
                y = self.scaler_instance.inverse_transform(np.reshape(y, (-1, 1)))
                # Flatten the reshaping done above
                y = y.ravel()

        # Apply an exponential to reverse the logarithm applied during transform
        if self.make_stationary == 'log':
            y = np.exp(y)
        # Reverse the differencing applied during transform
        # NOTE: y_transform will need to include actual lag values
        elif self.make_stationary == 'difference' and array_like:
            if isinstance(y, (pd.Series, pd.DataFrame)):
                y = pd.concat([y_lags, y])
            else:
                y = np.append(y_lags, y)

            len_y = len(y)

            for i in range(max(self.lags), len_y):
                for lag in self.lags:
                    if isinstance(y, (pd.Series, pd.DataFrame)):
                        y.iloc[i] = y.iloc[i] + y.iloc[i - lag]
                    else:
                        y[i] = y[i] + y[i - lag]

        if self.logfile is not None:
            self._print_log(2, data=y, array_like=array_like)

        return y

    def _print_log(self, step, data=None, array_like=True):
        """
        Print debug info to the log
        """
        # Set mode to append to log file
        mode = 'a'
        output = ''

        if step == 1:
            # Output the transformed targets
            output = "Targets transformed"
        elif step == 2:
            # Output sample data after adding lag observations
            output = "Targets inverse transformed"

        if array_like:
            output += " {0}:\nSample Data:\n{1}\n\n".format(data.shape, data.head())
        else:
            output += " {0}".format(data)

        sys.stdout.write(output)
        with open(self.logfile, mode, encoding='utf-8') as f:
            f.write(output)


class Reshaper(TransformerMixin):
    """
    A class that reshapes the feature matrix based on the
input_shape. This class is built for Keras estimators where recurrent and convolutional layers can require 3D or 4D inputs. It is meant to be used after preprocessing and before fitting the estimator. """ def __init__(self, first_layer_kwargs=None, logfile=None, **kwargs): """ Initialize the Reshaper with the Keras model first layer's kwargs. Additionally take in the number of lag observations to be used in reshaping the data. If lag_target is True, an additional feature will be created for each sample i.e. the previous value of y. first_layer_kwargs should be a reference to the first layer kwargs of the Keras architecture being used to build the model. Optional arguments are a logfile to output debug info. """ self.first_layer_kwargs = first_layer_kwargs self.logfile = logfile def fit(self, X, y=None): """ Update the input shape based on the number of samples in X. Return this Reshaper object. """ # Create the input_shape property as a list self.input_shape = list(self.first_layer_kwargs['input_shape']) # Debug information is printed to the terminal and logs if required if self.logfile: self._print_log(1) return self def transform(self, X, y=None): """ Apply the new shape to the data provided in X. X is expected to be a 2D DataFrame of samples and features. The data will be reshaped according to self.input_shape. """ # Add the number of samples to the input_shape input_shape = self.input_shape.copy() input_shape.insert(0, X.shape[0]) # Debug information is printed to the terminal and logs if required if self.logfile: self._print_log(2, data=input_shape) # If the final shape is n_samples by n_features we have nothing to do here if (len(input_shape) == 2): return X # 2D, 3D and 4D data is valid. # e.g. The input_shape can be a tuple of (samples, subsequences, timesteps, features), with subsequences and timesteps as optional. # A 5D shape may be valid for e.g. 
a ConvLSTM with (samples, timesteps, rows, columns, features) if len(input_shape) > 5: err = "Unsupported input_shape: {}".format(input_shape) raise Exception(err) # Reshape the data elif len(input_shape) > 2: # Reshape input data using numpy X_transform = X.values.reshape(input_shape) # Debug information is printed to the terminal and logs if required if self.logfile: self._print_log(3, data=X_transform) return X_transform def fit_transform(self, X, y=None): """ Apply fit() then transform() """ return self.fit(X, y).transform(X, y) def _print_log(self, step, data=None): """ Print debug info to the log """ # Set mode to append to log file mode = 'a' if step == 1: # Output the updated input shape output = "Input shape specification for Keras: {0}\n\n".format(self.first_layer_kwargs['input_shape']) elif step == 2: # Output the updated input shape output = "{0} samples added to the input shape. Data will be reshaped to: {1}\n\n".format(data[0], tuple(data)) elif step == 3: # Output sample data after reshaping output = "Input data reshaped to {0}.\nSample Data:\n{1}\n\n".format(data.shape, data[:5]) sys.stdout.write(output) with open(self.logfile, mode, encoding='utf-8') as f: f.write(output) class KerasClassifierForQlik(KerasClassifier): """ A subclass of the KerasClassifier Scikit-Learn wrapper. This class takes in a compiled Keras model as part of sk_params and uses the __call__ method as the default build_fn. It also stores a histories dataframe to provide metrics for each time the model is fit. """ def __init__(self, **sk_params): """ Initialize the KerasClassifierForQlik. The compiled Keras model should be included in sk_params under the 'neural_net' keyword argument. 
""" # Assign the parameters to a class variable self.sk_params = sk_params # Set build_fn to the function supplied in sk_params self.build_fn = self.sk_params.pop('build_fn') # DataFrame to contain history of every training cycle # This DataFrame will provide metrics such as loss for each run of the fit method # Columns will be ['iteration', 'epoch', 'loss'] and any other metrics being calculated during training self.histories = pd.DataFrame() self.iteration = 0 # Check the parameters using the super class method self.check_params(self.sk_params) def get_params(self, **params): """Gets parameters for this estimator. # Arguments **params: ignored (exists for API compatibility). # Returns Dictionary of parameter names mapped to their values. """ res = self.sk_params # Add back the Keras build function that was popped out of sk_params res.update({'build_fn': self.build_fn}) return res def fit(self, x, y, sample_weight=None, **kwargs): """ Call the super class' fit method and store metrics from the history. Also cater for multi-step predictions. """ # Match the samples to the targets. # x and y can be out of sync due to dropped samples in the Reshaper transformer. if len(y) > len(x): y = y[len(y)-len(x):] # Fit the model to the data and store information on the training history = super().fit(x, y, sample_weight, **kwargs) sys.stdout.write("\n\nKeras Model Summary:\n") self.model.summary() sys.stdout.write("\n\n") # Set up a data frame with the epochs and a counter to track multiple histories history_df = pd.DataFrame({'iteration': self.iteration+1, 'epoch': history.epoch}) # Add a column per metric for each epoch e.g. 
loss, acc for key in history.history: history_df[key] = pd.Series(history.history[key]) # Concatenate results from the training to the history data frame self.histories = pd.concat([self.histories, history_df], sort=True).sort_values(by=['iteration', 'epoch']).reset_index(drop=True) self.iteration += 1 return history class KerasRegressorForQlik(KerasRegressor): """ A subclass of the KerasRegressor Scikit-Learn wrapper. This class takes in a compiled Keras model as part of sk_params and uses the __call__ method as the default build_fn. It also stores a histories dataframe to provide metrics for each time the model is fit. """ def __init__(self, **sk_params): """ Initialize the KerasRegressorForQlik. The compiled Keras model should be included in sk_params under the 'neural_net' keyword argument. """ # Assign the parameters to a class variable self.sk_params = sk_params # Set build_fn to the function supplied in sk_params self.build_fn = self.sk_params.pop('build_fn') # DataFrame to contain history of every training cycle # This DataFrame will provide metrics such as loss for each run of the fit method # Columns will be ['iteration', 'epoch', 'loss'] and any other metrics being calculated during training self.histories = pd.DataFrame() self.iteration = 0 # Check the parameters using the super class method self.check_params(self.sk_params) def get_params(self, **params): """ Gets parameters for this estimator. Overrides super class method for compatibility with sklearn cross_validate. """ res = self.sk_params # Add back the Keras build function that was popped out of sk_params res.update({'build_fn': self.build_fn}) return res def fit(self, x, y, **kwargs): """ Call the super class' fit method and store metrics from the history. Also cater for multi-step predictions. """ # Match the samples to the targets. # x and y can be out of sync due to dropped samples in the Reshaper transformer. 
if len(y) > len(x): y = y[len(y)-len(x):] # Fit the model to the data and store information on the training history = super().fit(x, y, **kwargs) sys.stdout.write("\n\nKeras Model Summary:\n") self.model.summary() sys.stdout.write("\n\n") # Set up a data frame with the epochs and a counter to track multiple histories history_df = pd.DataFrame({'iteration': self.iteration+1, 'epoch': history.epoch}) # Add a column per metric for each epoch e.g. loss for key in history.history: history_df[key] = pd.Series(history.history[key]) # Concatenate results from the training to the history data frame self.histories = pd.concat([self.histories, history_df], sort=True).sort_values(by=['iteration', 'epoch']).reset_index(drop=True) self.iteration += 1 return history