python source code of autoencoder

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from sklearn.metrics import confusion_matrix
from tensorflow.python.keras import regularizers

import input_data
import inputs
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed, Conv1D, MaxPooling1D, UpSampling1D
from tensorflow.keras.models import Model
import tensorflow.keras.optimizers as opt
import configuration

# -- Parameters --
DIM = "1d"
LABELS = ["Normal", "Smell"]

TOKENIZER_OUT_PATH = "/users/pa18/tushar/smellDetectionML/data/tokenizer_out/"
OUT_FOLDER = "/users/pa18/tushar/smellDetectionML/learning_smells/results/rq1/raw"
# TOKENIZER_OUT_PATH = r"..\..\data\tokenizer_out"
# OUT_FOLDER = r"..\results\rq1\raw"
TRAIN_VALIDATE_RATIO = 0.7


# --


def autoencoder_dense(data, smell, layers=1, encoding_dimension=32, epochs=10, with_bottleneck=True, is_final=False, threshold=400000):
    encoding_dim = encoding_dimension
    input_layer = Input(shape=(data.max_input_length,))
    no_of_layers = layers
    prev_layer = input_layer
    for i in range(no_of_layers):
        encoder = Dense(int(encoding_dim / pow(2, i)), activation="relu",
                        activity_regularizer=regularizers.l1(10e-3))(prev_layer)
        prev_layer = encoder
    # bottleneck
    if with_bottleneck:
        prev_layer = Dense(int(encoding_dim / pow(2, no_of_layers)), activation="relu")(prev_layer)
    for j in range(no_of_layers - 1, -1, -1):
        decoder = Dense(int(encoding_dim / pow(2, j)), activation='relu')(prev_layer)
        prev_layer = decoder
    prev_layer = Dense(data.max_input_length, activation='relu')(prev_layer)
    autoencoder = Model(inputs=input_layer, outputs=prev_layer)

    autoencoder.compile(optimizer='adam',
                        loss='mean_squared_error',
                        metrics=['accuracy'])
    autoencoder.summary()

    batch_sizes = [32, 64, 128]
    # batch_sizes = [32, 64, 128, 256, 512]
    b_size = int(len(data.train_data) / batch_sizes[len(batch_sizes) - 1])
    if b_size > len(batch_sizes) - 1:
        b_size = len(batch_sizes) - 1

    val_split = 0.2
    if is_final:
        val_split = 0
    history = autoencoder.fit(data.train_data,
                              data.train_data,
                              epochs=epochs,
                              # batch_size=batch_size,
                              batch_size=batch_sizes[b_size],
                              verbose=1,
                              validation_split=val_split,
                              shuffle=True).history

    # plt.plot(history['loss'])
    # plt.plot(history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper right')
    # plt.show()

    predictions = autoencoder.predict(data.eval_data)
    mse = np.mean(np.power(data.eval_data - predictions, 2), axis=1)
    error_df = pd.DataFrame({'Reconstruction_error': mse,
                             'True_class': data.eval_labels})
    # print(error_df.describe())
    if is_final:
        return find_metrics(error_df, threshold)
    else:
        return find_optimal(error_df)


def autoencoder_cnn(data, config):
    data.train_data = data.train_data.reshape((len(data.train_data), data.max_input_length, 1))
    data.eval_data = data.eval_data.reshape((len(data.eval_labels), data.max_input_length, 1))
    # print("train_data shape: " + str(data.train_data.shape))

    input_layer = Input(shape=(data.max_input_length, 1))
    prev_layer = input_layer
    for i in range(config.layers):
        encoder = Conv1D(int(config.filters / pow(2, i)), config.kernel,
                         activation="relu", #input_shape=(data.max_input_length, 1),
                         padding='same',
                         kernel_initializer='random_uniform')(prev_layer)
        prev_layer = MaxPooling1D((config.pooling_window), strides=config.pooling_window)(encoder)

    # bottleneck
    # prev_layer = Conv1D(int(config.filters / pow(2, config.layers)), config.kernel, activation="relu",
    #                         kernel_initializer='random_uniform')(prev_layer)
    # prev_layer = MaxPooling1D((config.pooling_window), strides=2)(prev_layer)

    # decoder
    for j in range(config.layers - 1, -1, -1):
        prev_layer = Conv1D(int(config.filters / pow(2, j)), config.kernel,
                            padding='same',
                            activation="relu",
                            kernel_initializer='random_uniform')(prev_layer)
        prev_layer = UpSampling1D((config.pooling_window))(prev_layer)
    prev_layer = Dense(1, activation='relu')(prev_layer)
    autoencoder = Model(inputs=input_layer, outputs=prev_layer)

    autoencoder.compile(optimizer='adam',
                        loss='mean_squared_error',
                        metrics=['accuracy'])
    autoencoder.summary()

    # batch_sizes = [32, 64, 128]
    batch_sizes = [32, 64, 128, 256, 512]
    b_size = int(len(data.train_data) / batch_sizes[len(batch_sizes) - 1])
    if b_size > len(batch_sizes) - 1:
        b_size = len(batch_sizes) - 1
    history = autoencoder.fit(data.train_data,
                              data.train_data,
                              epochs=config.epochs,
                              # batch_size=batch_size,
                              batch_size=batch_sizes[b_size],
                              verbose=1,
                              validation_split=0.2,
                              shuffle=True).history

    # plt.plot(history['loss'])
    # plt.plot(history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper right')
    # plt.show()

    predictions = autoencoder.predict(data.eval_data)
    predictions = predictions.reshape(predictions.shape[0], predictions.shape[1])
    data.eval_data = data.eval_data.reshape(data.eval_data.shape[0], data.eval_data.shape[1])
    mse = np.mean(np.power(data.eval_data - predictions, 2), axis=1)
    error_df = pd.DataFrame({'Reconstruction_error': mse,
                             'True_class': data.eval_labels})
    # print(error_df.describe())
    return find_optimal(error_df)


def find_metrics(error_df, threshold):
    y_pred = [1 if e > threshold else 0 for e in error_df.Reconstruction_error.values]
    conf_matrix = confusion_matrix(error_df.True_class, y_pred)
    precision, recall, f1 = compute_metrics(conf_matrix)
    return threshold, precision, recall, f1


# The following code figures out the optimal threshold
def find_optimal(error_df):
    optimal_threshold = 1000
    max_f1 = 0
    max_pr = 0
    max_re = 0
    for threshold in range(1000, 400000, 5000):
        print("Threshold: " + str(threshold))
        y_pred = [1 if e > threshold else 0 for e in error_df.Reconstruction_error.values]
        conf_matrix = confusion_matrix(error_df.True_class, y_pred)
        precision, recall, f1 = compute_metrics(conf_matrix)
        if f1 > max_f1:
            max_f1 = f1
            optimal_threshold = threshold
            max_pr = precision
            max_re = recall
    return optimal_threshold, max_pr, max_re, max_f1


def autoencoder_lstm(data, smell, layers=1, encoding_dimension=8, no_of_epochs=10, with_bottleneck=True,
                     is_final=False):
    data.train_data = data.train_data.reshape((len(data.train_data), data.max_input_length, 1))
    data.eval_data = data.eval_data.reshape((len(data.eval_labels), data.max_input_length, 1))

    encoding_dim = encoding_dimension
    input_layer = Input(shape=(data.max_input_length, 1))
    # input_layer = BatchNormalization()(input_layer)
    no_of_layers = layers
    prev_layer = input_layer
    for i in range(no_of_layers):
        encoder = LSTM(int(encoding_dim / pow(2, i)),
                       # activation="relu",
                       return_sequences=True,
                       recurrent_dropout=0.1,
                       dropout=0.1)(prev_layer)
        prev_layer = encoder
    # bottleneck
    if with_bottleneck:
        prev_layer = LSTM(int(encoding_dim / pow(2, no_of_layers + 1)),
                          # activation="relu",
                          return_sequences=True,
                          recurrent_dropout=0.1,
                          dropout=0.1)(prev_layer)
    for j in range(no_of_layers - 1, -1, -1):
        decoder = LSTM(int(encoding_dim / pow(2, j)),
                       # activation='relu',
                       return_sequences=True,
                       recurrent_dropout=0.1,
                       dropout=0.1)(prev_layer)
        prev_layer = decoder
    prev_layer = TimeDistributed(Dense(1))(prev_layer)
    autoencoder = Model(inputs=input_layer, outputs=prev_layer)

    autoencoder.compile(optimizer='adam',
                        loss='mean_squared_error',
                        metrics=['accuracy'])
    autoencoder.summary()

    # batch_sizes = [32, 64, 128, 256, 512]
    batch_sizes = [32, 64]
    b_size = int(len(data.train_data) / 512)
    if b_size > len(batch_sizes) - 1:
        b_size = len(batch_sizes) - 1
    history = autoencoder.fit(data.train_data,
                              data.train_data,
                              epochs=no_of_epochs,
                              batch_size=batch_sizes[b_size],
                              verbose=1,
                              validation_split=0.2,
                              shuffle=True).history

    # plt.plot(history['loss'])
    # plt.plot(history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper right')
    # plt.show()

    predictions = autoencoder.predict(data.eval_data)
    predictions = predictions.reshape(predictions.shape[0], predictions.shape[1])
    data.eval_data = data.eval_data.reshape(data.eval_data.shape[0], data.eval_data.shape[1])
    mse = np.mean(np.power(data.eval_data - predictions, 2), axis=1)
    error_df = pd.DataFrame({'Reconstruction_error': mse,
                             'True_class': data.eval_labels})
    print(error_df.describe())
    return find_optimal(error_df)


def compute_metrics(conf_matrix):
    precision = conf_matrix[1][1] / (conf_matrix[1][1] + conf_matrix[0][1])
    recall = conf_matrix[1][1] / (conf_matrix[1][1] + conf_matrix[1][0])
    f1 = (2 * precision * recall) / (precision + recall)
    print("precision: " + str(precision) + ", recall: " + str(recall) + ", f1: " + str(f1))
    return precision, recall, f1


def get_all_data(data_path, smell):
    print("reading data...")
    max_eval_samples = 150000
    if smell in ["MultifacetedAbstraction", "FeatureEnvy"]:
        max_eval_samples = 50000

    train_data, eval_data, eval_labels, max_input_length = \
        inputs.get_data_autoencoder(data_path,
                                    train_validate_ratio=TRAIN_VALIDATE_RATIO,
                                    max_training_samples=5000,
                                    max_eval_samples=max_eval_samples,
                                    )
    print("nan count: " + str(np.count_nonzero(np.isnan(train_data))))
    print("train_data: " + str(len(train_data)))
    print("train_data shape: " + str(train_data.shape))
    print("eval_data: " + str(len(eval_data)))
    print("eval_labels: " + str(len(eval_labels)))
    print("reading data... done.")
    return input_data.Input_data(train_data, None, eval_data, eval_labels, max_input_length)


def write_result(file, str):
    f = open(file, "a+")
    f.write(str)
    f.close()


def get_out_file(smell, model):
    now = datetime.datetime.now()
    if not os.path.exists(OUT_FOLDER):
        os.makedirs(OUT_FOLDER)
    return os.path.join(OUT_FOLDER, "ae_rq1_" + smell + "_" + model + "_"
                        + str(now.strftime("%d%m%Y_%H%M") + ".csv"))


def main_lstm(smell, data_path, skip_iter=-1):
    input_data = get_all_data(data_path, smell)
    layers = [1, 2]
    encoding_dim = [8, 16, 32]
    epochs = 10
    outfile = get_out_file(smell, "rnn")
    write_result(outfile,
                 "units,threshold,epoch,bottleneck,layer,precision,recall,f1,time\n")
    cur_iter = 1
    for layer in layers:
        for bottleneck in [True]:
            for encoding in encoding_dim:
                if cur_iter <= skip_iter:
                    cur_iter += 1
                    continue
                cur_iter += 1
                write_result(outfile, "processing layer " + str(layer) + " encoding " + str(encoding))
                start_time = time.time()
                # optimal_threshold, max_pr, max_re, max_f1 = autoencoder(input_data, smell, layers=layer,
                #                                                         with_bottleneck=bottleneck)
                try:
                    optimal_threshold, max_pr, max_re, max_f1 = autoencoder_lstm(input_data, smell, layers=layer,
                                                                                 encoding_dimension=encoding,
                                                                                 no_of_epochs=epochs,
                                                                                 with_bottleneck=bottleneck)
                except ValueError as error:
                    print(error)
                    optimal_threshold = -1
                    max_pr = -1
                    max_re = -1
                    max_f1 = -1
                end_time = time.time()
                time_taken = end_time - start_time
                write_result(outfile,
                             str(encoding) + "," + str(optimal_threshold) + "," + str(epochs) + "," + str(bottleneck)
                             + "," + str(layer) + "," +
                             str(max_pr) + "," + str(max_re) + "," + str(max_f1) + "," + str(time_taken) + "\n")


def main_dense(smell, data_path, max_encoding_dim=1024):
    input_data = get_all_data(data_path, smell)
    layers = [1, 2]
    # batch_sizes = [32, 64, 128, 256, 512]
    max_encoding_dimension = min(max_encoding_dim, input_data.max_input_length)
    encoding_dim = [int(max_encoding_dimension / 4), int(max_encoding_dimension / 2), int(max_encoding_dimension)]
    epochs = 20
    outfile = get_out_file(smell, "dense")
    write_result(outfile,
                 "Encoding_dim,threshold,epoch,bottleneck,layer,precision,recall,f1,time\n")
    for layer in layers:
        for bottleneck in [True]:
            for encoding in encoding_dim:
                # for batch_size in batch_sizes:
                start_time = time.time()
                try:
                    optimal_threshold, max_pr, max_re, max_f1 = autoencoder_dense(input_data, smell, layers=layer,
                                                                                  epochs=epochs,
                                                                                  # batch_size=batch_size,
                                                                                  encoding_dimension=encoding,
                                                                                  with_bottleneck=bottleneck)
                except ValueError as error:
                    print(error)
                    optimal_threshold = -1
                    max_pr = -1
                    max_re = -1
                    max_f1 = -1
                end_time = time.time()
                time_taken = end_time - start_time
                write_result(outfile,
                             str(encoding) + "," + str(optimal_threshold) + "," + str(
                                 epochs) + "," + str(bottleneck) + "," + str(layer) + "," +
                             str(max_pr) + "," + str(max_re) + "," + str(max_f1) + "," + str(time_taken) + "\n")


def main_cnn(smell, data_path, max_encoding_dim=1024):
    input_data = get_all_data(data_path, smell)
    filters = [8, 16, 32, 64]
    kernels = [5, 7, 11]
    pooling_windows = [2, 3, 4, 5]
    layers = [1, 2]
    epochs = 20
    outfile = get_out_file(smell, "cnn")
    write_result(outfile, "conv_layers,filters,kernel,max_pooling_window,epochs,precision,recall,f1,time\n")
    for layer in layers:
        for filter in filters:
            for kernel in kernels:
                for pooling_window in pooling_windows:
                    config = configuration.CNN_config(layer, filter, kernel, pooling_window, epochs)
                    start_time = time.time()
                    try:
                        optimal_threshold, max_pr, max_re, max_f1 = autoencoder_cnn(input_data, config)
                    except ValueError as error:
                        print(error)
                        optimal_threshold = -1
                        max_pr = -1
                        max_re = -1
                        max_f1 = -1
                    end_time = time.time()
                    time_taken = end_time - start_time
                    write_result(outfile,
                                 str(layer) + "," + str(filter) + "," + str(
                                     kernel) + "," + str(pooling_window) + "," + str(epochs) + "," +
                                 str(max_pr) + "," + str(max_re) + "," + str(max_f1) + "," + str(time_taken) + "\n")


def main_dense_with_best_params(smell, input_data, layer, epochs=20, encoding=1024, bottleneck=True, threshold=400000):
    # layers = [1, 2]
    # batch_sizes = [32, 64, 128, 256, 512]
    # max_encoding_dimension = min(max_encoding_dim, input_data.max_input_length)
    # encoding_dim = [int(max_encoding_dimension / 4), int(max_encoding_dimension / 2), int(max_encoding_dimension)]
    # epochs = 20
    outfile = get_out_file(smell, "dense_final")
    write_result(outfile,
                 "Encoding_dim,threshold,epoch,bottleneck,layer,precision,recall,f1,time\n")
    # for layer in layers:
    #     for bottleneck in [True]:
    #         for encoding in encoding_dim:
                # for batch_size in batch_sizes:
    start_time = time.time()
    try:
        optimal_threshold, max_pr, max_re, max_f1 = autoencoder_dense(input_data, smell, layers=layer,
                                                                      epochs=epochs,
                                                                      # batch_size=batch_size,
                                                                      encoding_dimension=encoding,
                                                                      with_bottleneck=bottleneck,
                                                                      threshold=threshold,
                                                                      is_final=True)
    except ValueError as error:
        print(error)
        optimal_threshold = -1
        max_pr = -1
        max_re = -1
        max_f1 = -1
    end_time = time.time()
    time_taken = end_time - start_time
    write_result(outfile,
                 str(encoding) + "," + str(optimal_threshold) + "," + str(
                     epochs) + "," + str(bottleneck) + "," + str(layer) + "," +
                 str(max_pr) + "," + str(max_re) + "," + str(max_f1) + "," + str(time_taken) + "\n")


def run_final():
    smell = "ComplexMethod"
    data_path1 = os.path.join(os.path.join(TOKENIZER_OUT_PATH, smell), DIM)
    input_data1 = get_all_data(data_path1, smell)
    main_dense_with_best_params(smell, input_data=input_data1, layer=1,
                                epochs=20, encoding=32, bottleneck=True, threshold=319000)

    smell = "ComplexConditional"
    data_path2 = os.path.join(os.path.join(TOKENIZER_OUT_PATH, smell), DIM)
    input_data2 = get_all_data(data_path2, smell)
    main_dense_with_best_params(smell, input_data=input_data2, layer=1,
                                epochs=20, encoding=16, bottleneck=True, threshold=328000)

    smell = "FeatureEnvy"
    data_path3 = os.path.join(os.path.join(TOKENIZER_OUT_PATH, smell), DIM)
    input_data3 = get_all_data(data_path3, smell)
    main_dense_with_best_params(smell, input_data=input_data3, layer=2,
                                epochs=20, encoding=16, bottleneck=True, threshold=325000)

    smell = "MultifacetedAbstraction"
    data_path4 = os.path.join(os.path.join(TOKENIZER_OUT_PATH, smell), DIM)
    input_data4 = get_all_data(data_path4, smell)
    main_dense_with_best_params(smell, input_data=input_data4, layer=1,
                                epochs=20, encoding=16, bottleneck=True, threshold=328000)


if __name__ == "__main__":
    smell_list = ["ComplexConditional", "ComplexMethod", "MultifacetedAbstraction", "FeatureEnvy"]
    # smell_list = ["FeatureEnvy"]
    # smell_list = ["ComplexConditional"]
    # smell_list = ["ComplexMethod"]

    # for smell in smell_list:
    #     data_path = os.path.join(TOKENIZER_OUT_PATH, smell, DIM)
    #     # main_lstm(smell, data_path)
    #     main_dense(smell, data_path, max_encoding_dim=1024)
    #     main_cnn(smell, data_path, max_encoding_dim=1024)
    run_final()