import pandas as pd
import numpy as np
import matplotlib
from matplotlib import cm
import matplotlib.pyplot as plt
import h2o
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator

# Location of the ECG sample used throughout this script.
ECG_DATA_URL = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/anomaly/ecg_discord_test.csv"

# Start (or attach to) H2O on the local machine, then load the data into it.
h2o.init()
ecg_data = h2o.import_file(ECG_DATA_URL)

# Quick sanity check of what was loaded: dimensions, column types, first rows.
print(ecg_data.shape)
print(ecg_data.types)
print(ecg_data.head())

# The model is fitted on the first 20 rows and evaluated on the first 23 rows,
# so the evaluation frame also contains every training row.
# NOTE(review): presumably 23 is the full length of the file and the last
# three rows are the anomalous heartbeats -- confirm against the data set.
train_ecg = ecg_data[:20, :]
test_ecg = ecg_data[:23, :]


def plot_stacked_time_series(df, title):
    """Draw every row of *df* as its own line on one shared chart.

    Args:
        df: pandas DataFrame whose rows are the individual series. It is
            expected to carry an unnamed index, because the rows are
            regrouped through the ``level_0`` column that ``reset_index``
            generates for an unnamed first index level.
        title: Title shown above the chart.

    The chart is displayed with ``plt.show()``; nothing is returned.
    """
    # Long format: one record per (row label, column label, value), where the
    # value column is named 0.
    long_form = df.stack().reset_index()

    # Collect the values of each original row, numbering the rows 0, 1, 2, ...
    # in the (sorted) order in which groupby yields them.
    series_by_position = {}
    for position, (_, row_records) in enumerate(long_form.groupby('level_0')):
        series_by_position[position] = row_records[0].values

    # Each collected row becomes one column, i.e. one plotted line.
    pd.DataFrame(series_by_position).plot(title=title)
    # Anchor the legend just outside the right edge of the axes.
    plt.legend(bbox_to_anchor=(1.05, 1))
    plt.show()


# Pull the whole H2O frame into pandas and plot all heartbeats on one chart.
plot_stacked_time_series(df=ecg_data.as_data_frame(), title="ECG data set")


def plot_bidimensional(model, test, recon_error, layer, title):
    """Scatter-plot two deep features of a hidden layer, coloured by error.

    The rows of *test* are passed through ``model.deepfeatures`` for the
    requested layer, column-bound with *recon_error*, converted to a pandas
    DataFrame and drawn as a scatter plot. Every point is annotated with its
    row label so individual samples can be identified.

    Args:
        model: Trained model exposing ``deepfeatures(frame, layer)``.
        test: Frame whose rows are projected through the model.
        recon_error: Frame providing a ``Reconstruction.MSE`` column, which
            drives the colour of each point.
        layer: Index of the hidden layer to extract. The extracted columns
            are looked up as ``DF.L<layer + 1>.C1`` and ``DF.L<layer + 1>.C2``,
            so the layer must expose at least two features.
        title: Title shown above the chart.

    The chart is displayed with ``plt.show()``; nothing is returned.
    """
    bidimensional_data = model.deepfeatures(test, layer).cbind(recon_error).as_data_frame()

    # Build the two feature column names once and reuse them for both the
    # scatter plot and the annotations.
    column_prefix = 'DF.L{}.C'.format(layer + 1)
    x_column = column_prefix + '1'
    y_column = column_prefix + '2'

    fig, ax = plt.subplots()
    # The colormap is passed by name: ``matplotlib.cm.get_cmap`` was deprecated
    # in matplotlib 3.7 and removed in 3.9, while pandas resolves a colormap
    # name itself on every matplotlib version.
    bidimensional_data.plot(kind='scatter',
                            x=x_column,
                            y=y_column,
                            s=500,
                            c='Reconstruction.MSE',
                            title=title,
                            ax=ax,
                            colormap='Spectral')

    # Label each point with its row label, anchored at the point itself.
    for label, point in bidimensional_data[[x_column, y_column]].iterrows():
        ax.annotate(str(label), (point[x_column], point[y_column]), size=20,
                    verticalalignment='bottom', horizontalalignment='left')
    fig.canvas.draw()
    plt.show()


# Reference model: a Tanh autoencoder with a 2-unit bottleneck in the middle of
# hidden layers sized 50-20-2-20-50. The seed is fixed and ``reproducible`` is
# switched on so that repeated runs can be compared.
seed = 13
anomaly_model = H2OAutoEncoderEstimator(
    activation="Tanh",
    hidden=[50, 20, 2, 20, 50],
    epochs=100,
    # Optional settings left disabled:
    # sparse=True,
    # l1=1e-5,
    seed=seed,
    reproducible=True)

# Every column of the training frame is used as an input feature.
anomaly_model.train(
    x=train_ecg.names,
    training_frame=train_ecg
)

# Per-row reconstruction error on the evaluation frame, plotted against the
# two features of hidden layer index 2 (the 2-unit bottleneck).
recon_error = anomaly_model.anomaly(test_ecg)
plot_bidimensional(anomaly_model, test_ecg, recon_error, 2, "2D representation of data points seed {}".format(seed))

# plot_stacked_time_series(anomaly_model.predict(ecg_data).as_data_frame(), "Reconstructed test set")

print(anomaly_model)

# Reconstruction error per sample. The axes of the freshly created figure are
# handed to pandas explicitly; without ``ax`` DataFrame.plot opens a figure of
# its own and the one created here stays empty.
plt.figure()
df = recon_error.as_data_frame(use_pandas=True)
df["sample_index"] = df.index
df.plot(kind="scatter", x="sample_index", y="Reconstruction.MSE",
        title="reconstruction error", s=500, ax=plt.gca())
# Show the chart now instead of leaving it pending until a later plt.show().
plt.show()

# Bare expressions only display their value in a notebook; when run as a
# script the results were silently discarded, so they are printed instead.
print(len(recon_error))

# Features of hidden layer index 1 for the training rows.
print(anomaly_model.deepfeatures(train_ecg, 1).as_data_frame())  # .plot(kind='scatter', x='DF.L2.C1', y='DF.L2.C2')

# Train the same architecture with seeds 1 through 5 and plot the 2D bottleneck
# representation obtained for each seed.
# Note: ``seed``, ``model`` and ``recon_error`` are module-level names, so they
# keep the values of the last iteration once the loop has finished.
for seed in range(1, 6):
    # Optional settings left disabled: sparse=True, l1=1e-5
    model = H2OAutoEncoderEstimator(activation="Tanh",
                                    hidden=[50, 20, 2, 20, 50],
                                    epochs=100,
                                    seed=seed,
                                    reproducible=True)
    model.train(x=train_ecg.names, training_frame=train_ecg)

    recon_error = model.anomaly(test_ecg)
    plot_bidimensional(model=model,
                       test=test_ecg,
                       recon_error=recon_error,
                       layer=2,
                       title="2D representation of data points seed {}".format(seed))
    # TODO: compute the average and variance of the 2 dimensions

# Fit the architecture once more with seed 1 and print the resulting
# 2D coordinates together with the reconstruction error as a table.
# Optional settings left disabled: sparse=True, l1=1e-5
model = H2OAutoEncoderEstimator(activation="Tanh",
                                hidden=[50, 20, 2, 20, 50],
                                epochs=100,
                                seed=1,
                                reproducible=True)
model.train(x=train_ecg.names, training_frame=train_ecg)

recon_error = model.anomaly(test_ecg)
# Features of hidden layer index 2, side by side with the per-row error,
# converted to a pandas DataFrame for printing.
bidimensional_data = (
    model.deepfeatures(test_ecg, 2)
    .cbind(recon_error)
    .as_data_frame()
)
print(bidimensional_data)