import os, csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from Stock_Prediction_Base import base_model
from Stock_Prediction_Data_Processing import get_all_stocks_feature_data, preprocessing_data, kmeans_claasification

class random_forrest_model(base_model):
    # CROSS VALIDATION : Compute accuracy of a model ##############################
    ## Inputs : X_train, y_train, number of folds, number of trees, max of features
    ## Output : Accuracy of classifier
    def perform_CV(self, X_train, y_train, number_folds, n, m):
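        # k-fold cross-validation: cross_val_score clones the classifier, fits it on
        # each fold and returns one accuracy per fold; the mean across folds is reported.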
        model = RandomForestClassifier(n_estimators=n, max_features=m, n_jobs=8, verbose=self.paras.verbose)
        acc = np.mean(cross_val_score(model, X_train, y_train, cv=number_folds))
        #print 'Size of Forrest : number of trees : ' + str(n) + ', maximum of features : ' + str(m) + '. Accuracy : ' + str(acc)
        return acc

    # MODEL SELECTION : Find best parameters ######################################
    ## Inputs :  X_train, y_train, number of folds, range of number of trees, range of max of features
    ## Outputs : optimal number of trees, optimal max of features, accuracy  
    def best_forrest(self, X_train, y_train, number_folds, t1, t2, f1, f2):  
        # Initialize parameters (the tree count uses the same 16x scaling as the search below)
        t_opt = 16 * t1
        f_opt = f1
        accur_opt = 0.
        #x_n = []; y_m= []; z_accu = []
        
        # Find best forest 
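        # NOTE: the grid index t is scaled by 16 below, so the search actually covers
        # 16*t1 .. 16*t2 trees in steps of 16.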
        for t in range(t1, t2 + 1):
            for f in range(f1, f2 + 1):
                t_ = 16 * t
                accur = self.perform_CV(X_train, y_train, number_folds, t_, f)
                if accur > accur_opt:
                    t_opt, f_opt, accur_opt = t_, f, accur
                #x_n.append(n), y_m.append(m), z_accu.append(accur)
    
        #my_df = pd.DataFrame([x_n,y_m,z_accu])
        #my_df.to_csv('n_'+str(n)+'_m_'+str(m)+'_.csv', index=False, header=False)
        #fig = pylab.figure()
        #ax = Axes3D(fig)
        #ax.plot_trisurf(x_n,y_m,z_accu)
        #ax.set_xlabel('Number of Trees')
        #ax.set_ylabel('Number of Features')
        #ax.set_zlabel('Accuracy')    
        #plt.show()
    
        #print('Best Forrest : number of trees : ' + str(n_opt) + ', maximum of features : ' + str(m_opt) + ', with accuracy :' + str(accur_opt))
        return t_opt, f_opt, accur_opt


    # BEST WINDOW : Find best window ##############################################    
    def best_window(self, X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max):
        w_opt = 0
        t_opt = 0
        f_opt = 0
        accur_opt = 0.
        
        x_w = []
        y_accu= []
        
        # range of window : w_min --> w_max     
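        # NOTE: the per-window re-preprocessing below is commented out, so every
        # iteration evaluates the same X_train/y_train; differences between windows
        # come only from the randomness of the fitted forests.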
        for w in range(w_min,w_max+1):
            #X,y = preprocess_data(w)
            #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
            t, f, accur = self.best_forrest(X_train, y_train, 10, t_min, t_max, f_min, f_max)
            print('Window = ' + str(w) + ' days --> Best Forest: number of trees: ' + str(t) + ', max features: ' + str(f) + ', accuracy: ' + str(accur))
            
            if accur > accur_opt:
                w_opt, t_opt, f_opt, accur_opt = w, t, f, accur
            x_w.append(w)
            y_accu.append(accur)
            
        print('Best window: w = ' + str(w_opt) + '. Best Forest: number of trees: ' + str(t_opt) + ', max features: ' + str(f_opt) + ', accuracy: ' + str(accur_opt))
        return w_opt, t_opt, f_opt


    def build_model(self, X_train, y_train):
        if self.paras.load:
            model = self.load_training_model(self.paras.window_len)
            if model is not None:
                return model

        print('build Random Forest model...')

        # NOTE: 'index' was undefined in the original code; assuming the first entry
        # of the per-model parameter lists is the one intended here.
        index = 0

        # range of number of trees: grid indices t_min -> t_max (best_forrest scales each index by 16)
        t_min = self.paras.tree_min[index]
        t_max = self.paras.tree_max[index]
        # range of max features: f_min -> f_max
        f_min = self.paras.feature_min[index]
        f_max = self.paras.feature_max[index]
        # range of window: w_min -> w_max days
        w_min = self.paras.window_min
        w_max = self.paras.window_max
        
        w_opt, n_opt, m_opt = self.best_window(X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max)
        model = RandomForestClassifier(n_estimators=n_opt, max_features=m_opt, n_jobs=8, verbose=self.paras.verbose)
        return model

    def save_training_model(self, model, window_len):
        if self.paras.save:
            print('save Random Forest model...')
            filename = self.paras.model_folder + self.get_model_name(window_len) + '.pkl'
            joblib.dump(model, filename) 

    def load_training_model(self, window_len):
        filename = self.paras.model_folder + self.get_model_name(window_len) + '.pkl'
        if os.path.exists(filename):
            print('load Random Forest model...')
            return joblib.load(filename) 
        return None

# Classification
class random_forrest_classification(random_forrest_model):
    def __init__(self, paras):
        super(random_forrest_classification, self).__init__(paras=paras)


    ###################################
    ###                             ###
    ###          Training           ###
    ###                             ###
    ###################################

    def prepare_train_test_data(self, data_feature, LabelColumnName):
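        # Split each ticker's feature data into a 70/30 train/test split, then stack
        # the per-ticker splits into one combined training set and one test set.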
        firstloop = 1
        for ticker, data in data_feature.items():
            X, y = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X, y, test_size=0.3)
            # print('Train shape X:', X_train_temp.shape, ',y:', y_train_temp.shape)
            # print('Test shape X:', X_test_temp.shape, ',y:', y_test_temp.shape)

            if firstloop == 1:
                firstloop = 0
                X_train = X_train_temp
                X_test = X_test_temp
                y_train = y_train_temp
                y_test = y_test_temp
            else:
                X_train = np.append(X_train, X_train_temp, 0)
                X_test = np.append(X_test, X_test_temp, 0)
                y_train = np.append(y_train, y_train_temp, 0)
                y_test = np.append(y_test, y_test_temp, 0)

        #print('Train shape X:', X_train.shape, ',y:', y_train.shape)
        #print('Test shape X:', X_test.shape, ',y:', y_test.shape)
        return X_train, y_train, X_test, y_test


    def train_data(self, data_feature, LabelColumnName):
        #history = History()
        
        X_train, y_train, X_test, y_test = self.prepare_train_test_data(data_feature, LabelColumnName)

        model = self.build_model(X_train, y_train)

        model.fit(X_train, y_train)

        # save model
        self.save_training_model(model, self.paras.window_len)

        print(' ############## validation on test data ############## ')
        self.predict(model, X_test, y_test)

        # Random forests produce no epoch-wise training history, so there is no
        # loss/validation curve to plot here (plot_training_curve(history) would
        # fail: 'history' is never defined).
        #if self.paras.plot:
        #    self.plot_training_curve(history)

        return model


    ###################################
    ###                             ###
    ###         Predicting          ###
    ###                             ###
    ###################################

    def predict(self, model, X, y):
        predictions = model.predict_proba(X)
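        # NOTE: taking argmax over predict_proba assumes the class labels are exactly
        # 0 .. n_out_class-1; otherwise model.classes_ would be needed to map back.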
        if np.isfinite(y).all():
            print('Accuracy: ', accuracy_score(y, np.argmax(predictions, axis=1)))
        return predictions


    def predict_data(self, model, data_feature, LabelColumnName):

        if model is None:
            model = self.load_training_model(self.paras.window_len)

        if model is None:
            print('predict failed, model does not exist')
            return

        for ticker in self.paras.predict_tickers:
            try:
                data = data_feature[ticker]
            except KeyError:
                print('stock not prepared', ticker)
                continue

            X_train, y_train   = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=False)
            X_valid, y_valid   = preprocessing_data(self.paras, data[1], LabelColumnName, one_hot_label_proc=False)
            X_lately, y_lately = preprocessing_data(self.paras, data[2], LabelColumnName, one_hot_label_proc=False)

            possibility_columns = [str(self.paras.window_len) + '_' + str(idx) for idx in range(self.paras.n_out_class)]

            print('\n ---------- ', ticker, ' ---------- \n')
            print(' ############## validation on train data ############## ')
            predictions_train = self.predict(model, X_train, y_train)
            data[3].loc[data[0].index, 'label'] = y_train#np.argmax(y, axis=1) #- int(self.paras.n_out_class/2)
            data[3].loc[data[0].index, 'pred'] = np.argmax(predictions_train, axis=1) #- int(self.paras.n_out_class/2)
            s = pd.DataFrame(predictions_train, index = data[0].index, columns=possibility_columns)

            print(' ############## validation on valid data ############## ')
            predictions_valid = self.predict(model, X_valid, y_valid)
            data[3].loc[data[1].index, 'label'] = y_valid#np.argmax(y_valid, axis=1) #- int(self.paras.n_out_class/2)
            data[3].loc[data[1].index, 'pred'] = np.argmax(predictions_valid, axis=1) #- int(self.paras.n_out_class/2)
            s = pd.concat([s, pd.DataFrame(predictions_valid, index=data[1].index, columns=possibility_columns)])

            print(' ############## validation on lately data ############## ')
            predictions_lately = self.predict(model, X_lately, y_lately)
            data[3].loc[data[2].index, 'label'] = np.nan#np.argmax(actual_lately, axis=1)
            data[3].loc[data[2].index, 'pred'] = np.argmax(predictions_lately, axis=1) #- int(self.paras.n_out_class/2)
            s = pd.concat([s, pd.DataFrame(predictions_lately, index=data[2].index, columns=possibility_columns)])
            
            data[3] = pd.merge(data[3], s, how='outer', left_index=True, right_index=True)

            actual_count = []
            predict_count = []
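            # Per-class tallies: how many rows carry each label, and how many of
            # those rows the model predicted correctly.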
            for i in range(self.paras.n_out_class):
                actual_count.append(len(data[3][data[3]['label'] == i]))
                predict_count.append(len(data[3][(data[3]['label'] == i) & (data[3]['label'] == data[3]['pred'])]))

            valid_actual_count = []
            valid_predict_count = []
            data.append(data[3][-self.paras.valid_len:])
            for i in range(self.paras.n_out_class):
                valid_actual_count.append(len(data[4][data[4]['label'] == i]))
                valid_predict_count.append(len(data[4][(data[4]['label'] == i) & (data[4]['label'] == data[4]['pred'])]))

            print('\nclassification counter:\n', actual_count)
            print('\nclassification possibility:\n', 100*np.array(actual_count)/np.sum(actual_count))
            print('\nclassification train predict:\n', 100*np.array(predict_count)/np.array(actual_count))
            print('\nclassification valid predict:\n', 100*np.array(valid_predict_count)/np.array(valid_actual_count))

            timePeriod = [22*24, 22*12, 22*6, 22*3, 22*2, 22]
            pred_profit = data[3]["pred_profit"]
            pred_profit_len = len(pred_profit)
            centers_oris = []
            index_oris = []
            for time in timePeriod:
                if pred_profit_len < time: continue
                out_labels, counters, centers_ori = kmeans_claasification(pred_profit[pred_profit_len - time : pred_profit_len], self.paras.n_out_class)
                centers_oris.append(np.sort(centers_ori))
                index_oris.append(time)
            
            df_ori = pd.DataFrame(centers_oris, index=index_oris)
            df_ori.index.name = 'Days'
            print('\nclassification centers:\n', df_ori)
            
            # rewrite data frame and save / update
            data[3] = self.save_data_frame_mse(ticker, data[3], self.paras.window_len, possibility_columns)
            self.df = data[3]

            pd.set_option('display.max_rows', None)
            print('\n -------------------- \n')
            data[3]['label'] = data[3]['label'] - int(self.paras.n_out_class/2)
            data[3]['pred'] = data[3]['pred'] - int(self.paras.n_out_class/2)
            print(data[3][-(self.paras.pred_len + self.paras.valid_len):])
            


    ###################################
    ###                             ###
    ###       Save Data Output      ###
    ###                             ###
    ###################################

    def save_data_frame_mse(self, ticker, df, window_len, possibility_columns):
        # labels/predictions are left as-is here; cast with .astype(int) if integer output is needed
        #df['label'] = df['label'].astype(int)
        #df['pred'] = df['pred'].astype(int)
        
        # df = df.rename(columns={'label': 'a_+' + str(self.paras.pred_len) + '_d',
        #                         'pred': 'p_+' + str(self.paras.pred_len) + '_d'})
        # new_list = ['a_+' + str(self.paras.pred_len) + '_d', 'p_+' + str(self.paras.pred_len) + '_d']

        #default_list = ['open', 'high', 'low', 'close', 'volume']
        #original_other_list = set(df.columns) - set(default_list) - set(new_list)
        #original_other_list = list(original_other_list)
        default_list = ['close', 'volume', 'pred_profit']
        original_other_list = []
        new_list = ['label', 'pred']
        df = df[default_list + original_other_list + new_list + possibility_columns]
        
        if self.paras.save:
            #df.to_csv(self.paras.save_folder + ticker + ('_%.2f' % model_acc) + '_data_frame.csv')
            df.to_csv(self.paras.save_folder + ticker + '_' + str(window_len) + '.csv')
            with open(self.paras.save_folder + 'parameters.txt', 'w') as text_file:
                text_file.write(self.paras.__str__())
                #text_file.write(str(mses[0]) + '\n')
                #text_file.write(str(mses[1]) + '\n')
        return df


    ###################################
    ###                             ###
    ###        Main Entrance        ###
    ###                             ###
    ###################################

    def run(self, train, predict):
        ################################################################################
        self.paras.save_folder = self.get_save_directory()
        print(' Log  Directory: ', self.paras.save_folder)
        self.paras.model_folder = self.get_model_directory()
        print('Model Directory: ', self.paras.model_folder)
        ################################################################################

        LabelColumnName = 'label'
            
        data_feature = get_all_stocks_feature_data(self.paras, self.paras.window_len, LabelColumnName)

        model = None
            
        if train: model = self.train_data(data_feature, LabelColumnName)
            
        if predict: self.predict_data(model, data_feature, LabelColumnName)
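

# Usage sketch (illustrative only, not part of the original module): the `paras`
# argument is assumed to be the repo's parameter/config object (window_len,
# tree_min/max, feature_min/max, window_min/max, n_out_class, save/model folders,
# predict_tickers, ...), configured as in the repo's run scripts.
#
#   paras = ...                                       # build/configure the parameter object
#   classifier = random_forrest_classification(paras)
#   classifier.run(train=True, predict=True)          # train, save, then predict per ticker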