# -*- coding: utf-8 -*- """ @author: Christina Papagiannopoulou @purpose: Example script which calculates the linear and non-linear Granger causality quantification for one pixel on the globe. @reference: Papagiannopoulou et al., 2017. A non-linear Granger-causality framework to investigate climate–vegetation dynamics. Geoscentific Model Development, 10, 1–16. DOI: 10.5194/gmd-10-1-2017. @input: test.csv @arguments: 1) Input file 2) linear OR non-linear @execute: python GC_script.py test.csv linear python GC_script.py test.csv non-linear """ import numpy as np from sklearn.preprocessing import Imputer from sklearn.ensemble import RandomForestRegressor from numpy import genfromtxt import time import sys from sklearn.cross_validation import KFold from sklearn.metrics import r2_score from sklearn.linear_model import Ridge from sklearn.grid_search import GridSearchCV import os #%%functions #parameters: vector 'arr' and an integer 'num' #returns: a 'num'-times shifted vector def shift2(arr,num): arr=np.roll(arr,num) if num < 0: np.put(arr,range(len(arr)+num,len(arr)),np.nan) elif num > 0: np.put(arr,range(num),np.nan) return arr #parameters: string 'inpath' which is the path of the .csv dataset #returns: the dataset in a numpy array def readFile(inpath): if os.path.isfile(inpath): dataset = genfromtxt(open(inpath,'r'), delimiter=',', dtype='f8')[0:] imp = Imputer(missing_values='NaN', strategy='mean', axis=0)# fill in the missing values with the mean of each column transformedData = imp.fit_transform(dataset) rmvedCols = imp.statistics_ idxRmved = np.where(np.isnan(rmvedCols))#take the indices of the nan columns nanTarget = dataset.shape[1]-1 in idxRmved[0]#check if the target is a nan column if nanTarget: raise ValueError("The target variable contains only nan values or inf") else: raise ValueError("File does not exist") return transformedData #parameters: vector 'target' which is the target variable #returns: the dataset which includes the previous values of the target def createAuto(target): win=13 # window size, how many previous values we take of the target (here 12 because the range goes from 1-12 without the 13) dataAuto = np.empty((len(target),win-1)) for i in range(1,win): dataAuto[:,i-1] = shift2(target, i) imp = Imputer(missing_values='NaN', strategy='mean', axis=0) transformedDataAuto = imp.fit_transform(dataAuto) X_auto = transformedDataAuto return X_auto #parameters: 'X' the predictors, 'y' the target, 'cvFolds' number of folds, 'estimator' machine learning algorithm #returns: the R squared for each fold def crossValidation(X, y, cvFolds, estimator): r2 = np.zeros((cvFolds,1)) kf = KFold(len(X), n_folds=cvFolds, shuffle=True, random_state = 30) cv_j=0 for train_index, test_index in kf: train_X = X[train_index,:] test_X = X[test_index,:] train_y = y[train_index] test_y = y[test_index] est.fit(train_X,train_y) y_true, y_pred = test_y,est.predict(test_X) r2[cv_j] = r2_score(y_true, y_pred) cv_j = cv_j + 1 return r2 #parameters: 'X' the predictors, 'y' the target, 'cvFolds' number of folds, 'estimator' machine learning algorithm #returns: the R squared for each fold def nestedCrossValidation(X, y, cvFolds, estimator): kf = KFold(len(X), n_folds=cvFolds, shuffle=True, random_state = 30) cv_j=0 param_grid = {'alpha': [0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000,10000,100000, 1000000, 10000000,1000000000]} r2 = np.zeros((cvFolds,1)) for train_index, test_index in kf: train_X = X[train_index,:] test_X = X[test_index,:] train_y = y[train_index] test_y = y[test_index] grid = GridSearchCV(estimator, param_grid=param_grid, verbose=0, cv=cvFolds, scoring='mean_squared_error') grid.fit(train_X,train_y) y_true, y_pred = test_y,grid.best_estimator_.predict(test_X) r2[cv_j] = r2_score(y_true, y_pred) cv_j = cv_j + 1 return r2 #%% main script if len(sys.argv) < 3: raise ValueError('Two arguments are needed: inpath \'linear\'(or \'non-linear\')') inpath = sys.argv[1] # 1st arg, the file path GC_mode = sys.argv[2] #2nd arg, the mode "non-linear" or "linear" data = readFile(inpath) X = data[:,1:data.shape[1]-1] #exclude the first column because it is the timestamp column y = data[:,data.shape[1]-1] #the target variable is the last column X1 = createAuto(y) #take the autocorrelation features X = np.concatenate((X,X1), axis=1) cvFolds = 5 #number of folds in cross-validation if(len(X[0])!=0): #check if there is at least one predictor start_time = time.clock() if GC_mode == "non-linear": est = RandomForestRegressor(random_state=0, n_estimators=100, max_features='sqrt') R_squared_full = crossValidation(X, y, cvFolds, est) R_squared_baseline = crossValidation(X1, y, cvFolds, est) elif GC_mode == "linear": est = Ridge(copy_X=True, fit_intercept=True, max_iter=None, normalize=True, solver='lsqr', tol=0.001) R_squared_full = nestedCrossValidation(X, y, cvFolds, est) R_squared_baseline = nestedCrossValidation(X1, y, cvFolds, est) else: raise ValueError("The 2nd argument should be \'non-linear\' or \'linear\'") else: raise ValueError("Number of predictors is zero!") print ('%s Granger causality' %GC_mode) GC_dif = np.mean(R_squared_full) - np.mean(R_squared_baseline) print ("Explained variance of baseline model: %f" %np.mean(R_squared_baseline)) print ("Explained variance of full model: %f" %np.mean(R_squared_full)) print ("Quantification of Granger causality: %f" %GC_dif) runtime = time.clock() - start_time print ("Total time: %d seconds" %runtime)