```from __future__ import print_function

import os
import numpy as np
from scipy import stats
from subprocess import call
from scipy.stats import genextreme, norm
from six import with_metaclass
from abc import ABCMeta, abstractmethod
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import bspline
from bspline import splinelab
from sklearn.datasets import make_regression
import pymc3 as pm

# -----------------
# Utility functions
# -----------------
def create_poly_basis(X, dimpoly):
""" compute a polynomial basis expansion of the specified order"""

if len(X.shape) == 1:
X = X[:, np.newaxis]
D = X.shape
Phi = np.zeros((X.shape, D*dimpoly))
colid = np.arange(0, D)
for d in range(1, dimpoly+1):
Phi[:, colid] = X ** d
colid += D

return Phi

def create_bspline_basis(xmin, xmax, p = 3, nknots = 5):
""" compute a Bspline basis set where:

:param p: order of spline (3 = cubic)
:param nknots: number of knots (endpoints only counted once)
"""

knots = np.linspace(xmin, xmax, nknots)
k = splinelab.augknt(knots, p)       # pad the knot vector
B = bspline.Bspline(k, p)
return B

def squared_dist(x, z=None):
""" compute sum((x-z) ** 2) for all vectors in a 2d array"""

# do some basic checks
if z is None:
z = x
if len(x.shape) == 1:
x = x[:, np.newaxis]
if len(z.shape) == 1:
z = z[:, np.newaxis]

nx, dx = x.shape
nz, dz = z.shape
if dx != dz:
raise ValueError("""
Cannot compute distance: vectors have different length""")

# mean centre for numerical stability
m = np.mean(np.vstack((np.mean(x, axis=0), np.mean(z, axis=0))), axis=0)
x = x - m
z = z - m

xx = np.tile(np.sum((x*x), axis=1)[:, np.newaxis], (1, nz))
zz = np.tile(np.sum((z*z), axis=1), (nx, 1))

dist = (xx - 2*x.dot(z.T) + zz)

return dist

def compute_pearsonr(A, B):
""" Manually computes the Pearson correlation between two matrices.

Basic usage::

compute_pearsonr(A, B)

where

:param A: an N * M data array
:param cov: an N * M array

:returns Rho: N dimensional vector of correlation coefficients
:returns ys2: N dimensional vector of p-values

Notes::

This function is useful when M is large and only the diagonal entries
of the resulting correlation matrix are of interest. This function
does not compute the full correlation matrix as an intermediate step"""

# N = A.shape
N = A.shape

# first mean centre
Am = A - np.mean(A, axis=0)
Bm = B - np.mean(B, axis=0)
# then normalize
An = Am / np.sqrt(np.sum(Am**2, axis=0))
Bn = Bm / np.sqrt(np.sum(Bm**2, axis=0))
del(Am, Bm)

Rho = np.sum(An * Bn, axis=0)
del(An, Bn)

# Fisher r-to-z
Zr = (np.arctanh(Rho) - np.arctanh(0)) * np.sqrt(N - 3)
N = stats.norm()
pRho = 2*N.cdf(-np.abs(Zr))
# pRho = 1-N.cdf(Zr)

return Rho, pRho

def explained_var(ytrue, ypred):
""" Computes the explained variance of predicted values.

Basic usage::

exp_var = explained_var(ytrue, ypred)

where

:ytrue: n*p matrix of true values where n is the number of samples
and p is the number of features.
:ypred: n*p matrix of predicted values where n is the number of samples
and p is the number of features.

:returns exp_var: p dimentional vector of explained variances for each feature.

"""

exp_var = 1 - (ytrue - ypred).var(axis = 0) / ytrue.var(axis = 0)

return exp_var

def compute_MSLL(ytrue, ypred, ypred_var, train_mean = None, train_var = None):
""" Computes the MSLL or MLL (not standardized) if 'train_mean' and 'train_var' are None.

Basic usage::

MSLL = compute_MSLL(ytrue, ypred, ytrue_sig, noise_variance, train_mean, train_var)

where

:ytrue          : n*p matrix of true values where n is the number of samples
and p is the number of features.
:ypred          : n*p matrix of predicted values where n is the number of samples
and p is the number of features.
:ypred_var      : n*p matrix of summed noise variances and prediction variances where n is the number of samples
and p is the number of features.
:train_mean     : p dimensional vector of mean values of the training data for each feature.
:train_var      : p dimensional vector of covariances of the training data for each feature.

:returns loss   : p dimensional vector of MSLL or MLL for each feature.

"""

if train_mean is not None and train_var is not None:

# make sure y_train_mean and y_train_sig have right dimensions (subjects x voxels):
Y_train_mean = np.repeat(train_mean, ytrue.shape, axis = 0)
Y_train_sig = np.repeat(train_var, ytrue.shape, axis = 0)

# compute MSLL:
loss = np.mean(0.5 * np.log(2 * np.pi * ypred_var) + (ytrue - ypred)**2 / (2 * ypred_var) -
0.5 * np.log(2 * np.pi * Y_train_sig) - (ytrue - Y_train_mean)**2 / (2 * Y_train_sig), axis = 0)

else:
# compute MLL:
loss = np.mean(0.5 * np.log(2 * np.pi * ypred_var) + (ytrue - ypred)**2 / (2 * ypred_var), axis = 0)

return loss

class WarpBase(with_metaclass(ABCMeta)):
""" Base class for likelihood warping following:
Rios and Torab (2019) Compositionally-warped Gaussian processes
https://www.sciencedirect.com/science/article/pii/S0893608019301856

All Warps must define the following methods::

Warp.get_n_params() - return number of parameters
Warp.f() - warping function (Non-Gaussian field -> Gaussian)
Warp.invf() - inverse warp
Warp.df() - derivatives
Warp.warp_predictions() - compute predictive distribution
"""

def __init__(self):
self.n_params = np.nan

def get_n_params(self):
""" Report the number of parameters required """

assert not np.isnan(self.n_params), \
"Warp function not initialised"

return self.n_params

def warp_predictions(self, mu, s2, param, percentiles=[0.025, 0.975]):
""" Compute the warped predictions from a gaussian predictive
distribution, specifed by a mean (mu) and variance (s2)

:param mu: Gassian predictive mean
:param s2: Predictive variance
:param param: warping parameters
:param percentiles: Desired percentiles of the warped likelihood

:returns: * median - median of the predictive distribution
* pred_interval - predictive interval(s)
"""

# Compute percentiles of a standard Gaussian
N = norm
Z = N.ppf(percentiles)

# find the median (using mu = median)
median = self.invf(mu, param)

# compute the predictive intervals (non-stationary)
pred_interval = np.zeros((len(mu), len(Z)))
for i, z in enumerate(Z):
pred_interval[:,i] = self.invf(mu + np.sqrt(s2)*z, param)

return median, pred_interval

@abstractmethod
def f(self, x, param):
""" Evaluate the warping function (mapping non-Gaussian respone
variables to Gaussian variables)"""

@abstractmethod
def invf(self, y, param):
""" Evaluate the warping function (mapping Gaussian latent variables
to non-Gaussian response variables) """

@abstractmethod
def df(self, x, param):
""" Return the derivative of the warp, dw(x)/dx """

class WarpAffine(WarpBase):
""" Affine warp
y = a + b*x
"""

def __init__(self):
self.n_params = 2

def _get_params(self, param):
if len(param) != self.n_params:
raise(ValueError,
'number of parameters must be ' + str(self.n_params))
return param, param

def f(self, x, params):
a, b = self._get_params(params)

y = a + b*x
return y

def invf(self, y, params):
a, b = self._get_params(params)

x = (y - a) / b

return x

def df(self, x, params):
a, b = self._get_params(params)

df = np.ones(x.shape)*b
return df

class WarpBoxCox(WarpBase):
""" Box cox transform having a single parameter (lambda), i.e.

y = (sign(x) * abs(x) ** lamda - 1) / lambda

This follows the generalization in Bicken and Doksum (1981) JASA 76
and allows x to assume negative values.
"""

def __init__(self):
self.n_params = 1

def _get_params(self, param):

return np.exp(param)

def f(self, x, params):
lam = self._get_params(params)

if lam == 0:
y = np.log(x)
else:
y = (np.sign(x) * np.abs(x) ** lam - 1) / lam
return y

def invf(self, y, params):
lam = self._get_params(params)

if lam == 0:
x = np.exp(y)
else:
x = np.sign(lam * y + 1) * np.abs(lam * y + 1) ** (1 / lam)

return x

def df(self, x, params):
lam = self._get_params(params)

dx = np.abs(x) ** (lam - 1)

return dx

class WarpSinArcsinh(WarpBase):
""" Sin-hyperbolic arcsin warp having two parameters (a, b) and defined by

y = sinh(b *  arcsinh(x) - a)

see Jones and Pewsey A (2009) Biometrika, 96 (4) (2009)
"""

def __init__(self):
self.n_params = 2

def _get_params(self, param):
if len(param) != self.n_params:
raise(ValueError,
'number of parameters must be ' + str(self.n_params))
return param, param

def f(self, x, params):
a, b = self._get_params(params)

y = np.sinh(b * np.arcsinh(x) - a)
return y

def invf(self, y, params):
a, b = self._get_params(params)

x = np.sinh((np.arcsinh(y)+a)/b)

return x

def df(self, x, params):
a, b = self._get_params(params)

dx = (b *np.cosh(b * np.arcsinh(x) - a))/np.sqrt(1 + x ** 2)

return dx

class WarpCompose(WarpBase):
""" Composition of warps. These are passed in as an array and
intialised automatically. For example::

W = WarpCompose(('WarpBoxCox', 'WarpAffine'))

where ell_i are lengthscale parameters and sf2 is the signal variance
"""

def __init__(self, warpnames=None):

if warpnames is None:
raise ValueError("A list of warp functions is required")
self.warps = []
self.n_params = 0
for wname in warpnames:
warp = eval(wname + '()')
self.n_params += warp.get_n_params()
self.warps.append(warp)

def f(self, x, theta):
theta_offset = 0

for ci, warp in enumerate(self.warps):
n_params_c = warp.get_n_params()
theta_c = [theta[c] for c in
range(theta_offset, theta_offset + n_params_c)]
theta_offset += n_params_c

if ci == 0:
fw = warp.f(x, theta_c)
else:
fw = warp.f(fw, theta_c)
return fw

def invf(self, x, theta):
theta_offset = 0
for ci, warp in enumerate(self.warps):
n_params_c = warp.get_n_params()
theta_c = [theta[c] for c in
range(theta_offset, theta_offset + n_params_c)]
theta_offset += n_params_c

if ci == 0:
finvw = warp.invf(x, theta_c)
else:
finvw = warp.invf(finvw, theta_c)

return finvw

def df(self, x, theta):
theta_offset = 0
for ci, warp in enumerate(self.warps):
n_params_c = warp.get_n_params()

theta_c = [theta[c] for c in
range(theta_offset, theta_offset + n_params_c)]
theta_offset += n_params_c

if ci == 0:
dfw = warp.df(x, theta_c)
else:
dfw = warp.df(dfw, theta_c)

return dfw

# -----------------------
# Functions for inference
# -----------------------

class CustomCV:
""" Custom cross-validation approach. This function does not do much, it
merely provides a wrapper designed to be compatible with
scikit-learn (e.g. sklearn.model_selection...)

:param train: a list of indices of training splits (each itself a list)
:param test: a list of indices of test splits (each itself a list)

:returns tr: Indices for training set
:returns te: Indices for test set """

def __init__(self, train, test, X=None, y=None):
self.train = train
self.test = test
self.n_splits = len(train)
if X is not None:
self.N = X.shape
else:
self.N = None

def split(self, X, y=None):
if self.N is None:
self.N = X.shape

for i in range(0, self.n_splits):
tr = self.train[i]
te = self.test[i]
yield tr, te

# -----------------------
# Functions for inference
# -----------------------

def bashwrap(processing_dir, python_path, script_command, job_name,
bash_environment=None):

""" This function wraps normative modelling into a bash script to run it
on a torque cluster system.

** Input:
* processing_dir     -> Full path to the processing dir
* python_path        -> Full path to the python distribution
* command to execute -> python command to execute
* job_name           -> Name for the bash script that is the output of
this function
* covfile_path       -> Full path to a .txt file that contains all
covariats (subjects x covariates) for the
responsefile
* respfile_path      -> Full path to a .txt that contains all features
(subjects x features)
* cv_folds           -> Number of cross validations
* testcovfile_path   -> Full path to a .txt file that contains all
covariats (subjects x covariates) for the
testresponse file
* testrespfile_path  -> Full path to a .txt file that contains all
test features
* bash_environment   -> A file containing the necessary commands
for your bash environment to work

** Output:
* A bash.sh file containing the commands for normative modelling saved
to the processing directory

witten by Thomas Wolfers
"""

# change to processing dir
os.chdir(processing_dir)
output_changedir = ['cd ' + processing_dir + '\n']

# sets bash environment if necessary
if bash_environment is not None:
bash_environment = [bash_environment]
print("""Your own environment requires in any case:
else:
bash_lines = '#!/bin/bash\n\n'
bash_environment = [bash_lines + bash_cores]

command = [python_path + ' ' + script_command + '\n']

# writes bash file into processing dir
bash_file_name = os.path.join(processing_dir, job_name + '.sh')
with open(bash_file_name, 'w') as bash_file:
bash_file.writelines(bash_environment + output_changedir + command)

# changes permissoins for bash.sh file
os.chmod(bash_file_name, 0o700)

return bash_file_name

def qsub(job_path, memory, duration, logdir=None):
"""
This function submits a job.sh scipt to the torque custer using the qsub
command.

** Input:
* job_path      -> Full path to the job.sh file
* memory        -> Memory requirements written as string for example
4gb or 500mb
* duration       -> The approximate duration of the job, a string with
HH:MM:SS for example 01:01:01

** Output:
* Submission of the job to the (torque) cluster

witten by Thomas Wolfers
"""
if logdir is None:
logdir = os.path.expanduser('~')

# created qsub command
qsub_call = ['echo ' + job_path + ' | qsub -N ' + job_path + ' -l ' +
'mem=' + memory + ',walltime=' + duration +
' -e ' + logdir + ' -o ' + logdir]

# submits job to cluster
call(qsub_call, shell=True)

def extreme_value_prob_fit(NPM, perc):
n = NPM.shape
t = NPM.shape
n_perc = int(round(t * perc))
m = np.zeros(n)
for i in range(n):
temp =  np.abs(NPM[i, :])
temp = np.sort(temp)
temp = temp[t - n_perc:]
temp = temp[0:int(np.floor(0.90*temp.shape))]
m[i] = np.mean(temp)
params = genextreme.fit(m)
return params

def extreme_value_prob(params, NPM, perc):
n = NPM.shape
t = NPM.shape
n_perc = int(round(t * perc))
m = np.zeros(n)
for i in range(n):
temp =  np.abs(NPM[i, :])
temp = np.sort(temp)
temp = temp[t - n_perc:]
temp = temp[0:int(np.floor(0.90*temp.shape))]
m[i] = np.mean(temp)
probs = genextreme.cdf(m,*params)
return probs

def ravel_2D(a):
s = a.shape
return np.reshape(a,[s, np.prod(s[1:])])

def unravel_2D(a, s):
return np.reshape(a,s)

def threshold_NPM(NPMs, fdr_thr=0.05, npm_thr=0.1):
""" Compute voxels with significant NPMs. """
p_values = stats.norm.cdf(-np.abs(NPMs))
results = np.zeros(NPMs.shape)
for i in range(p_values.shape):

def FDR(p_values, alpha):
""" Compute the false discovery rate in all voxels for a subject. """
dim = np.shape(p_values)
p_values = np.reshape(p_values,[np.prod(dim),])
sorted_p_values = np.sort(p_values)
sorted_p_values_idx = np.argsort(p_values);
testNum = len(p_values)
thresh = ((np.array(range(testNum)) + 1)/np.float(testNum))  * alpha
h = sorted_p_values <= thresh
unsort = np.argsort(sorted_p_values_idx)
h = h[unsort]
h = np.reshape(h, dim)
return h

def calibration_error(Y,m,s,cal_levels):
ce = 0
for cl in cal_levels:
z = np.abs(norm.ppf((1-cl)/2))
ub = m + z * s
lb = m - z * s
ce = ce + np.abs(cl - np.sum(np.logical_and(Y>=lb,Y<=ub))/Y.shape)
return ce

def simulate_data(method='linear', n_samples=100, n_features=1, n_grps=1,
working_dir=None, plot=False, random_state=None, noise=None):
"""
This function simulates linear synthetic data for testing nispat methods.

- Inputs:

- method: simulate 'linear' or 'non-linear' function.

- n_samples: number of samples in each group of the training and test sets.
If it is an int then the same sample number will be used for all groups.
It can be also a list of size of n_grps that decides the number of samples
in each group (default=100).

- n_features: A positive integer that decides the number of features
(default=1).

- n_grps: A positive integer that decides the number of groups in data
(default=1).

- working_dir: Directory to save data (default=None).

- plot: Boolean to plot the simulated training data (default=False).

- random_state: random state for generating random numbers (Default=None).

- noise: Type of added noise to the data. The options are 'gaussian',
'exponential', and 'hetero_gaussian' (The defauls is None.).

- Outputs:

- X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef

"""

if isinstance(n_samples, int):
n_samples = [n_samples for i in range(n_grps)]

X_train, Y_train, X_test, Y_test = [], [], [], []
grp_id_train, grp_id_test = [], []
coef = []
for i in range(n_grps):
bias = np.random.randint(-10, high=10)

if method == 'linear':
X_temp, Y_temp, coef_temp = make_regression(n_samples=n_samples[i]*2,
n_features=n_features, n_targets=1,
noise=10 * np.random.rand(), bias=bias,
n_informative=1, coef=True,
random_state=random_state)
elif method == 'non-linear':
X_temp = np.random.randint(-2,6,[2*n_samples[i], n_features]) \
+ np.random.randn(2*n_samples[i], n_features)
Y_temp = X_temp[:,0] * 20 * np.random.rand() + np.random.randint(10,100) \
* np.sin(2 * np.random.rand() + 2 * np.pi /5 * X_temp[:,0])
coef_temp = 0
elif method == 'combined':
X_temp = np.random.randint(-2,6,[2*n_samples[i], n_features]) \
+ np.random.randn(2*n_samples[i], n_features)
Y_temp = (X_temp[:,0]**3) * np.random.uniform(0, 0.5) \
+ X_temp[:,0] * 20 * np.random.rand() \
+ np.random.randint(10, 100)
coef_temp = 0
else:
raise ValueError("Unknow method. Please specify valid method among \
'linear' or  'non-linear'.")
coef.append(coef_temp/100)
X_train.append(X_temp[:X_temp.shape//2])
Y_train.append(Y_temp[:X_temp.shape//2]/100)
X_test.append(X_temp[X_temp.shape//2:])
Y_test.append(Y_temp[X_temp.shape//2:]/100)
grp_id = np.repeat(i, X_temp.shape)
grp_id_train.append(grp_id[:X_temp.shape//2])
grp_id_test.append(grp_id[X_temp.shape//2:])

if noise == 'hetero_gaussian':
t = np.random.randint(5,10)
Y_train[i] = Y_train[i] + np.random.randn(Y_train[i].shape) / t \
* np.log(1 + np.exp(X_train[i][:,0]))
Y_test[i] = Y_test[i] + np.random.randn(Y_test[i].shape) / t \
* np.log(1 + np.exp(X_test[i][:,0]))
elif noise == 'gaussian':
t = np.random.randint(3,10)
Y_train[i] = Y_train[i] + np.random.randn(Y_train[i].shape)/t
Y_test[i] = Y_test[i] + np.random.randn(Y_test[i].shape)/t
elif noise == 'exponential':
t = np.random.randint(1,3)
Y_train[i] = Y_train[i] + np.random.exponential(1, Y_train[i].shape) / t
Y_test[i] = Y_test[i] + np.random.exponential(1, Y_test[i].shape) / t
elif noise == 'hetero_gaussian_smaller':
t = np.random.randint(5,10)
Y_train[i] = Y_train[i] + np.random.randn(Y_train[i].shape) / t \
* np.log(1 + np.exp(0.3 * X_train[i][:,0]))
Y_test[i] = Y_test[i] + np.random.randn(Y_test[i].shape) / t \
* np.log(1 + np.exp(0.3 * X_test[i][:,0]))
X_train = np.vstack(X_train)
X_test = np.vstack(X_test)
Y_train = np.concatenate(Y_train)
Y_test = np.concatenate(Y_test)
grp_id_train = np.expand_dims(np.concatenate(grp_id_train), axis=1)
grp_id_test = np.expand_dims(np.concatenate(grp_id_test), axis=1)

for i in range(n_features):
plt.figure()
for j in range(n_grps):
plt.scatter(X_train[grp_id_train[:,0]==j,i],
Y_train[grp_id_train[:,0]==j,], label='Group ' + str(j))
plt.xlabel('X' + str(i))
plt.ylabel('Y')
plt.legend()

if working_dir is not None:
if not os.path.isdir(working_dir):
os.mkdir(working_dir)
with open(os.path.join(working_dir ,'trbefile.pkl'), 'wb') as file:
pickle.dump(pd.DataFrame(grp_id_train),file)
with open(os.path.join(working_dir ,'tsbefile.pkl'), 'wb') as file:
pickle.dump(pd.DataFrame(grp_id_test),file)
with open(os.path.join(working_dir ,'X_train.pkl'), 'wb') as file:
pickle.dump(pd.DataFrame(X_train),file)
with open(os.path.join(working_dir ,'X_test.pkl'), 'wb') as file:
pickle.dump(pd.DataFrame(X_test),file)
with open(os.path.join(working_dir ,'Y_train.pkl'), 'wb') as file:
pickle.dump(pd.DataFrame(Y_train),file)
with open(os.path.join(working_dir ,'Y_test.pkl'), 'wb') as file:
pickle.dump(pd.DataFrame(Y_test),file)

return X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef

def divergence_plot(nm, ylim=None):

if nm.hbr.configs['n_chains'] > 1 and nm.hbr.model_type != 'nn':
a = pm.summary(nm.hbr.trace).round(2)
plt.figure()
plt.hist(a['r_hat'],10)
plt.title('Gelman-Rubin diagnostic for divergence')

divergent = nm.hbr.trace['diverging']

tracedf = pm.trace_to_dataframe(nm.hbr.trace)

_, ax = plt.subplots(2, 1, figsize=(15, 4), sharex=True, sharey=True)
ax.plot(tracedf.values[divergent == 0].T, color='k', alpha=.05)
ax.set_title('No Divergences', fontsize=10)
ax.plot(tracedf.values[divergent == 1].T, color='C2', lw=.5, alpha=.5)
ax.set_title('Divergences', fontsize=10)
plt.ylim(ylim)
plt.xticks(range(tracedf.shape), list(tracedf.columns))
plt.xticks(rotation=90, fontsize=7)
plt.tight_layout()
plt.show()```