"""
Multi-layer neural network for a single-variable or two-variable normal distribution.
"""

import os
import sys
import time
import math

import numpy

import theano
import theano.tensor as T

#from Optimizers import AdaGrad, AdaDelta, SGDMomentum, GD
from Adams import Adam
#from LogReg import LogisticRegression as LogReg

# start-snippet-1
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None, activation=T.tanh):
        """
        Typical hidden layer of a MLP: units are fully-connected and have
        user-specified activation function. Weight matrix W is of shape (n_in,n_out)
        and the bias vector b is of shape (n_out,).

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden layer
        """

        self.input = input
        self.n_in = n_in
        self.n_out = n_out


        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == T.nnet.sigmoid:
                W_values *= 4
            W = theano.shared(value=W_values, name='HL_W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='HL_b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        self.output = ( lin_output if activation is None else activation(lin_output) )
        
        # parameters of the model
        self.params = [self.W, self.b]
        self.paramL1 = abs(self.W).sum() + abs(self.b).sum()
        self.paramL2 = (self.W**2).sum() + (self.b**2).sum()
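
# A minimal usage sketch for HiddenLayer (illustrative only; the variable names
# below are hypothetical and not part of this module):
#
#   rng = numpy.random.RandomState(1234)
#   x = T.matrix('x')
#   layer = HiddenLayer(rng=rng, input=x, n_in=3, n_out=4, activation=T.tanh)
#   f = theano.function([x], layer.output)
#   f(numpy.ones((2, 3), dtype=theano.config.floatX))   # returns an array of shape (2, 4)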

"""
## x is a matrix with shape (batchSize, 2)
## this function returns T.prod(x, axis=1, keepdims=True)
## we reimplement this because theano has bugs with T.prod()
def MyProd(x):
	y = T.mul(x[:, 0], x[:, 1])
	return y.dimshuffle(0, 'x')
"""

class NN4Normal(object):
    	"""neural network for single-variable or two-variable normal distribution

    	A multi-layer feedforward artificial neural network for normal distribution
    	that has one layer or more of hidden units and nonlinear activations.
    	"""

	## sigma_sqr_min is the minimum value of sigma_sqr. It needs to be positive
    	def __init__(self, rng, input=None, n_in=1, n_variables=2, n_out=5, n_hiddens=[], mymean=None, sigma_sqr_min=numpy.float32(0.0001), rho_abs_max=numpy.float32(0.99)):
        	"""
        	rng: a random number generator used to initialize weights
	
		input has shape (batchSize, n_in)
		n_in is the number of input features

		n_variables is the number of response variables; only 1 or 2 are supported

		the output has shape (batchSize, n_out), where n_out is the number of parameters
		defining the normal distribution:
		when n_variables = 1, n_out = 1 (mean only) or 2 (mean and variance)
		when n_variables = 2, n_out = 2 (means only), 4 (means and variances), or 5 (means, variances and correlation)

        	n_hiddens: a list or tuple giving the number of hidden units in each hidden layer

		if you already have the mean and only want to estimate the variance (and correlation), provide it through mymean

        	"""
		## check the consistency between n_variables and n_out
		if n_variables == 1:
			assert ( n_out == 1 or n_out == 2)
		elif n_variables == 2:
			assert ( n_out == 2 or n_out == 4 or n_out == 5)
		else:
			print 'ERROR: n_variables can only be 1 or 2'
			sys.exit(-1)

		self.n_variables = n_variables
        	self.input = input
        	self.n_in = n_in
		self.n_out = n_out
        	self.n_hiddens = n_hiddens

        	self.params = []
        	self.paramL1 = 0
        	self.paramL2 = 0

        	self.hlayers = []
		self.layers = []

        	output_in_last_layer = input
        	n_out_in_last_layer = n_in

            	## add hidden layers
        	for i in xrange(len(n_hiddens)):
            		hiddenLayer = HiddenLayer( rng = rng, input = output_in_last_layer, n_in = n_out_in_last_layer, n_out = n_hiddens[i], activation = T.nnet.relu ) 
            		self.hlayers.append(hiddenLayer)
            		output_in_last_layer = hiddenLayer.output
            		n_out_in_last_layer = n_hiddens[i]

		self.layers = self.hlayers

		self.mean = None
		self.sigma_sqr = None
		self.corr = None

		self.params4var = []
		self.paramL14var = 0
		self.paramL24var = 0


		if mymean is not None:
			self.mean = mymean
		else:
			## calculate the mean
			uLayer = HiddenLayer( rng = rng, input = output_in_last_layer, n_in = n_out_in_last_layer, n_out = n_variables, activation = None )
			self.mean = uLayer.output
			self.layers.append(uLayer)

		if n_out >= (2 * n_variables):
			## calculate sigma_sqr; the variance must be positive, so we use ReLU and then add sigma_sqr_min to keep it strictly positive
			sigmaLayer = HiddenLayer( rng = rng, input = output_in_last_layer, n_in = n_out_in_last_layer, n_out = n_variables, activation = T.nnet.relu )
			self.sigma_sqr = sigmaLayer.output + sigma_sqr_min
			self.layers.append(sigmaLayer)

			self.params4var += sigmaLayer.params
			self.paramL14var += sigmaLayer.paramL1
			self.paramL24var += sigmaLayer.paramL2

		if n_out == 5:
			## calculate the correlation; tanh keeps it in (-1, 1) and rho_abs_max bounds it away from -1 and 1
			corrLayer = HiddenLayer( rng = rng, input = output_in_last_layer, n_in = n_out_in_last_layer, n_out = 1, activation = T.tanh )
			self.corr = corrLayer.output * rho_abs_max
			self.layers.append(corrLayer)

			self.params4var += corrLayer.params
			self.paramL14var += corrLayer.paramL1
			self.paramL24var += corrLayer.paramL2


		for layer in self.layers:
			self.params += layer.params
			self.paramL1 += layer.paramL1
			self.paramL2 += layer.paramL2

		self.y_pred = self.mean 

        	outputList = [ self.mean ]
		if self.sigma_sqr is not None:
			outputList.append(self.sigma_sqr)

		if self.corr is not None:
			outputList.append(self.corr)

		self.output = T.concatenate(outputList, axis=1)
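		## column layout of self.output: the first n_variables columns are the mean,
		## the next n_variables columns (if present) are sigma_sqr, and the final
		## column (if present) is the correlation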

	## y has shape (batchSize, n_variables), sampleWeight has shape (batchSize, 1) instead of (batchSize,)
	## this function returns a scalar
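	## For reference, the per-example quantity computed below is the standard normal
	## negative log-likelihood:
	##   1 variable : log(2*pi)/2 + log(sigma^2)/2 + (y - mu)^2 / (2*sigma^2)
	##   2 variables: log(2*pi) + log(sigma1*sigma2) + log(1 - rho^2)/2
	##                + [ e1^2/sigma1^2 + e2^2/sigma2^2 - 2*rho*e1*e2/(sigma1*sigma2) ] / (2*(1 - rho^2)),
	##                with e_i = y_i - mu_i
	## When useMeanOnly is set (or the corresponding output layer is absent), sigma^2 is
	## fixed to 1 and rho to 0, so the variance and correlation terms drop out.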
    	def NLL(self, y, useMeanOnly=False, sampleWeight=None):

		assert (y.ndim == 2)

		pi = numpy.pi

		if self.n_variables == 1:
			e = T.sqr( y -self.mean )/2.
			nll = numpy.log(2*pi)/2.
			
			if useMeanOnly or (self.sigma_sqr is None):
				nll = nll + e
			else:
				e = e / self.sigma_sqr
				nll = nll + e + T.log(self.sigma_sqr)/2.

		else:
			err = y - self.mean
			err_sqr = T.sqr( err )

			if useMeanOnly or (self.sigma_sqr is None):
				sig_sqr = T.ones_like(err_sqr)
			else:
				sig_sqr = self.sigma_sqr

			nll = T.sum(T.log(sig_sqr) + numpy.log(2*pi), axis=1, keepdims=True)/2.

			e = T.sum( err_sqr/sig_sqr, axis=1, keepdims=True )

			sig = T.sqrt( sig_sqr )
			f = T.prod( err/sig, axis=1, keepdims=True )

			if useMeanOnly or (self.corr is None):
				rho = T.zeros_like(e)
			else:
				rho = self.corr
				
			g = e - T.mul(rho, f) * 2.

			rho_sqr = T.sqr(rho)
			h = g / (2 * ( 1 - rho_sqr ) )

			nll = nll + h + T.log(1 - rho_sqr)/2. 


		if sampleWeight is None:
			return T.mean(nll)
		return T.sum(T.mul(nll, sampleWeight) )/T.sum(sampleWeight)


	## y has shape (batchSize, n_variables), sampleWeight shall have shape (batchSize, 1) instead of (batchSize,)
	## this function returns a vector
    	def errors(self, y, sampleWeight=None):
		assert (y.ndim == 2)
		err_sqr = T.sqr( y - self.y_pred )
		if sampleWeight is None:
			return T.sqrt(T.mean(err_sqr, axis=0 ) )

		assert (sampleWeight.ndim == 2)
		if self.n_variables == 1:
			weight = sampleWeight
		else:
			weight = T.concatenate( [ sampleWeight, sampleWeight], axis=1 )
		return T.sqrt( T.sum(T.mul( err_sqr, weight ), axis=0)/ T.sum(sampleWeight) )

	## y has shape (batchSize, n_variables), sampleWeight shall have shape (batchSize, 1) instead of (batchSize,)
    	def loss(self, y, useMeanOnly=False, sampleWeight=None):
		if useMeanOnly:
			return self.NLL(y, useMeanOnly=useMeanOnly, sampleWeight=sampleWeight)
		else:
        		return self.NLL(y, sampleWeight=sampleWeight)



def testNN4Normal(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=2000, n_hiddens=[100, 200], trainData=None, testData=None):

    	## generate some random train and test data
	batchSize = 200000
	nFeatures = 50

    	trainX = numpy.random.uniform(0, 1, (batchSize, nFeatures)).astype(numpy.float32)
    	u1 = numpy.sum( trainX[:,:30], axis=1, keepdims=True)
    	u2 = numpy.sum( trainX[:,21:], axis=1, keepdims=True)
	trainY = (numpy.random.normal(0, 2., (batchSize, 2)) +  numpy.concatenate( (u1, u2), axis=1) ).astype(numpy.float32)

	testBatchSize = 500
    	testX = numpy.random.uniform(0, 1, (testBatchSize, nFeatures)).astype(numpy.float32)
    	testu1 = numpy.sum(testX[:,:30], axis=1, keepdims=True)
    	testu2 = numpy.sum(testX[:,21:], axis=1, keepdims=True)
	testY = (numpy.random.normal(0, 2., (testBatchSize, 2)) + numpy.concatenate( (testu1, testu2), axis=1) ).astype(numpy.float32)
	testCorr = numpy.sum(testX[:, 21:30], axis=1, keepdims=True)/numpy.sum(testX, axis=1, keepdims=True)
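
	## Note on the synthetic data above: each response column is the sum of a block of
	## input features plus independent zero-mean Gaussian noise with standard deviation 2,
	## and the two blocks (columns 0..29 and 21..49) overlap in columns 21..29. testCorr,
	## the ratio of the overlap sum to the total feature sum, is used below only as a
	## per-example reference value to compare against the predicted correlation.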

    	######################
    	# BUILD ACTUAL MODEL #
    	######################
    	print('... building the model')

    	# allocate symbolic variables for the data

    	x = T.matrix('x')  # the input feature
    	y = T.matrix('y')  # the response

    	rng = numpy.random.RandomState()

    	regressor = NN4Normal(rng, input=x, n_in=trainX.shape[1], n_variables = 2, n_out = 5, n_hiddens=n_hiddens, sigma_sqr_min=0.01)

	loss = regressor.loss(y)
    	cost = loss + L1_reg * regressor.paramL1 + L2_reg * regressor.paramL2
	error = regressor.errors(y)

    	gparams = [T.grad(cost, param) for param in regressor.params]
    	param_shapes = [ param.shape.eval() for param in regressor.params ]
    	updates, others = Adam(regressor.params, gparams) 
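    	## Adam comes from the local Adams module; from the call above it is assumed to
    	## return (parameter updates, auxiliary shared variables), and only the updates
    	## are passed to theano.function below.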

    	train = theano.function( inputs=[x,y], outputs=[loss, error, regressor.paramL1, regressor.paramL2], updates=updates)
    	test = theano.function( inputs=[x,y], outputs=error)
    	calculate = theano.function( inputs=[x], outputs=regressor.output )

    	step = 200
	numEpochs = 13
	for j in range(0, numEpochs):
		results = []
    		for i in range(0,trainX.shape[0], step):
        		los, err, l1, l2 = train(trainX[i:i+step, :], trainY[i:i+step, :])
			results.append( los )
			if i%5000 == 0:
				print 'i=', i, ' loss=', los, ' error=', err, ' L1norm=', l1, ' L2norm=', l2
		print 'j=', j, ' avgLos, avgErr=', numpy.mean(results, axis=0)


	out = calculate(testX)
	print numpy.concatenate( (out, testCorr, testY), axis=1).astype(numpy.float16)

	print 'err=', test(testX, testY)
	corr = numpy.concatenate( (out[:,4:5], testCorr), axis=1)
	print numpy.corrcoef( numpy.transpose(corr) )

	from scipy.stats import mstats
	print mstats.spearmanr(corr[:,0], corr[:,1])


if __name__ == '__main__':
    	testNN4Normal()