from __future__ import division
import tensorflow as tf
import numpy as np


class CRBM(object):
  """CONVOLUTIONAL RESTRICTED BOLTZMANN MACHINE"""
  
  def __init__(self, name, fully_connected = True, v_height = 1, v_width = 1, v_channels = 784, f_height = 1, f_width = 1, f_number = 400, 
               init_biases_H = -3, init_biases_V = 0.01, init_weight_stddev = 0.01, 
               gaussian_unit = True, gaussian_variance = 0.2, 
               prob_maxpooling = False, padding = False,
               batch_size = 20, learning_rate = 0.0001, learning_rate_decay = 0.5, momentum = 0.9, decay_step = 50000,  
               weight_decay = 0.1, sparsity_target = 0.1, sparsity_coef = 0.1):
    """INTENT : Initialization of a Convolutional Restricted Boltzmann Machine
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    name                  :         name of the RBM
    fully_connected       :         specify if the RBM is fully connected (True) or convolutional (False)     |   if True then all visible and filter heights and widths must be 1
    v_height              :         height of the visible layer (input)
    v_width               :         width of the visible layer (input)
    v_channels            :         numbers of channels of the visible layer (input)
    f_height              :         height of the filter to apply to the visible layer 
    f_width               :         width of the filter to apply to the visible layer 
    f_number              :         number of filters to apply to the visible layer
    init_biases_H         :         initialization value for the bias of the hidden layer
    init_biases_V         :         initialization value for the bias of the visible layer
    init_weight_stddev    :         initialization value of the standard deviation for the kernel
    gaussian_unit         :         True if using gaussian units for the visible layer, False if using binary units
    gaussian_variance     :         value of the variance of the gaussian distribution of the visible layer (only for gaussian visible units)
    prob_maxpooling       :         True if the CRBM also includes a probabilistic max pooling layer on top of the hidden layer (only for convolutional RBM)
    padding               :         True if the visible and hidden layers have the same spatial dimensions (only for convolutional RBM)
    batch_size            :         size of the batch
    learning_rate         :         learning rate for the gradient update
    learning_rate_decay   :         value of the exponential learning rate decay
    momentum              :         coefficient of the momentum term in the gradient descent
    decay_step            :         number of steps before applying the learning rate decay
    weight_decay          :         coefficient of the l2 norm weight regularization
    sparsity_target       :         target probability for the activation of the hidden units
    sparsity_coef         :         coefficient of the sparsity regularization term
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : Attributes self.vitesse_kernels | self.vitesse_biases_V | self.vitesse_biases_H are required to compute the gradient with momentum
             Attribute self.sigma is the variance of the distribution from which samples are drawn for the visible layer when it uses gaussian units
    ------------------------------------------------------------------
    REMARK : We have that     visible     =   [batch_size,visible_height,visible_width,visible_channels] 
            convolved with   kernels     =   [filter_height, filter_width,visible_channels,filter_number] 
            to give          hidden      =   [batch_size,hidden_height,hidden_width,filter_number]
    ------------------------------------------------------------------
    REMARK : This class can represent different kinds of RBM
             fully connected RBM              if  fully_connected = True and then v_height=v_width=f_height=f_width=1
             gaussian RBM                     if  gaussian_unit = True     (https://web.eecs.umich.edu/~honglak/iccv2011-sparseConvLearning.pdf)
             sparse RBM                       if  sparsity_coef != 0       (http://ai.stanford.edu/~ang/papers/nips07-sparsedeepbeliefnetworkv2.pdf   -    https://www.ii.pwr.edu.pl/~gonczarek/papers/rbm_sparse.pdf)
             probabilistic maxpooling CRBM    if  prob_maxpooling = True   (http://www.cs.toronto.edu/~rgrosse/icml09-cdbn.pdf)"""

    try:
      if fully_connected and not (v_height == 1 and v_width == 1 and f_height == 1 and f_width == 1):
        raise ValueError('Trying to initialize CRBM ' + name + ' which is fully connected but height and width of visible and filters are not set to 1') 
      if fully_connected and prob_maxpooling:
        raise ValueError('Trying to initialize CRBM ' + name + ' which is fully connected but with max pooling enabled (should set prob_maxpooling to False)') 
      if fully_connected and padding:
        raise ValueError('Trying to initialize CRBM ' + name + ' which is fully connected but with padding enabled (should set padding to False)') 
      if padding and ((f_height % 2 == 0) or (f_width % 2 == 0)):
        raise ValueError('Trying to initialize CRBM ' + name + ' which has padding enabled but filter dimensions are not odd (the padding feature only supports odd filter dimensions)')
      
      self.name                  = name
      self.fully_connected       = fully_connected
      self.visible_height        = v_height
      self.visible_width         = v_width
      self.visible_channels      = v_channels
      self.filter_height         = f_height
      self.filter_width          = f_width
      self.filter_number         = f_number
      self.gaussian_unit         = gaussian_unit
      if gaussian_unit:
        self.gaussian_variance   = gaussian_variance
      self.prob_maxpooling       = prob_maxpooling
      self.padding               = padding
      if padding:
        self.hidden_height       = v_height
        self.hidden_width        = v_width
      else:
        self.hidden_height       = v_height - f_height + 1
        self.hidden_width        = v_width - f_width + 1
      
      self.batch_size            =     batch_size
      self.learning_rate         =     learning_rate
      self.learning_rate_decay   =     learning_rate_decay
      self.momentum              =     momentum
      self.decay_step            =     decay_step
      self.weight_decay          =     weight_decay
      self.sparsity_target       =     sparsity_target
      self.sparsity_coef         =     sparsity_coef
    
      with tf.variable_scope(name):
         with tf.device('/cpu:0'):
          self.kernels            = tf.get_variable('weights', (f_height, f_width,v_channels,f_number), initializer=tf.truncated_normal_initializer(stddev=init_weight_stddev, dtype=tf.float32), dtype=tf.float32)
          self.biases_V           = tf.get_variable('biases_V', (v_channels), initializer=tf.constant_initializer(init_biases_V), dtype=tf.float32)
          self.biases_H           = tf.get_variable('biases_H', (f_number), initializer=tf.constant_initializer(init_biases_H), dtype=tf.float32)
          self.vitesse_kernels    = tf.get_variable('vitesse_weights', (f_height, f_width,v_channels,f_number), initializer=tf.constant_initializer(0), dtype=tf.float32)
          self.vitesse_biases_V   = tf.get_variable('vitesse_biases_V', (v_channels), initializer=tf.constant_initializer(0), dtype=tf.float32)
          self.vitesse_biases_H   = tf.get_variable('vitesse_biases_H', (f_number), initializer=tf.constant_initializer(0), dtype=tf.float32)
          if gaussian_unit:
            self.sigma              = tf.get_variable('sigma', (v_width*v_height*batch_size*v_channels), initializer=tf.constant_initializer(self.gaussian_variance * self.gaussian_variance), dtype = tf.float32)
          
    except ValueError as error:
      print('--------------------------')
      print(error.args)
      print('--------------------------')
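
  # Illustrative constructor call (a sketch, not taken from any original training
  # script): a convolutional, gaussian-visible CRBM with probabilistic max pooling
  # on 32x32 RGB inputs.
  #
  #   crbm = CRBM('layer1', fully_connected=False,
  #               v_height=32, v_width=32, v_channels=3,
  #               f_height=5, f_width=5, f_number=24,
  #               gaussian_unit=True, prob_maxpooling=True)
  #
  # With padding=False the hidden layer is (32-5+1)x(32-5+1) = 28x28 and the
  # pooling layer, if enabled, is 14x14.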
      
      
      
        

  def compute_energy(self, visible, hidden, method = 'forward'):
    """INTENT : Compute the energy of the configuration (visible,hidden) given in parameters
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    visible         :        visible layer configuration
    hidden          :        hidden layer configuration
    method          :        method to compute energy
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : The last parameter (method), which specifies how the energy is computed (forward or backward), does not really matter since both methods yield the same value
    The difference is simply the order in which the convolution and the elementwise multiplication are applied
    ------------------------------------------------------------------
    REMARK : The returned energy is divided by batch_size (i.e. per configuration) so it does not depend on the batch size"""
    
    if method == 'forward':
      if self.padding:
        conv = tf.nn.conv2d(visible, self.kernels, [1, 1, 1, 1], padding='SAME')
      else:
        conv = tf.nn.conv2d(visible, self.kernels, [1, 1, 1, 1], padding='VALID')
      operand = hidden                      
    elif method == 'backward':
      if self.padding:
        conv = tf.nn.conv2d(hidden, self._get_flipped_kernel(), [1, 1, 1, 1], padding='SAME')
      else:
        conv = tf.nn.conv2d(self._get_padded_hidden(hidden), self._get_flipped_kernel(), [1, 1, 1, 1], padding='VALID')
      operand = visible
    weight =  tf.reduce_sum(tf.mul(operand,conv)) 
    bias_H =  tf.reduce_sum(tf.mul(self.biases_H,tf.reduce_sum(hidden,[0,1,2]))) 
    
    if self.gaussian_unit: 
      'GAUSSIAN UNIT IN VISIBLE LAYER'   
      weight = tf.div(weight,self.gaussian_variance)
      bias_V = tf.reduce_sum(tf.square(tf.sub(visible,tf.reshape(self.biases_V,[1,1,1,self.visible_channels]))))
      bias_V = tf.div(bias_V,2*self.gaussian_variance*self.gaussian_variance)
      output = tf.sub(bias_V,tf.add(bias_H,weight))
    else: 
      'BINARY UNIT IN VISIBLE LAYER'        
      bias_V =  tf.reduce_sum(tf.mul(self.biases_V,tf.reduce_sum(visible,[0,1,2])))
      output =  tf.mul(-1.0,tf.add(weight,tf.add(bias_H,bias_V)))
    return tf.div(output,self.batch_size)
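
  # For reference (a restatement of the code above, with gaussian visible units,
  # where self.gaussian_variance plays the role of sigma):
  #
  #   E(v,h) = sum_(i,c) (v_ic - b_c)^2 / (2*sigma^2)
  #            - (1/sigma) * sum_k h^k . (W^k * v)
  #            - sum_k b_k * sum_(i,j) h^k_ij
  #
  # With binary visible units the quadratic term is replaced by -sum_c b_c * sum v_c
  # and the 1/sigma factor disappears; the returned value is this energy divided by
  # the batch size.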
      
        
        
        

  def infer_probability(self, operand, method, result = 'hidden'):
    """INTENT : Compute the probabily of activation of one layer given the other
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    operand         :        given layer
    method          :        which direction for inference ie forward or backward
    result          :        whether to return the hidden layer or the pooling layer activation (pooling only available if prob_maxpooling is True and method is FORWARD)
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : If method is FORWARD then compute hidden layer probability and operand is VISIBLE layer
             If method is BACKWARD then compute visible layer probability and operand is HIDDEN layer"""
    
    'Computing HIDDEN layer with VISIBLE layer given'
    'Gaussian visible or not, hidden layer activation is a sigmoid'
    if method == 'forward': 
      if self.padding:
        conv = tf.nn.conv2d(operand, self.kernels, [1, 1, 1, 1], padding='SAME')
      else:
        conv = tf.nn.conv2d(operand, self.kernels, [1, 1, 1, 1], padding='VALID')
      if self.gaussian_unit:  
        conv = tf.div(conv,self.gaussian_variance)
      bias = tf.nn.bias_add(conv, self.biases_H)
      if self.prob_maxpooling: 
        'SPECIFIC CASE where we enable probabilistic max pooling'
        exp = tf.exp(bias)
        custom_kernel = tf.constant(1.0, shape=[2,2,self.filter_number,1])
        block_sum = tf.nn.depthwise_conv2d(exp, custom_kernel, [1, 2, 2, 1], padding='VALID')
        block_sum = tf.add(1.0,block_sum)
        ret_kernel = np.zeros((2,2,self.filter_number,self.filter_number))
        for i in range(2):
          for j in range(2):
            for k in range(self.filter_number):
              ret_kernel[i,j,k,k] = 1
        custom_kernel_bis = tf.constant(ret_kernel,dtype = tf.float32)
        block_sum_full =  tf.nn.conv2d_transpose(block_sum, custom_kernel_bis, (self.batch_size,self.hidden_height,self.hidden_width,self.filter_number), strides= [1, 2, 2, 1], padding='VALID', name=None)
        if result == 'hidden': 
          'We want to obtain HIDDEN layer configuration'
          return tf.div(exp,block_sum_full)
        elif result == 'pooling': 
          'We want to obtain POOLING layer configuration'
          return tf.sub(1.0,tf.div(1.0,block_sum))
      return tf.sigmoid(bias)
    
    'Computing VISIBLE layer with HIDDEN layer given'
    'If gaussian then return the mean of the normal distribution from which visible units are drawn, the covariance matrix being self.gaussian_variance squared times the identity'
    'If not gaussian then return the binary activation probability, which is a sigmoid'
    if method == 'backward':
      if self.padding:
        conv = tf.nn.conv2d(operand, self._get_flipped_kernel(), [1, 1, 1, 1], padding='SAME')
      else:
        conv = tf.nn.conv2d(self._get_padded_hidden(operand), self._get_flipped_kernel(), [1, 1, 1, 1], padding='VALID')
      if self.gaussian_unit:       
        conv = tf.mul(conv,self.gaussian_variance)
      bias = tf.nn.bias_add(conv, self.biases_V)
      if self.gaussian_unit:       
        return bias
      return tf.sigmoid(bias)
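
  # For reference, the probabilistic max-pooling branch above implements the
  # formulas of Lee et al. (ICML 2009): for each 2x2 block B_alpha of hidden map k,
  #
  #   P(h^k_ij = 1 | v)    = exp(I(h^k_ij)) / (1 + sum_(i',j' in B_alpha) exp(I(h^k_i'j')))
  #   P(p^k_alpha = 1 | v) = 1 - 1 / (1 + sum_(i',j' in B_alpha) exp(I(h^k_i'j')))
  #
  # where I(.) is the bottom-up input computed in `bias`; the depthwise_conv2d
  # computes the 2x2 block sums and the conv2d_transpose broadcasts them back to
  # the hidden-layer resolution.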
      
        
        
        

  def draw_samples(self, mean_activation, method ='forward'):
    """INTENT : Draw samples from distribution of specified parameter
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    mean_activation         :        parameter of the distribution to draw samples from
    method                  :        which direction for drawing sample ie forward or backward
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : If FORWARD then samples for HIDDEN layer (BERNOULLI)
             If BACKWARD then samples for VISIBLE layer (BERNOULLI OR GAUSSIAN if self.gaussian_unit = True)"""

    if self.gaussian_unit and method == 'backward': 
      'In this case mean_activation is the mean of the normal distribution, the variance being self.gaussian_variance squared'
      mu = tf.reshape(mean_activation, [-1])
      dist = tf.contrib.distributions.MultivariateNormalDiag(mu, self.sigma)
      samples = dist.sample()
      return tf.reshape(samples,[self.batch_size,self.visible_height,self.visible_width,self.visible_channels])
    elif method == 'forward':
      height   =  self.hidden_height
      width    =  self.hidden_width
      channels =  self.filter_number  
    elif method == 'backward':
      height   =  self.visible_height
      width    =  self.visible_width
      channels =  self.visible_channels
    return tf.select(tf.random_uniform([self.batch_size,height,width,channels]) - mean_activation < 0, tf.ones([self.batch_size,height,width,channels]), tf.zeros([self.batch_size,height,width,channels]))
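
  # For reference: the tf.select above draws Bernoulli samples by comparing a
  # uniform [0,1) tensor against the activation probabilities, i.e. a unit is set
  # to 1 exactly when uniform < mean_activation, which happens with probability
  # mean_activation.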
      
        
      
        

  def do_contrastive_divergence(self, data, n = 1, global_step = 0):
    """INTENT : Do one step of n-contrastive divergence algorithm for leaning weight and biases of the CRBM
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    data               :        data to clamp on the visible layer
    n                  :        length of the gibbs chain
    global_step        :        how many steps have elapsed since we started contrastive divergence
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : global_step is only needed for computing the learning rate, i.e. to adjust the decay"""
    
    V0,Q0,VN,QN = self._do_gibbs_chain(data, n = n)
    
    'FOR WEIGHT'
    #[vchannels,vheight,vwidth,batch_size] conv with [hiddenhei,hiddenwidth,batch_size,hiddenchannel] gives a [vchannels,filterhei,filterwidth,hiddenchannel] filter
    if self.padding:
      positive = tf.nn.conv2d(self._get_padded_visible(tf.transpose(V0,perm=[3,1,2,0])), tf.transpose(Q0,perm=[1,2,0,3]), [1, 1, 1, 1], padding='VALID')
      negative = tf.nn.conv2d(self._get_padded_visible(tf.transpose(VN,perm=[3,1,2,0])), tf.transpose(QN,perm=[1,2,0,3]), [1, 1, 1, 1], padding='VALID')
    else:
      positive = tf.nn.conv2d(tf.transpose(V0,perm=[3,1,2,0]), tf.transpose(Q0,perm=[1,2,0,3]), [1, 1, 1, 1], padding='VALID')
      negative = tf.nn.conv2d(tf.transpose(VN,perm=[3,1,2,0]), tf.transpose(QN,perm=[1,2,0,3]), [1, 1, 1, 1], padding='VALID')
    ret = positive - negative
    if self.gaussian_unit:
      ret = tf.div(ret,self.gaussian_variance)
    g_weight          = tf.div(tf.transpose(ret,perm=[1,2,0,3]),self.batch_size)
    g_weight_sparsity = self._get_param_sparsity_penalty('kernel', Q0, V0)
    g_weight_l2       = self._get_param_weight_penalty(self.kernels)
    
    'FOR BIAS VISIBLE'
    g_biais_V = tf.div(tf.reduce_sum(tf.sub(V0,VN), [0,1,2]),self.batch_size)
    if self.gaussian_unit:
      g_biais_V = tf.div(g_biais_V,self.gaussian_variance * self.gaussian_variance)
    
    'FOR BIAS HIDDEN' 
    g_biais_H            =   tf.div(tf.reduce_sum(tf.sub(Q0,QN), [0,1,2]),self.batch_size)
    g_biais_H_sparsity   =   self._get_param_sparsity_penalty('hidden_bias', Q0, V0)
    
    'UPDATE ALL'
    ret_w  = self._apply_grad(self.kernels,  g_weight,  self.vitesse_kernels, wd = True, wd_value = g_weight_l2, sparsity = True, sparsity_value = g_weight_sparsity,  global_step = global_step)
    ret_bv = self._apply_grad(self.biases_V, g_biais_V, self.vitesse_biases_V,                                                                                         global_step = global_step)
    ret_bh = self._apply_grad(self.biases_H, g_biais_H, self.vitesse_biases_H,                                   sparsity = True, sparsity_value = g_biais_H_sparsity, global_step = global_step)
    cost = tf.reduce_sum(tf.square(tf.sub(data,VN)))
    update = tf.reduce_sum(VN)
    return ret_w, ret_bv, ret_bh, cost, update
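
  # Illustrative training-step sketch (assumed usage, not from any original
  # script; `step` stands for the caller's iteration counter):
  #
  #   data_ph  = tf.placeholder(tf.float32, [crbm.batch_size, crbm.visible_height,
  #                                          crbm.visible_width, crbm.visible_channels])
  #   step_ops = crbm.do_contrastive_divergence(data_ph, n=1, global_step=step)
  #   ...
  #   sess.run(step_ops, feed_dict={data_ph: batch})
  #
  # The returned `cost` is the squared reconstruction error of the batch and
  # `update` is the sum of the reconstructed visible layer.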
      
        
      
        

  def save_parameter(self, path, sess, step):
    """INTENT : Save parameter of the RBM
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    path        :       where to save
    sess        :       which session = context to take value from
    step        :       the training step we are at
    """
    saver = tf.train.Saver([self.kernels, self.biases_V, self.biases_H])
    return saver.save(sess, path, global_step=step)
      
        
        
        

  def load_parameter(self, path, sess):
    """INTENT : Load parameter of this RBM if the save file exist
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    path               :        where is the save file
    sess               :        the session = context in which to load the parameters"""
    
    saver = tf.train.Saver([self.kernels, self.biases_V, self.biases_H])
    return saver.restore(sess, path)
      
    
        
        

  def init_parameter(self,from_scratch = True):
    """INTENT : Return the tensorflow operation for initializing the parameter of this RBM
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    from_scratch               :        specify whether this RBM is trained from scratch (True) or restored from a save file (False), in order to adjust which variables to initialize"""
    
    if from_scratch:
      return tf.initialize_all_variables()
    elif self.gaussian_unit:
      return tf.initialize_variables([self.vitesse_kernels, self.vitesse_biases_V, self.vitesse_biases_H, self.sigma])
    else:
      return tf.initialize_variables([self.vitesse_kernels, self.vitesse_biases_V, self.vitesse_biases_H])
      
        
        
        

  def _do_gibbs_chain(self, data, n = 1):
    """INTENT : Do one chain of length n starting from input and yields V0 Q0 VN QN in order to compute gradient
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    data        :     input ie visible layer
    n           :     chain length"""
    
    V0 = data
    Q0 = self.infer_probability(data,'forward')
    ret = self.draw_samples(Q0, 'forward') 
    'ret is H0'
    for i in range(n):
      VN = self.draw_samples(self.infer_probability(ret,'backward'), 'backward')
      QN = self.infer_probability(VN,'forward')
      ret = self.draw_samples(QN, 'forward') 
      'At the end ret is HN'
    return V0,Q0,VN,QN 
      
        
      
        

  def _apply_grad(self, parameter, grad_value, vitesse, wd = False, wd_value = None, sparsity = False, sparsity_value = None, global_step = 0):
    """INTENT : Apply gradient descent to provided variable. Also modify it vitesse
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    parameter         :    variable to apply gradient descent to
    grad_value        :    value of gradient
    vitesse           :    value of vitesse (for momentum)
    wd                :    true if apply l2 norm penalty
    wd_value          :    value of l2 norm penalty
    sparsity          :    true if apply sparsity regularization
    sparsity_value    :    value of sparsity penalty
    global_step       :    how far are we in the training, required for computing learning rate decay"""
    
    lr = tf.train.exponential_decay(self.learning_rate,global_step,self.decay_step,self.learning_rate_decay,staircase=True)
    ret = tf.add(tf.mul(self.momentum,vitesse),tf.mul(lr,grad_value))
    ret = vitesse.assign(ret)
    if wd:
      ret = tf.sub(ret,tf.mul(lr,wd_value))  
    if sparsity:
      ret = tf.sub(ret,tf.mul(lr,sparsity_value))
    return parameter.assign_add(ret)
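
  # For reference, the update applied above is a momentum step (the CD gradient is
  # already oriented so that adding it moves the parameters in the right direction):
  #
  #   vitesse <- momentum * vitesse + lr * grad_value
  #   param   <- param + vitesse - lr * wd_value - lr * sparsity_value
  #
  # where the two penalty terms are only subtracted when wd / sparsity are True and
  # lr is decayed by learning_rate_decay every decay_step steps.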
      
        
      
        

  def _get_param_sparsity_penalty(self, name, Q0, V0):
    """INTENT : Compute sparsity penalty term (l2 norm of target minus mean activation)
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    name               :        parameter for which we want to compute the sparsity penalty term (hidden layer bias or kernel weights)
    Q0                 :        infered probabilities for hidden layer
    V0                 :        visible layer value (input)
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : see http://ai.stanford.edu/~ang/papers/nips07-sparsedeepbeliefnetworkv2.pdf
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : formula for the hidden bias :  -2*lambda * sum_over_height_and_width[ mean_over_batch[ (1-Q0)*Q0*(target - mean_over_batch[Q0]) ] ]
             formula for the kernel      :  -2*lambda * sum_over_height_and_width[ mean_over_batch[ (1-Q0)*Q0*(target - mean_over_batch[Q0]) * v ] ]     with v = v_{i+r-1, j+s-1}^{(l)}"""
    
    ret = -2*self.sparsity_coef/self.batch_size
    if self.gaussian_unit:
      ret = ret/self.gaussian_variance
    mean = tf.reduce_sum(Q0, [0], keep_dims = True)
    baseline = tf.mul(tf.sub(self.sparsity_target,mean),tf.mul(tf.sub(1.0,Q0),Q0))
    if name == 'hidden_bias':
      return tf.mul(ret,tf.reduce_sum(baseline, [0,1,2]))
    if name == 'kernel':
      if self.padding:
        retBis = tf.nn.conv2d(self._get_padded_visible(tf.transpose(V0,perm=[3,1,2,0])), tf.transpose(baseline,perm=[1,2,0,3]), [1, 1, 1, 1], padding='VALID')
      else:
        retBis = tf.nn.conv2d(tf.transpose(V0,perm=[3,1,2,0]), tf.transpose(baseline,perm=[1,2,0,3]), [1, 1, 1, 1], padding='VALID')
      retBis = tf.transpose(retBis,perm=[1,2,0,3])
      return tf.mul(ret,retBis)
      
        
        
        

  def _get_param_weight_penalty(self, operand):
    """INTENT : Compute l2 regularization pernalty term
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    operand               :        parameter to be regularized"""
    
    return tf.mul(self.weight_decay,operand)
      
        
        
        

  def _get_flipped_kernel(self):
    """INTENT : Not only flip kernel horizontally and vertically but also swap in and out channels"""
    
    return tf.transpose(tf.reverse(self.kernels,[True,True,False,False]),perm=[0, 1, 3, 2])
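
  # For reference: self.kernels has shape [f_height, f_width, in_channels, out_channels];
  # tf.reverse flips it along both spatial axes and the transpose swaps the channel
  # axes to [f_height, f_width, out_channels, in_channels], which is the kernel needed
  # to convolve the hidden layer back into visible space.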
      
        
        
        

  def _get_padded_hidden(self, hidden):
    """INTENT : Add padding to the hidden layer so that it can be convolved with flipped kernel and give the same size as input
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    hidden               :        hidden layer to pad
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : only used if self.padding is False because if self.padding is True then this padding is not needed"""
    
    return tf.pad(hidden,[[0,0],[self.filter_height-1,self.filter_height-1],[self.filter_width-1,self.filter_width-1],[0,0]]) 
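
  # For reference: padding the hidden layer by (filter_height-1, filter_width-1) on
  # each spatial side turns the VALID convolution with the flipped kernel into a
  # "full" convolution, which recovers the visible spatial size:
  # (v - f + 1) + 2*(f - 1) - f + 1 = v.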
      
        
        
        

  def _get_padded_visible(self, visible):
    """INTENT : Add padding to the visible layer so that it can be convolved with hidden layer to compute weight gradient update
    ------------------------------------------------------------------------------------------------------------------------------------------
    PARAMETERS :
    visible               :        visible layer to pad
    ------------------------------------------------------------------------------------------------------------------------------------------
    REMARK : only used if self.padding is True because if self.padding is False then this padding is not needed"""
    
    return tf.pad(visible,[[0,0],[np.floor((self.filter_height-1)/2).astype(int),np.ceil((self.filter_height-1)/2).astype(int)],[np.floor((self.filter_width-1)/2).astype(int),np.ceil((self.filter_width-1)/2).astype(int)],[0,0]])
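

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original training pipeline).
# It assumes the legacy TensorFlow 0.x / early 1.x graph API this module is
# written against; the shapes and hyper-parameters below are arbitrary examples.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
  # Toy convolutional CRBM on 28x28 single-channel inputs.
  crbm = CRBM('demo_crbm', fully_connected=False,
              v_height=28, v_width=28, v_channels=1,
              f_height=9, f_width=9, f_number=16,
              gaussian_unit=True, prob_maxpooling=True,
              batch_size=4)
  data = tf.placeholder(tf.float32, [4, 28, 28, 1])
  # Build the CD-1 update ops once, then run them on a (random) batch.
  train_ops = crbm.do_contrastive_divergence(data, n=1, global_step=0)
  with tf.Session() as sess:
    sess.run(crbm.init_parameter(from_scratch=True))
    batch = np.random.rand(4, 28, 28, 1).astype(np.float32)
    _, _, _, cost, _ = sess.run(train_ops, feed_dict={data: batch})
    print('reconstruction cost on a random batch:', cost)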