python source code of utils

import numpy as np
import random
import theano
import theano.tensor as T
import math

def LoadFASTAFile(seqFile):

        ##load in the seq file
        seqfh = open(seqFile, 'r')
        content = [ line.strip() for line in list(seqfh) ]
        seqfh.close()
        content2 = [ c for c in content if not c.startswith('>') and not c.startswith('#') ]
        sequence = ''.join(content2)

        return sequence


##input has a shape (batchSize, seqLen, n_in)
##this function returns a output with shape (batchSize, seqLen, seqLen, 3*n_in) such as
## output[:,i ,j, :] = input[:, (i+j)/2, :], input[:, i,:], input[:, j,:]

def MidpointFeature(input, n_in):

    seqLen = input.shape[1]
    x = T.mgrid[0:seqLen, 0:seqLen]
    y1 = x[0]
    y2 = (x[0] + x[1])/2
    y3 = x[1]

    input2 = input.dimshuffle(1, 0, 2)

    out1 = input2[y1]
    out2 = input2[y2]
    out3 = input2[y3]

    out = T.concatenate([out1, out2, out3], axis=3)
    final_out = out.dimshuffle(2, 0, 1, 3)
    n_out = 3 * n_in

    return final_out, n_out


def TestMidpointFeature():
    x = T.tensor3('x')
    y = MidpointFeature(x)
    f= theano.function([x], y)
    a = np.random.uniform(0, 1, (3, 10, 2)).astype(theano.config.floatX)
    b,c  = f(a)
    print c
    #return
    print '**********0*********'
    print a[0]
    print b[0][0]
    print '********4*******'
    print a[0]
    print b[0][4]
    print '**********9******'
    print a[0]
    print b[0][9]


def OuterConcatenate(input):
    ##an operation similar to the outer product of two vectors, but here we do concatenation instead of product of one vector with itself
    ##input has a shape (batchSize, seqLen, n_in), output has shape (batchSize, seqLen, seqLen, 2*n_in)

    seqLen = input.shape[1]
    input2 = input.dimshuffle(1, 0, 2)
    x = T.mgrid[0:seqLen, 0:seqLen]
    out = input2[x]
    output = T.concatenate((out[0], out[1]), axis=3)

    return output.dimshuffle(2, 0, 1, 3)

def TestOuterConcatenate():

    x = T.tensor3('x')
    f = theano.function([x], OuterConcatenate(x))

    a = np.random.uniform(0, 1, (2, 10, 2)).astype(np.float32)
    print a[0]
    print f(a)[0]

##pretty print a 2D array or array_like objects such as list and tuple
def PrettyPrint(count):
	for row in count:
		line = ' '.join( '%.6f' % v for v in row)
		print line


##calculate the row-wise outer product of two matrix
## A has shape n*m, B has shape n*l, the result has shape n*(m*l)
def RowWiseOuterProduct(A, B):
    a = A[:, :, np.newaxis ]
    b = B[:, np.newaxis, : ]
    c = a * b
    return c.reshape( (A.shape[0], A.shape[1] * B.shape[1]) )

##sample a bounding box from a distance matrix. Currently only square box is sampled.
def SampleBoundingBox(original_shape, sizelimit):

    assert original_shape[0] == original_shape[1]
    #sample a bounding box such that its size is <= sizelimit
    if np.prod(original_shape) <= sizelimit:
	return np.array([0, 0, original_shape[0], original_shape[1] ]).astype(np.int32)

    if (sizelimit < 20*np.array(original_shape) ).any():
	print 'sizelimit is too small or the orignal shape is too big!'
	sys.exit(-1)

    """	
    nRows_min = int(round( (sizelimit+original_shape[1]-1)/original_shape[1] ) )
    nRows_max = original_shape[0]

    nRows = random.randrange(nRows_min, nRows_max)
    nCols = int(round( sizelimit/nRows  ) )
    """
    ## due to the difficulty in dealing with 2d mask, currently the sampled bounding box has to be a square and share the same diagonal with the orignal matrix
    nRows = int( round( math.sqrt(sizelimit) ) )
    nCols = nRows

    top = random.randrange(0, original_shape[0] - nRows + 1)
    bottom = top + nRows
    left = top
    right = left + nCols
    return np.array([top, left, bottom, right]).astype(np.int32)
	
    
##check if two lists have the same shape
## list1 contains a set of theano shared variables
## list2 contains a set of np.ndarray
def Compatible(list1, list2):
    if len(list1) != len(list2):
	return False

    for l1, l2 in zip(list1, list2):
	if type(l1.get_value()) != type(l2):
		return False

	if np.isscalar(l1.get_value()):
	    continue

	if l1.get_value().shape != l2.shape:
	    return False

    return True

##generate the tile of a small tensor x, the first 2 dims will be expanded
## x is a small matrix or tensor3 to be tiled, y is a tuple of 2 elements
## This function generates a tile of x by copying it y*y times
## The resultant matrix shall have dimension ( x.shape[0]*y, x.shape[1]*y), consisting of y*y copies of x

def MyTile(x, y):
    p = T.mgrid[ 0: x.shape[0] * y[0], 0: x.shape[1] * y[1] ]
    q0 = p[0] % x.shape[0]
    q1 = p[1] % x.shape[1]
    z = x[(q0, q1)]
    return z

def TestMyTile():
    x = T.matrix('x')
    y = T.iscalar('y')
    z = MyTile(x, (y, y))

    f=theano.function([x, y], z)
    a = np.array([ [11, 12, 13, 14], [21, 22, 23, 24], [31, 32, 33, 34] ] ).astype(theano.config.floatX)
    b = 3
    c = f(a, b)

    print a
    print c
    print c.shape

    x3d = T.tensor3('x3d')
    g = theano.function([x3d, y], MyTile(x3d, (y, y)) )
    a = np.random.uniform(0, 1, (3, 3, 2)).astype(theano.config.floatX)
    c = g(a, b)
    print c.shape
    print c[:,:,0]
    print c[:,:,1]


##repeat each element in x by reps at the axis given by axes
def MyRepeat(x, reps, axes):
    assert len(reps) == len(axes)
    y = x
    for r, a in zip(reps, axes):
	y = T.repeat(y, [r], axis=a)
    return y


## Replace each vector (the last dim) of x by the linear combination of this vector and patterns 
## x is tensor4 with shape (bSize, nRows, nCols, n_in) where n_in = pshape[0]
## x can be interpreted as a set of reduced contact maps, where each contact map has size (nRows, nCols)

## the resultant matrix shall have shape (batchSize, nRows * pshape[1], nCols* pshape[2], 2), indicating the predicted prob of contacts and non-contacts

def ExpandBy3dPattern(x, patterns):

    ## patterns is a binary tensor3 with shape (numPatterns, nRows, nCols), where the 2nd and 3rd dims indicate the size of a pattern
    ## for each pattern element, 1 indicates a non-contact while  0 indicates a contact
    pshape = patterns.shape

    ## y1 has shape (batchSize, nRows * pshape[1], nCols * pshape[2], pshape[0])
    y1 = MyRepeat(x, (pshape[1], pshape[2]), axes=[1, 2])

    ##expand each pattern to a big matrix
    ## expandedPatterns has shape (1, nRows * pshape[1], nCols * pshape[2], pshape[0])
    #expandedPatterns = MyTile(patterns.dimshuffle(1, 2, 0), (x.shape[1], x.shape[2]) ).dimshuffle('x', 0, 1, 2)
    expandedPatterns = T.tile(patterns, (1, x.shape[1], x.shape[2]) ).dimshuffle('x', 1, 2, 0)


    ## calculate linear combination and the prob of non-contacts
    y2 = T.mul( y1, expandedPatterns)
    y3 = T.sum( y2, axis=3, keepdims=True)

    ##calculate the prob of contacts
    y4 = 1 - y3

    ##both y3 and y4 have shape (bSize, nRows*pshape[1], nCols*pshape[2])
    ## y3 contains the probability of non-contacts
    ## y4 contains the probability of contacts
    return T.concatenate([y4, y3], axis=3)

#this function returns a tensor with shape (bSize, nRows*nPatternRows, nCols*nPatternCols, numLabels)
def ExpandBy4dPattern(x, patterns):
    ##patterns has shape (numPatterns, nPatternRows, nPatternCols, numLabels)
    ##each element is between 0 and 1 and the sum of the vector patterns[i, j, k, :] is equal to 1
    pshape = patterns.shape

    ## y1 has shape (batchSize, nRows * pshape[1], nCols * pshape[2], pshape[0])
    y1 = MyRepeat(x, (pshape[1], pshape[2]), axes=[1, 2])
    expandedPatterns = T.tile(patterns, (1, x.shape[1], x.shape[2], 1) ).dimshuffle('x', 1, 2, 0, 3)

    ylist = []
    for i in xrange(pshape[3]):
	y2 = T.mul( y1, expandedPatterns[:, :, :, :, i] )
	y3 = T.sum( y2, axis=3, keepdims=True)
	ylist.append(y3)
    return T.concatenate( ylist, axis=3)

def ExpandByPattern(x, patterns):
    if patterns.ndim == 3:
	return ExpandBy3dPattern(x, patterns)
    elif patterns.ndim == 4:
	return ExpandBy4dPattern(x, patterns)
    else:
	print 'unsupported ndim of patterns: ', patterns.ndim
	sys.exit(-1)

## x has shape (bSize, nRows, nCols, numPatterns)
## patterns has shape (numPatterns, patternshape, numLabels)
def ConvByPattern(x, patterns, mask=None):
    W = np.transpose(patterns, (3, 0, 1, 2))
    out2 = T.nnet.conv2d(x.dimshuffle(0, 3, 1, 2), W, filter_shape=W.shape, border_mode='half')
    if mask is not None:
        ## mask has shape (batchSize, #rows_to_be_masked, nCols)

        ## a subtensor of out2 along the horiz direction
        out2_sub_horiz = out2[:, :, :mask.shape[1], :]
        mask_horiz = mask.dimshuffle(0, 'x', 1, 2)
        out3 = T.set_subtensor(out2_sub_horiz, T.mul(out2_sub_horiz, mask_horiz) )

        ## a subtensor of out3 along the vertical direction
        out3_sub_vertical = out3[:, :, :, :mask.shape[1] ]
        mask_vertical = mask.dimshuffle(0, 'x', 2, 1)
        y = T.set_subtensor(out3_sub_vertical, T.mul(out3_sub_vertical, mask_vertical) )
    else:
	y = out2

    y = y.dimshuffle(0, 2, 3, 1)

    return y/np.prod(patterns.shape[1:3])

  
def TestConvByPattern():
    x = T.tensor4('x')
    
    numPatterns = 10
    psize1 = 3
    psize2 = 3
    numLabels = 3
    bSize = 2
    nRows = 20
    nCols = 20

    pshape = (numPatterns, psize1, psize2, numLabels)
    patterns = np.random.uniform(0, 1, pshape).astype(theano.config.floatX)
    psum = np.sum(patterns, axis=3, keepdims=True)
    patterns = patterns / psum

    f = theano.function([x], ConvByPattern(x, patterns) )

    xshape = (bSize, nRows, nCols, numPatterns)
    a = np.random.uniform(0, 1, xshape).astype(theano.config.floatX)
    asum = np.sum(a, axis=3, keepdims=True)
    a = a / asum

    b = f(a)
    print b.shape
    print b[0][1][5]
    print b[1][3][3]

    print np.sum(b, axis=3)
    


def TestExpandByPattern():
    x = T.tensor4('x')
    pa = np.array([ [ 0, 0, 0], [ 0, 0, 1], [ 0, 1, 0] ]).astype(np.float32)
    pb = np.array([ [ 0, 1, 0], [ 0, 1, 1], [ 0, 1, 1] ]).astype(np.float32)
    pc = np.array([ [ 1, 1, 0], [ 1, 1, 1], [ 1, 0, 1] ]).astype(np.float32)
    pd = np.array([ [ 1, 1, 0], [ 1, 1, 1], [ 1, 0, 1] ]).astype(np.float32)
    pe = np.array([ [ 1, 1, 0], [ 1, 1, 1], [ 1, 0, 1] ]).astype(np.float32)

    patterns = np.array([ pa, pb, pc, pd, pe])

    pshape = patterns.shape

    ashape = (2, 4, 4, pshape[0])

    f = theano.function([x], ExpandByPattern(x, patterns) )

    a = np.random.uniform(0, 1, ashape).astype(theano.config.floatX)

    ##normalize a so that the sum of its last dim is 1
    b = np.sum(a, axis=3, keepdims=True)
    a = a/b

    b = f(a)
    print b
    print b.shape

    patterns = np.random.uniform(0, 1, (5, 3, 3, 3)).astype(np.float32)
    p2 = np.sum(patterns, axis=3, keepdims=True)
    p3 = patterns / p2
    f = theano.function([x], ExpandByPattern(x, p3) )
    b = f(a)
    print b
    print b.shape

def TestMyRepeat():

    x=T.tensor4('x')
    y=MyRepeat(x, (2, 2), axes=[1, 2])
    f=theano.function([x], y)

    a=np.random.uniform(0,1,(2, 3, 3, 2)).astype(theano.config.floatX)
    b=f(a)

    print a
    print b

def TestStack():
    x = T.matrix('x')
    y = T.matrix('y')
    z = T.matrix('z')

    f = theano.function([x, y, z], T.stacklists([ x, y, z]) )

    a = np.ones((5,4), dtype=np.float32)
    b = np.ones((5,4), dtype=np.float32)
    c = np.ones((5,4), dtype=np.float32)

    d = f(a, b, c)
    print d.shape
    print d


if __name__ == "__main__":
    #TestMyRepeat()
    #TestStack()
    #TestMyTile()
    #TestExpandByPattern()
    TestConvByPattern()