import numpy as np import cPickle import os import os.path import sys import time import datetime import random import gzip import theano from utils import RowWiseOuterProduct, SampleBoundingBox import DistanceUtils import config from config import Response2LabelName, Response2LabelType ##this file contains some functions for data processing def PriorDistancePotential(sequence=None, paramfile=None): ##add pairwise distance potential here ## an example paramfile is data4contact/pdb25-pair-dist.pkl if not os.path.isfile(paramfile): print 'cannot find the parameter file: ', paramfile exit(-1) fh = open(paramfile,'rb') potential = cPickle.load(fh)[0].astype(np.float32) fh.close() assert (len(potential.shape) == 4) potentialFeature = np.zeros((len(sequence), len(sequence), potential.shape[-1]), dtype=theano.config.floatX) ##convert AAs to integers ids = [ ord(AA) - ord('A') for AA in sequence ] ##the below procedure is not very effective. What we can do is to generate a full matrix of only long-range potential using OuterConcatenate and np.choose ##and then using the np.diagonal() function to replace near-, short- and medium-range potential in the full matrix for i, id0 in zip(xrange(len(ids)), ids): for j, id1 in zip(xrange(i+1, len(ids)), ids[i+1:]): if j-i<6: sepIndex = 0 elif j-i < 12: sepIndex = 1 elif j-i < 24: sepIndex = 2 else: sepIndex = 3 if id0 <=id1: potentialFeature[i][j]=potential[sepIndex][id0][id1] else: potentialFeature[i][j]=potential[sepIndex][id1][id0] potentialFeature[j][i]=potentialFeature[i][j] return potentialFeature ##d is a dictionary for a protein def LocationFeature(d): ##add one specific location feature here, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0 ) posFeature = np.ones_like(d['ccmpredZ']).astype(theano.config.floatX) separation_cutoff = 30 end = min(separation_cutoff - 1, posFeature.shape[0]) for offset in xrange(0, end): i = np.arange(0, posFeature.shape[0]-offset) j = i + offset posFeature[i, j] = offset/(1. * separation_cutoff) for offset in xrange(1, end): i = np.arange(offset, posFeature.shape[0]) j = i - offset posFeature[i, j] = offset/(1. * separation_cutoff) return posFeature def CubeRootFeature(d): ##the cube root of the difference between the two positions, the radius of protein is related to this feature seqLen = len(d['sequence']) feature = [] for i in range(seqLen): dVector = [ abs(j-i) for j in range(seqLen) ] feature.append(dVector) posFeature = np.cbrt( np.array( feature ).astype(theano.config.floatX) ) return posFeature ## load native distance matrix from a file def LoadNativeDistMatrixFromFile(filename): if not os.path.isfile(filename): print 'WARNING: cannot find the native distance matrix file: ', filename exit(-1) fh = open(filename, 'rb') distMatrix = cPickle.load(fh) fh.close() return distMatrix ## load native distance matrix by protein name and location def LoadNativeDistMatrix(name, location='pdb25-7952-atomDistMatrix/'): filename = os.path.join(location, name+'.atomDistMatrix.pkl') if not os.path.isfile(filename): print 'WARNING: cannot find the native distance matrix file: ', filename return None fh = open(filename, 'rb') distMatrix = cPickle.load(fh) fh.close() return distMatrix def LoadDistanceFeatures(files=None, modelSpecs=None, forTrainValidation=True): if files is None or len(files)==0: print 'the feature file is empty' exit(-1) fhs = [ open(file, 'rb') for file in files ] data = sum([ cPickle.load(fh) for fh in fhs ], []) [ fh.close() for fh in fhs ] ## each protein has sequential and pairwise features as input and distance matrix as label proteinFeatures=[] counter = 0 for d in data: oneprotein = dict() oneprotein['name'] = d['name'] ## convert the primary sequence to a one-hot encoding oneHotEncoding = config.SeqOneHotEncoding(d['sequence']) ## prepare features for embedding. Currently we may embed a pair of residues or a pair of residue+secondary structure if config.EmbeddingUsed(modelSpecs): if modelSpecs['seq2matrixMode'].has_key('Seq+SS'): embedFeature = RowWiseOuterProduct(oneHotEncoding, d['SS3']) else: embedFeature = oneHotEncoding oneprotein['embedFeatures'] = embedFeature ##collecting sequential features... seqMatrices = [ oneHotEncoding ] ## 3-state secondary structure shall always be placed before the other features, why? if modelSpecs.has_key('UseSS') and (modelSpecs['UseSS'] is True ): seqMatrices.append( d['SS3'] ) if modelSpecs.has_key('UseACC') and (modelSpecs['UseACC'] is True ) : seqMatrices.append( d['ACC'] ) if modelSpecs.has_key('UsePSSM') and (modelSpecs['UsePSSM'] is True ) : seqMatrices.append( d['PSSM'] ) if modelSpecs.has_key('UseDisorder') and modelSpecs['UseDisorder'] is True: seqMatrices.append( d['DISO'] ) ##membrane protein specific features useMPSpecificFeatures = modelSpecs.has_key('UseMPSpecificFeatures') and (modelSpecs['UseMPSpecificFeatures'] is True) if useMPSpecificFeatures: if d.has_key('MemAcc'): seqMatrices.append(d['MemAcc']) else: print 'The data does not have a feature called MemAcc' exit(-1) if d.has_key('MemTopo'): seqMatrices.append(d['MemTopo']) else: print 'The data does not have a feature called MemTopo' exit(-1) ## Add sequence-template similarity score here. This is used to predict distance matrix from a sequence-template alignment. ## this is mainly used for homology modeling if modelSpecs.has_key('UseTemplate') and modelSpecs['UseTemplate']: #print 'Using template similarity score...' if not d.has_key('tplSimScore'): print 'the data has no key tplSimScore, which is needed since you specify to use template information' exit(-1) if d['tplSimScore'].shape[1] != 11: print 'The number of features for query-template similarity shall be equal to 11' exit(-1) seqMatrices.append( d['tplSimScore'] ) seqFeature = np.concatenate( seqMatrices, axis=1).astype(np.float32) ##collecting pairwise features... pairfeatures = [] ##add one specific location feature here, i.e., posFeature[i, j]=min(1, abs(i-j)/30.0 ) posFeature = LocationFeature(d) pairfeatures.append(posFeature) cbrtFeature = CubeRootFeature(d) pairfeatures.append(cbrtFeature) if modelSpecs.has_key('UseCCM') and (modelSpecs['UseCCM'] is True ) : if not d.has_key('ccmpredZ'): print 'Something must be wrong. The data for protein ', d['name'], ' does not have the normalized ccmpred feature!' exit(-1) pairfeatures.append( d['ccmpredZ'] ) """ ##add pairwise distance potential here, we depreciate this to save memory and speed up if modelSpecs['UsePriorDistancePotential'] is True: INSTALLDIR = os.getenv('DL4DistancePredHome') if INSTALLDIR is None: print 'please set the environment variable DL4DistancePredHome as the installation directory of the contact prediction program' sys.exit(-1) potentialFeature = PriorDistancePotential(sequence=d['sequence'].upper(), paramfile=os.path.join(INSTALLDIR, 'data4contact/pdb25-pair-dist.pkl') ) pairfeatures.append(potentialFeature) """ if modelSpecs['UsePSICOV'] is True: pairfeatures.append(d['psicovZ']) if modelSpecs.has_key('UseOtherPairs') and (modelSpecs['UseOtherPairs'] is True ): pairfeatures.append( d['OtherPairs'] ) ##add template-related distance matrix. This code needs modification later ## somewhere we shall also write code to add template-related sequential features such as secondary structure? if modelSpecs.has_key('UseTemplate') and modelSpecs['UseTemplate']: #print 'Using template distance matrix...' if not d.has_key('tplDistMatrix'): print 'the data for ', d['name'], ' has no tplDistMatrix, which is needed since you specify to use template information' exit(-1) ## Check to make sure that we use exactly the same set of inter-atom distance information from templates ## currently we do not use HB and Beta information from template apts = d['tplDistMatrix'].keys() assert ( set(apts) == set(config.allAtomPairTypes) ) ##assert ( set(apts) == set(config.allAtomPairTypes) or set(apts)==set(config.allLabelNames) ) tmpPairFeatures = dict() for apt, tplDistMatrix in d['tplDistMatrix'].items(): ##use one flagMatrix to indicate which entries are invalid (due to gaps or disorder) since they shall be same regardless of atom pair type if apt == 'CaCa': flagMatrix = np.zeros_like(tplDistMatrix) np.putmask(flagMatrix, tplDistMatrix < 0, 1) pairfeatures.append(flagMatrix) """ if apt == 'HB' or apt =='Beta': ## in this case, tplDistMatrix itself is a binary matrix. This kind of information is not used pairfeatures.append(tplDistMatrix) continue """ strengthMatrix = np.copy(tplDistMatrix) np.putmask(strengthMatrix, tplDistMatrix < 3.5, 3.5) np.putmask(strengthMatrix, tplDistMatrix < -0.01, 50) strengthMatrix = 3.5 / strengthMatrix if config.InTPLMemorySaveMode(modelSpecs): tmpPairFeatures[apt] = [ strengthMatrix ] else: tmpPairFeatures[apt] = [ strengthMatrix, np.square(strengthMatrix) ] ## here we add the tmpPairFeatures to pairfeatures in a fixed order. This can avoid errors introduced by different ordering of keys in a python dict() structure ## python of different versions may have different ordering of keys in dict() ? pairfeatures.extend( tmpPairFeatures['CbCb'] ) pairfeatures.extend( tmpPairFeatures['CgCg'] ) pairfeatures.extend( tmpPairFeatures['CaCg'] ) pairfeatures.extend( tmpPairFeatures['CaCa'] ) pairfeatures.extend( tmpPairFeatures['NO'] ) if config.InTPLMemorySaveMode(modelSpecs): matrixFeature = np.dstack( tuple(pairfeatures) ).astype(np.float32) else: matrixFeature = np.dstack( tuple(pairfeatures) ) #print 'matrixFeature.shape: ', matrixFeature.shape oneprotein['sequence'] = d['sequence'] oneprotein['seqLen'] = seqFeature.shape[0] oneprotein['seqFeatures'] = seqFeature oneprotein['matrixFeatures'] = matrixFeature ##collecting labels... if d.has_key('atomDistMatrix'): atomDistMatrix = d['atomDistMatrix'] oneprotein['atomLabelMatrix'] = dict() for response in modelSpecs['responses']: responseName = Response2LabelName(response) labelType = Response2LabelType(response) if not atomDistMatrix.has_key(responseName): print 'In the raw feature data, ', d['name'], ' does not have matrix for ', responseName exit(-1) ## atomDistMatrix is the raw data, so it does not have information about labelType distm = atomDistMatrix[responseName] if labelType.startswith('Discrete'): subType = labelType[len('Discrete'): ] ## no need to discretize for HB and Beta-Pairing since they are binary matrices if responseName.startswith('HB') or responseName.startswith('Beta'): oneprotein['atomLabelMatrix'][response] = distm ## process the other atom pairs such as Cb-Cb, Ca-Ca else: labelMatrix, _, _ = DistanceUtils.DiscretizeDistMatrix(distm, config.distCutoffs[subType], subType.endswith('Plus') ) oneprotein['atomLabelMatrix'][response] = labelMatrix elif labelType.startswith('LogNormal'): labelMatrix = DistanceUtils.LogDistMatrix(distm) oneprotein['atomLabelMatrix'][response] = labelMatrix elif labelType.startswith('Normal'): oneprotein['atomLabelMatrix'][response] = distm else: print 'unsupported response: ', res exit(-1) elif forTrainValidation: print 'atomic distance matrix is needed for the training and validation data' exit(-1) ##at this point, finish collecting features and labels for one protein proteinFeatures.append(oneprotein) counter += 1 if (counter %500 ==1): print 'assembled features and labels for ', counter, ' proteins.' """ tmpfile = open(files[0] + '.contactInput.pkl', 'wb') cPickle.dump(proteinFeatures, tmpfile, protocol = cPickle.HIGHEST_PROTOCOL) tmpfile.close() """ return proteinFeatures ##this function calculates the label distribution of the training proteins and then label weight for long-, medium-, short- and near-range labels ## to assign weight to a specific label matrix, please use another function CalcLabelWeightMatrix() ## data is the trainData generated by LoadDistanceFeatures() and it has already had labels assigned ## the weight factor for a continuous distance label (i.e., regression) is a 4*1 matrix ## the weight factor for a discrete distance label (i.e., classificaton) is a 4*numLabels matrix def CalcLabelDistributionAndWeight(data=None, modelSpecs=None): ## weight for different ranges (long, medium, short, and near-ranges) if not modelSpecs.has_key('weight4range'): modelSpecs['weight4range'] = np.array([3., 2.5, 1., 0.5]).reshape((4,1)).astype(np.float32) else: modelSpecs['weight4range'].reshape((4,1)).astype(np.float32) print 'weight for range: ', modelSpecs['weight4range'] ## weight for 3C, that is, three distance intervals, 0-8, 8-15, and > 15 if modelSpecs.has_key('LRbias'): modelSpecs['weight4Discrete3C']= np.multiply(config.weight43C[modelSpecs['LRbias'] ], modelSpecs['weight4range']) else: modelSpecs['weight4Discrete3C']= np.multiply(config.weight43C['mid'], modelSpecs['weight4range']) print 'LRbias= ', modelSpecs['LRbias'], 'weight43C= ', modelSpecs['weight4Discrete3C'] ## weight for 2C modelSpecs['weight4HB_Discrete2C'] = np.multiply(config.weight4HB2C, modelSpecs['weight4range']) modelSpecs['weight4Beta_Discrete2C'] = np.multiply(config.weight4Beta2C, modelSpecs['weight4range']) ## weight for real value modelSpecs['weight4continuous'] = np.multiply(np.array([1.] * 4).reshape((4, 1)).astype(np.float32), modelSpecs['weight4range']) ## collect all discrete label matrices allLabelMatrices = dict() for response in modelSpecs['responses']: name = Response2LabelName(response) labelType = Response2LabelType(response) if labelType.startswith('LogNormal') or labelType.startswith('Normal'): continue allLabelMatrices[response] = [ d['atomLabelMatrix'][response] for d in data ] ## calculate the discrete label distribution allRefProbs = dict() for response in modelSpecs['responses']: name = Response2LabelName(response) labelType = Response2LabelType(response) if labelType.startswith('LogNormal') or labelType.startswith('Normal'): allRefProbs[response] = np.array([1.] * 4).reshape((4, 1)).astype(np.float32) continue if modelSpecs.has_key('UseBoundingBox4RefProbs') and (modelSpecs['UseBoundingBox4RefProbs'] is True): ## here we sample a sub label matrix using BoundingBox to account for the real training scenario newLabelMatrices = [] for lMatrix in allLabelMatrices[response]: bounds = SampleBoundingBox( (lMatrix.shape[0], lMatrix.shape[1]), modelSpecs['maxbatchSize'] ) new_lMatrix = lMatrix[ bounds[0]:bounds[2], bounds[1]:bounds[3] ].astype(np.int32) newLabelMatrices.append(new_lMatrix) allRefProbs[response] = DistanceUtils.CalcLabelProb(data = newLabelMatrices, numLabels = config.responseProbDims[labelType]) else: allRefProbs[response] = DistanceUtils.CalcLabelProb(data = [ m.astype(np.int32) for m in allLabelMatrices[response] ], numLabels = config.responseProbDims[labelType]) modelSpecs['labelRefProbs'] = allRefProbs ##for discrete labels, we calculate their weights by inferring from the weight intialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier modelSpecs['weight4labels'] = dict() for response in modelSpecs['responses']: name = Response2LabelName(response) labelType = Response2LabelType(response) if labelType.startswith('LogNormal') or labelType.startswith('Normal'): ## just need to assign range weight modelSpecs['weight4labels'][response] = modelSpecs['weight4continuous'] continue if labelType.startswith('Discrete'): subType = labelType[ len('Discrete'): ] ## if the response is for HB and BetaPairing if subType.startswith('2C'): modelSpecs['weight4labels'][response] = modelSpecs['weight4' + response] continue ## if the response is 3C for normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO if subType.startswith('3C'): modelSpecs['weight4labels'][response] = modelSpecs['weight4Discrete3C'] continue ## calculate label weight for 12C, 25C, and 52C for the normal atom pairs such as Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO modelSpecs['weight4labels'][response] = DistanceUtils.CalcLabelWeight(modelSpecs['weight4Discrete3C'], allRefProbs[response], config.distCutoffs[subType] ) continue print 'unsupported response in CalcLabelDistributionAndWeight: ', response exit(-1) return modelSpecs['labelRefProbs'], modelSpecs['weight4labels'] ## this function calculates the label weight matrix for a specific label matrix ## the same label may have different weights depending on if a residue pair is in near-range, short-range, medium-range or long-range. ## labelMatrices is a dictionary and has an entry for each response. ## This function returns a dictionary object for labelWeightMatrix def CalcLabelWeightMatrix(LabelMatrix=None, modelSpecs=None): if LabelMatrix is None: return None M1s = np.ones_like(LabelMatrix.values()[0], dtype=np.int16) np.fill_diagonal(M1s, 0) LRmask = np.triu(M1s, 24) + np.tril(M1s, -24) MLRmask = np.triu(M1s, 12) + np.tril(M1s, -12) SMLRmask = np.triu(M1s, 6) + np.tril(M1s, -6) SRmask = SMLRmask - MLRmask MRmask = MLRmask - LRmask NRmask = M1s - SMLRmask for response in modelSpecs['responses']: if not modelSpecs['weight4labels'].has_key(response): print 'Cannot find the weight factor tensor for response ', response exit(-1) ##the below procedure is not very effective. We shall improve it later. labelWeightMatrices = dict() for response in modelSpecs['responses']: ##name = Response2LabelName(response) labelType = Response2LabelType(response) labelWeightMatrices[response] = np.zeros_like(LabelMatrix[response], dtype=theano.config.floatX) ## wMatrix is a matrix with dimension 4 * numLabels wMatrix = modelSpecs['weight4labels'][response] wMatrixShape = wMatrix.shape assert (wMatrixShape[0] == 4) if labelType.startswith('Normal') or labelType.startswith('LogNormal'): ## if the label is real value, then for each range, there is only a single weight for all the possible values tmpWeightMatrices = [] for i in range(4): tmp = wMatrix[i][ M1s ] ## set the weight of the entries without valid distance to 0. An invalid entry in the label matrix is indicated by a negative value,e.g., -1 np.putmask(tmp, LabelMatrix[response] < 0, 0 ) tmpWeightMatrices.append(tmp) else: tmpWeightMatrices = [ wMatrix[i][LabelMatrix[response]] for i in range(4) ] LRw, MRw, SRw, NRw = tmpWeightMatrices labelWeightMatrices[response] += (LRmask * LRw + MRmask* MRw + SRmask * SRw + NRmask * NRw) return labelWeightMatrices """ ## need revision ##hopefully this is a better implementation of CalcLabWeightMatrix, but needs test for efficiency and correctness def CalcLabelWeightMatrix2(LabelMatrix=None, modelSpecs=None): if LabelMatrix is None: return None labelType = modelSpecs['distLabelType'] if not modelSpecs.has_key('weight4' + labelType): print 'Cannot find the label weight for the label type ', labelType, '. Please make sure that it has been generated already.' exit(-1) ##wMatrices is a dictionary. Each item is a matrix of size 4*numLabels for each atomPairType. wMatrices = modelSpecs['weight4' + labelType] ##the below procedure is not very effective. We shall improve it later. labelWeightMatrices = dict() for apt in modelSpecs['atomPairTypes']: size = LabelMatrix[apt].shape ## for long-range weight labelWeightMatrices[apt] = np.choose(LabelMatrix[apt], wMatrices[apt][0]) ## set the diagonal line to 0 np.fill_diagonal(labelWeightMatrices[apt], 0) ##for medium-range weight for offset in np.arange(23, 11, -1): i = np.arange(0, size[0]-offset) j = i + offset labelWeightMatrices[apt][ i, j ] = np.choose(LabelMatrix[apt].diagonal(offset), wMatrices[apt][1] ) for offset in np.arange(-23, -11, 1): i = np.arange(-offset, size[0]) j = i + offset labelWeightMatrices[apt][ i, j ] = np.choose(LabelMatrix[apt].diagonal(offset), wMatrices[apt][1] ) ##for short-range weight for offset in np.arange(11, 5, -1): i = np.arange(0, size[0]-offset) j = i + offset labelWeightMatrices[apt][ i, j ] = np.choose(LabelMatrix[apt].diagonal(offset), wMatrices[apt][2] ) for offset in np.arange(-11,-5, 1): i = np.arange(-offset, size[0]) j = i + offset labelWeightMatrices[apt][ i, j ] = np.choose(LabelMatrix[apt].diagonal(offset), wMatrices[apt][2] ) if modelSpecs['rangeMode'] != 'All': for offset in np.arange(5, 1, -1): np.fill_diagonal(labelWeightMatrics[apt][0:size[0]-offset, offset:size[1]], 0) for offset in np.arange(1, 5, 1): np.fill_diagonal(labelWeightMatrics[apt][offset:size[0], 0:size[1]-offset ], 0) continue ##for near-range weight for offset in np.arange(6, 1, -1): i = np.arange(0, size[0]-offset) j = i + offset labelWeightMatrices[apt][ i, j ] = np.choose(LabelMatrix[apt].diagonal(offset), wMatrices[apt][3] ) for offset in np.arange(-6,-1, 1): i = np.arange(-offset, size[0]) j = i + offset labelWeightMatrices[apt][ i, j ] = np.choose(LabelMatrix[apt].diagonal(offset), wMatrices[apt][3] ) return labelWeightMatrices """ ## this function prepares one batch of data for training, validation and test ## data is a list of protein features and possibly labels, generated by LoadDistanceFeatures def AssembleOneBatch( data, modelSpecs ): if not data: print 'WARNING: the list of data is empty' return None numSeqs = len(data) seqLens = [ d['seqLen'] for d in data ] maxSeqLen = max( seqLens ) minSeqLen = min( seqLens ) #print 'maxSeqLen= ', maxSeqLen, 'minSeqLen= ', minSeqLen X1d = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['seqFeatures'].shape[1] ), dtype = theano.config.floatX) X2d = np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen, data[0]['matrixFeatures'].shape[2] ), dtype = theano.config.floatX) X1dem = None if data[0].has_key('embedFeatures'): X1dem = np.zeros(shape=(numSeqs, maxSeqLen, data[0]['embedFeatures'].shape[1] ), dtype = theano.config.floatX) ## Y shall be a list of 3D matrices, each for one atom type. Need to revise dtype for Y Y = [] if data[0].has_key('atomLabelMatrix'): for response in modelSpecs['responses']: labelType = Response2LabelType(response) dataType = np.int16 if not labelType.startswith('Discrete'): dataType = theano.config.floatX rValDims = config.responseValueDims[labelType] if rValDims == 1: Y.append( np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen), dtype = dataType ) ) else: Y.append( np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen, nValDims), dtype = dataType ) ) ## when Y is empty, weight is useless. So When Y is None, weight shall also be None weightMatrix = [] if Y and modelSpecs['UseSampleWeight']: weightMatrix = [ np.zeros(shape=(numSeqs, maxSeqLen, maxSeqLen), dtype = theano.config.floatX) ] * len( modelSpecs['responses'] ) ## for mask M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen ), dtype=np.int8 ) M2d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen, maxSeqLen ), dtype=np.int8 ) for j in range(len(data) ): seqLen = data[j]['seqLen'] X1d[j, maxSeqLen - seqLen :, : ] = data[j]['seqFeatures'] X2d[j, maxSeqLen - seqLen :, maxSeqLen - seqLen :, : ] = data[j]['matrixFeatures'] M1d[j, maxSeqLen - seqLen : ].fill(1) M2d[j, maxSeqLen - seqLen :, maxSeqLen - seqLen : ].fill(1) if X1dem is not None: X1dem[j, maxSeqLen - seqLen :, : ] = data[j]['embedFeatures'] if Y: for y, response in zip(Y, modelSpecs['responses']): if len(y.shape) == 3: y[j, maxSeqLen-seqLen :, maxSeqLen-seqLen : ] = data[j]['atomLabelMatrix'][response] else: y[j, maxSeqLen-seqLen :, maxSeqLen-seqLen:, ] = data[j]['atomLabelMatrix'][response] if weightMatrix: ## we calculate the labelWeightMatrix here labelWeightMatrix = CalcLabelWeightMatrix(data[j]['atomLabelMatrix'], modelSpecs) for w, at in zip( weightMatrix, modelSpecs['responses']): w[j, maxSeqLen - seqLen :, maxSeqLen - seqLen : ] = labelWeightMatrix[at] onebatch = [X1d, X2d, M1d, M2d] if X1dem is not None: onebatch.append(X1dem) onebatch.extend(Y) onebatch.extend(weightMatrix) return onebatch ##split data into minibatch, each minibatch numDataPoints data points def SplitData2Batches(data=None, numDataPoints=1000000, modelSpecs=None): if data is None: print 'Please provide data for process!' sys.exit(-1) if numDataPoints < 10: print 'Please specify the number of data points in a minibatch' sys.exit(-1) ## sort proteins by length from large to small data.sort(key=lambda x: x['seqLen'], reverse=True) ##seqDataset stores the resultant data batches = [] names = [] i = 0 while i < len(data): currentSeqLen = data[i]['seqLen'] numSeqs = min( len(data) - i, max(1, numDataPoints/np.square(currentSeqLen) ) ) #print 'This batch contains ', numSeqs, ' sequences' names4onebatch = [ d['name'] for d in data[i: i+numSeqs] ] oneBatch = AssembleOneBatch( data[i : i+numSeqs], modelSpecs ) batches.append(oneBatch) names.append(names4onebatch) i += numSeqs return batches, names def CalcAvgWeightPerBatch(batches, modelSpecs): if not modelSpecs['UseSampleWeight']: return None numResponses = len(modelSpecs['responses']) allWeights = [] for b in batches: oneBatchWeight = [] for wMatrix in b[-numResponses: ]: bounds = SampleBoundingBox( (wMatrix.shape[1], wMatrix.shape[2]), modelSpecs['maxbatchSize'] ) new_wMatrix = wMatrix[:, bounds[0]:bounds[2], bounds[1]:bounds[3] ] wSum = np.sum(new_wMatrix) oneBatchWeight.append(wSum) allWeights.append(oneBatchWeight) avgWeights = np.average(allWeights, axis=0) modelSpecs['batchWeightBase'] = np.array(avgWeights).astype(theano.config.floatX) maxWeights = np.amax(allWeights, axis=0) minWeights = np.amin(allWeights, axis=0) ## reutrn the maximum deviation return maxWeights/avgWeights, minWeights/avgWeights