""" Neural Network TensorFlow operations for protein structure prediction. In general this module contains functions for constructing different parts of GeomNetModel networks, excepting ones related to geometric operations. There are some conventions used throughout this module. First, most functions accept some combination of TF tensors and regular python objects. Since all the functions construct parts of TF graphs, the TF tensors they accept are meant to be variables that can change from data point to data point or iteration to iteration. On the other hand, the python objects are meant to be fixed parameters used once in the construction of the TF graph and never revisted. Which is which is indicated in each function, and by the fact that python objects are not converted into TF tensors. Having said that, some funcs are actually somewhat loose, and would work with dynamic values for the supposedly fixed arguments. However the intended behavior is what's described. """ __author__ = "Mohammed AlQuraishi" __copyright__ = "Copyright 2018, Harvard Medical School" __license__ = "MIT" # Imports import numpy as np import tensorflow as tf # Constants NUM_AAS = 20 NUM_DIMENSIONS = 3 NUM_DIHEDRALS = 3 ### Public functions # These functions expose a public interface that properly encapsulates their internals # using tensorflow scoping operations and such. While they are primarily used by the # GeomNetModel, they may also have general utility beyond it. All these functions # are strictly stateless, possessing no internal TF variables. def masking_matrix(mask, name=None): """ Constructs a masking matrix to zero out pairwise distances due to missing residues or padding. This function needs to be called for each individual sequence, and so it's folded in the reading/queuing pipeline for performance reasons. Args: mask: 0/1 vector indicating whether a position should be masked (0) or not (1) Returns: A square matrix with all 1s except for rows and cols whose corresponding indices in mask are set to 0. [MAX_SEQ_LENGTH, MAX_SEQ_LENGTH] """ with tf.name_scope(name, 'masking_matrix', [mask]) as scope: mask = tf.convert_to_tensor(mask, name='mask') mask = tf.expand_dims(mask, 0) base = tf.ones([tf.size(mask), tf.size(mask)]) matrix_mask = base * mask * tf.transpose(mask) return matrix_mask def effective_steps(masks, num_edge_residues, name=None): """ Returns the effective number of steps, i.e. number of residues that are non-missing and are not just padding, given a masking matrix. Args: masks: A batch of square masking matrices (batch is last dimension) [MAX_SEQ_LENGTH, MAX_SEQ_LENGTH, BATCH_SIZE] Returns: A vector with the effective number of steps [BATCH_SIZE] """ with tf.name_scope(name, 'effective_steps', [masks]) as scope: masks = tf.convert_to_tensor(masks, name='masks') traces = tf.matrix_diag_part(tf.transpose(masks, [2, 0, 1])) eff_stepss = tf.add(tf.reduce_sum(traces, [1]), num_edge_residues, name=scope) # NUM_EDGE_RESIDUES shouldn't be here, but I'm keeping it for # legacy reasons. Just be clear that it's _always_ wrong to have # it here, even when it's not equal to 0. return eff_stepss def read_protein(filename_queue, max_length, num_edge_residues, num_evo_entries, name=None): """ Reads and parses a protein TF Record. Primary sequences are mapped onto 20-dimensional one-hot vectors. Evolutionary sequences are mapped onto num_evo_entries-dimensional real-valued vectors. Secondary structures are mapped onto ints indicating one of 8 class labels. Tertiary coordinates are flattened so that there are 3 times as many coordinates as residues. Evolutionary, secondary, and tertiary entries are optional. Args: filename_queue: TF queue for reading files max_length: Maximum length of sequence (number of residues) [MAX_LENGTH]. Not a TF tensor and is thus a fixed value. Returns: id: string identifier of record one_hot_primary: AA sequence as one-hot vectors evolutionary: PSSM sequence as vectors secondary: DSSP sequence as int class labels tertiary: 3D coordinates of structure matrix_mask: Masking matrix to zero out pairwise distances in the masked regions pri_length: Length of amino acid sequence keep: True if primary length is less than or equal to max_length """ with tf.name_scope(name, 'read_protein', []) as scope: # Set up reader and read reader = tf.TFRecordReader() _, serialized_example = reader.read(filename_queue) # Parse TF Record context, features = tf.parse_single_sequence_example(serialized_example, context_features={'id': tf.FixedLenFeature((1,), tf.string)}, sequence_features={ 'primary': tf.FixedLenSequenceFeature((1,), tf.int64), 'evolutionary': tf.FixedLenSequenceFeature((num_evo_entries,), tf.float32, allow_missing=True), 'secondary': tf.FixedLenSequenceFeature((1,), tf.int64, allow_missing=True), 'tertiary': tf.FixedLenSequenceFeature((NUM_DIMENSIONS,), tf.float32, allow_missing=True), 'mask': tf.FixedLenSequenceFeature((1,), tf.float32, allow_missing=True)}) id_ = context['id'][0] primary = tf.to_int32(features['primary'][:, 0]) evolutionary = features['evolutionary'] secondary = tf.to_int32(features['secondary'][:, 0]) tertiary = features['tertiary'] mask = features['mask'][:, 0] # Predicate for when to retain protein pri_length = tf.size(primary) keep = pri_length <= max_length # Convert primary to one-hot one_hot_primary = tf.one_hot(primary, NUM_AAS) # Generate tertiary masking matrix. If mask is missing then assume all residues are present mask = tf.cond(tf.not_equal(tf.size(mask), 0), lambda: mask, lambda: tf.ones([pri_length - num_edge_residues])) ter_mask = masking_matrix(mask, name='ter_mask') # Return tuple return id_, one_hot_primary, evolutionary, secondary, tertiary, ter_mask, pri_length, keep def curriculum_weights(base, slope, max_seq_length, name=None): ''' Returns a tensor of weights that correspond to the current curriculum, as parametrized by base and slope. Args: base: Value of the base parameter, a TF tensor that is expected to change as training progresses. slope: Value of the slope parameter. Not a TF tensor and is thus a fixed value. max_seq_length: Maximum length of sequences. Not a TF tensor and is thus a fixed value. Returns: [MAX_SEQ_LENGTH - 1] ''' with tf.name_scope(name, 'curriculum_weights', [base]) as scope: base = tf.convert_to_tensor(base, name='base') steps = tf.to_float(tf.range(max_seq_length - 1)) # The minus one factor is because we ignore self-distances. weights = tf.sigmoid(-(slope * (steps - base)), name=scope) return weights def weighting_matrix(weights, name=None): """ Takes a vector of weights and returns a weighting matrix in which the ith weight is in the ith upper diagonal of the matrix. All other entries are 0. This functions needs to be called once per curriculum update / iteration, but then used for the entire batch. This function intimately mixes python and TF code. It can do so because all the python code needs to be run only once during the initial construction phase and does not rely on any tensor values. This interaction is subtle however. Args: weights: Curriculum weights. A TF tensor that is expected to change as curriculum progresses. [MAX_SEQ_LENGTH - 1] Returns [MAX_SEQ_LENGTH, MAX_SEQ_LENGTH] """ with tf.name_scope(name, 'weighting_matrix', [weights]) as scope: weights = tf.convert_to_tensor(weights, name='weights') max_seq_length = weights.get_shape().as_list()[0] + 1 split_indices = np.diag_indices(max_seq_length) flat_indices = [] flat_weights = [] for i in range(max_seq_length - 1): indices_subset = np.concatenate((split_indices[0][:-(i+1), np.newaxis], split_indices[1][i+1:, np.newaxis]), 1) weights_subset = tf.fill([len(indices_subset)], weights[i]) flat_indices.append(indices_subset) flat_weights.append(weights_subset) flat_indices = np.concatenate(flat_indices) flat_weights = tf.concat(flat_weights, 0) mat = tf.sparse_to_dense(flat_indices, [max_seq_length, max_seq_length], flat_weights, validate_indices=False, name=scope) return mat def id_filter(ids, filter_string, delimiter='#', name=None): """ Returns a boolean mask corresponding to the chosen id filter from a list of ids """ with tf.name_scope(name, 'id_filter', [ids, filter_string]) as scope: ids = tf.convert_to_tensor(ids, name='ids') filter_string = tf.convert_to_tensor(filter_string, name='filter_string') return tf.equal(tf.string_split(ids, delimiter=delimiter).values[0::2], filter_string, name=scope)