python source code of seq

import numpy as np
import tensorflow as tf

from gtd.ml.framework import Feedable
from gtd.ml.utils import expand_dims_for_broadcast, broadcast


class SequenceBatch(object):
    """Represent a batch of sequences as a Tensor."""

    def __init__(self, values, mask, name='SequenceBatch'):
        with tf.name_scope(name):
            # check that dimensions are correct
            values_shape = tf.shape(values)
            mask_shape = tf.shape(mask)
            values_shape_prefix = tf.slice(values_shape, [0], [2])
            max_rank = max(values.get_shape().ndims, mask.get_shape().ndims)

            assert_op = tf.assert_equal(values_shape_prefix, mask_shape,
                                        data=[values_shape_prefix, mask_shape], summarize=max_rank,
                                        name="assert_shape_prefix")

            with tf.control_dependencies([assert_op]):
                    self._values = tf.identity(values, name='values')
                    self._mask = tf.identity(mask, name='mask')

    @property
    def values(self):
        """A Tensor holding the values of the sequence batch, of shape [batch_size, seq_length, :, ..., :].

        Each row represents one sequence.
        """
        return self._values

    @property
    def mask(self):
        """A boolean mask of shape [batch_size, seq_length], indicating which entries of self.values are padding.

        mask[i, j] = 0 if the entry is padding, 1 otherwise.

        Returns:
            A Tensor of shape (batch_size, seq_length)
        """
        return self._mask

    def with_pad_value(self, val):
        """Return a new SequenceBatch, with pad values set to the specified value."""
        return SequenceBatch(change_pad_value(self.values, self.mask, val), self.mask)


def change_pad_value(values, mask, pad_val):
    """Given a set of values and a pad mask, change the value of all pad entries.

    Args:
        values (Tensor): of shape [batch_size, seq_length, :, ..., :].
        mask (Tensor): binary float tensor of shape [batch_size, seq_length]
        pad_val (float): value to set all pad entries to

    Returns:
        Tensor: a new Tensor of same shape as values
    """
    # broadcast the mask to match shape of values
    mask = expand_dims_for_broadcast(mask, values)  # (batch_size, seq_length, 1, ..., 1)
    mask = broadcast(mask, values)
    mask = tf.cast(mask, tf.bool)  # cast to bool

    # broadcast val
    broadcast_val = pad_val * tf.ones(tf.shape(values))

    new_values = tf.select(mask, values, broadcast_val)
    return new_values


class FeedSequenceBatch(Feedable, SequenceBatch):
    """A SequenceBatch that is fed into TensorFlow from the outside.

    The SequenceBatch is represented by a Tensor of shape [batch_size, seq_length]
        - batch_size is dynamically determined by the # sequences fed
        - seq_length is dynamically set to the length of the longest sequence fed, or the statically specified value.
    """
    def __init__(self, align='left', seq_length=None, dtype=tf.int32, name='FeedSequenceBatch'):
        """Create a Feedable SequenceBatch.

        Args:
            align (str): can be 'left' or 'right'. If 'left', values will be left-aligned, with padding on the right.
                If 'right', values will be right-aligned, with padding on the left. Default is 'left'.
            seq_length (int): the Tensor representing the SequenceBatch will have exactly this many columns. Default
                is None. If None, seq_length will be dynamically determined.
            dtype: data type of the SequenceBatch values array. Defaults to int32.
            name (str): namescope for the Tensors created inside this Model.
        """
        if align not in ('left', 'right'):
            raise ValueError("align must be either 'left' or 'right'.")
        self._align_right = (align == 'right')
        self._seq_length = seq_length

        with tf.name_scope(name):
            values = tf.placeholder(dtype, shape=[None, None], name='values')  # (batch_size, seq_length)
            mask = tf.placeholder(tf.float32, shape=[None, None], name='mask')  # (batch_size, seq_length)

        if self._seq_length is not None:
            # add static shape information
            batch_dim, _ = values.get_shape()
            new_shape = tf.TensorShape([batch_dim, tf.Dimension(seq_length)])
            values.set_shape(new_shape)
            mask.set_shape(new_shape)

        super(FeedSequenceBatch, self).__init__(values, mask)

    def inputs_to_feed_dict(self, sequences, vocab=None):
        """Convert sequences into a feed_dict.

        Args:
            sequences (list[list[unicode]]): a list of unicode sequences
            vocab (Vocab): a vocab mapping tokens to integers. If vocab is None, sequences are directly passed
                into TensorFlow, without performing any token-to-integer lookup.

        Returns:
            a feed_dict
        """
        batch_size = len(sequences)
        if batch_size == 0:
            seq_length = 0 if self._seq_length is None else self._seq_length
            empty = np.empty((0, seq_length))
            return {self.values: empty, self.mask: empty}

        # dynamic seq_length if none specified
        if self._seq_length is None:
            seq_length = max(len(tokens) for tokens in sequences)
        else:
            seq_length = self._seq_length

        # if no vocab, just pass the raw value
        if vocab is None:
            tokens_to_values = lambda words: words
        else:
            tokens_to_values = vocab.words2indices

        if self._align_right:
            truncate = lambda tokens: tokens[-seq_length:]
            indices = map(lambda n: [(seq_length - n) + i for i in range(n)], range(seq_length + 1))
        else:
            truncate = lambda tokens: tokens[:seq_length]
            indices = map(range, range(seq_length + 1))

        values_arr = np.zeros((batch_size, seq_length), dtype=np.float32)
        mask_arr = np.zeros((batch_size, seq_length), dtype=np.float32)

        for row_idx, tokens in enumerate(sequences):
            num_tokens = len(tokens)
            if num_tokens == 0:
                continue

            if num_tokens > seq_length:
                truncated_tokens = truncate(tokens)
            else:
                truncated_tokens = tokens

            inds = indices[len(truncated_tokens)]
            vals = tokens_to_values(truncated_tokens)
            values_arr[row_idx][inds] = vals
            mask_arr[row_idx][inds] = 1.0

        return {self.values: values_arr, self.mask: mask_arr}


def embed(sequence_batch, embeds):
    mask = sequence_batch.mask
    embedded_values = tf.gather(embeds, sequence_batch.values)
    embedded_values = tf.verify_tensor_all_finite(embedded_values, 'embedded_values')

    # set all pad embeddings to zero
    broadcasted_mask = expand_dims_for_broadcast(mask, embedded_values)
    embedded_values *= broadcasted_mask

    return SequenceBatch(embedded_values, mask)


def reduce_mean(seq_batch, allow_empty=False):
    """Compute the mean of each sequence in a SequenceBatch.

    Args:
        seq_batch (SequenceBatch): a SequenceBatch with the following attributes:
            values (Tensor): a Tensor of shape (batch_size, seq_length, :, ..., :)
            mask (Tensor): if the mask values are arbitrary floats (rather than binary), the mean will be
            a weighted average.
        allow_empty (bool): allow computing the average of an empty sequence. In this case, we assume 0/0 == 0, rather
            than NaN. Default is False, causing an error to be thrown.

    Returns:
        Tensor: of shape (batch_size, :, ..., :)
    """
    values, mask = seq_batch.values, seq_batch.mask
    # compute weights for the average
    sums = tf.reduce_sum(mask, 1, keep_dims=True)  # (batch_size, 1)

    if allow_empty:
        asserts = []  # no assertion
        sums = tf.select(tf.equal(sums, 0), tf.ones(tf.shape(sums)), sums)  # replace 0's with 1's
    else:
        asserts = [tf.assert_positive(sums)]  # throw error if 0's exist

    with tf.control_dependencies(asserts):
        weights = mask / sums  # (batch_size, seq_length)
    return weighted_sum(seq_batch, weights)


def reduce_sum(seq_batch):
    weights = tf.ones(shape=tf.shape(seq_batch.mask))
    return weighted_sum(seq_batch, weights)


def weighted_sum(seq_batch, weights):
    """Compute the weighted sum of each sequence in a SequenceBatch.

    Args:
        seq_batch (SequenceBatch): a SequenceBatch.
        weights (Tensor): a Tensor of shape (batch_size, seq_length). Determines the weights. Weights outside the
            seq_batch's mask are ignored.

    Returns:
        Tensor: of shape (batch_size, :, ..., :)
    """
    values, mask = seq_batch.values, seq_batch.mask
    weights = weights * mask  # ignore weights outside the mask
    weights = expand_dims_for_broadcast(weights, values)
    weighted_array = values * weights  # (batch_size, seq_length, X)
    return tf.reduce_sum(weighted_array, 1)  # (batch_size, X)


def reduce_max(seq_batch):
    sums = tf.reduce_sum(seq_batch.mask, 1, keep_dims=True)  # (batch_size, 1)
    with tf.control_dependencies([tf.assert_positive(sums)]):  # avoid dividing by zero
        seq_batch = seq_batch.with_pad_value(float('-inf'))  # set pad values to -inf
        result = tf.reduce_max(seq_batch.values, 1)
    return result