# Python source code of readers.

# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Provides readers configured for different datasets."""

import tensorflow as tf
import utils
import re
from tensorflow import logging
def resize_axis(tensor, axis, new_size, fill_value=0):
  """Truncates or pads a tensor so its size along `axis` equals `new_size`.

  When the tensor is longer than `new_size` on `axis`, only the leading
  `new_size` entries are kept. When it is shorter, entries equal to
  `fill_value` are appended at the end of that axis.

  Args:
    tensor: The tensor (or value convertible to a tensor) to be resized.
    axis: An integer selecting the dimension to resize.
    new_size: An integer or 0d tensor giving the desired tensor.shape[axis].
    fill_value: Value used for any appended entries; it is cast to the dtype
      of `tensor`.

  Returns:
    The resized tensor, with its static shape on `axis` set to `new_size`.
  """
  tensor = tf.convert_to_tensor(tensor)
  dynamic_dims = tf.unstack(tf.shape(tensor))
  current_size = dynamic_dims[axis]

  # Dimensions of the block appended after the kept slice (empty on `axis`
  # when no padding is required).
  padding_dims = list(dynamic_dims)
  padding_dims[axis] = tf.maximum(0, new_size - current_size)

  # Dimensions of the slice that survives from the input tensor.
  dynamic_dims[axis] = tf.minimum(current_size, new_size)
  kept_shape = tf.stack(dynamic_dims)

  kept_part = tf.slice(tensor, tf.zeros_like(kept_shape), kept_shape)
  padding_part = tf.fill(tf.stack(padding_dims),
                         tf.cast(fill_value, tensor.dtype))
  resized = tf.concat([kept_part, padding_part], axis)

  # The concat hides the static size on `axis`; restore it on a copy of the
  # input's static shape.
  static_shape = tensor.get_shape().as_list()
  static_shape[axis] = new_size
  resized.set_shape(static_shape)
  return resized

class BaseReader(object):
  """Common base class for dataset readers; subclass it to add a new reader."""

  def prepare_reader(self, unused_filename_queue):
    """Placeholder for the method that builds a reader's output tensors.

    Args:
      unused_filename_queue: Not used by this base implementation.

    Raises:
      NotImplementedError: Always; subclasses must provide their own version.
    """
    raise NotImplementedError


class YT8MAggregatedFeatureReader(BaseReader):
  """Reads TFRecords of pre-aggregated Examples.

  The TFRecords must contain Examples with a sparse int64 'labels' feature and
  fixed length float32 features, named by 'feature_names'.
  The float features are assumed to be an average of dequantized values.

  In addition to the stored features, derived features can be requested via
  `feature_calcs`. Their names start with 'c_' and they are computed in the
  graph from stored features:

    c_sq_<f>, c_log_<f>, c_inv_<f>, c_abs_<f>, c_sin_<f>, c_cos_<f>,
    c_sqrt_<f>, c_rsqrt_<f>: element-wise op applied to stored feature <f>
      (c_log_ is log1p(abs(f))).
    c_diff_<a>_<b>_<s:e>_<c>_<d>_<s:e>: feature a_b[:, s:e] minus
      feature c_d[:, s:e].
    c_interaction_<a>_<b>_<s:e>_<c>_<d>_<s:e>: product of the same two
      slices, e.g. c_interaction_mean_rgb_0:128_mean_audio_0:128.
    c_over_<a>_<b>_<rest>: feature a_rest divided by feature b_rest.
  """

  def __init__(self,
               num_classes=4716,
               feature_sizes=[1024],
               feature_names=["mean_inc3"],
               feature_calcs="",
               feature_remove="",
               decode_zlib=True):
    """Construct a YT8MAggregatedFeatureReader.

    Args:
      num_classes: a positive integer for the number of classes.
      feature_sizes: positive integer(s) for the feature dimensions as a list.
      feature_names: the feature name(s) in the tensorflow record as a list.
      feature_calcs: comma-separated names of derived features to append to
        feature_names (spaces are ignored). Their sizes are inferred from the
        name: names ending in 'audio' get 128, names ending in 'rgb' get
        1024, and 'c_interaction_'/'c_diff_' names get the widest
        'start:end' range found in the name.
      feature_remove: comma-separated names of features that are left out of
        the concatenated output (spaces are ignored).
      decode_zlib: if True, the records are read with ZLIB compression
        options.
    """
    assert len(feature_names) == len(feature_sizes), (
        "length of feature_names (={}) != length of feature_sizes (={})".format(
            len(feature_names), len(feature_sizes)))

    self.num_classes = num_classes
    # Copy the lists so neither the shared default arguments nor the caller's
    # lists are aliased by this instance.
    self.feature_sizes = list(feature_sizes)
    self.feature_names = list(feature_names)
    self.decode_zlib = decode_zlib
    # Drop empty entries: "".split(',') would otherwise yield [''].
    self.feature_remove = [
        name for name in feature_remove.replace(' ', '').split(',') if name]

    # Names of the derived features.
    new_feature_names = [
        name for name in feature_calcs.replace(' ', '').split(',') if name]

    # Sizes of the derived features, inferred from their names. A name that
    # matches none of the rules contributes no size, which is reported by the
    # length check in prepare_serialized_examples.
    new_feature_sizes = []
    for feat in new_feature_names:
      if re.search(r'audio$', feat):
        new_feature_sizes.append(128)
      elif re.search(r'rgb$', feat):
        new_feature_sizes.append(1024)
      elif feat.startswith(('c_interaction_', 'c_diff_')):
        widths = [int(end) - int(start)
                  for start, end in re.findall(r'(\d+):(\d+)', feat)]
        # -1 is kept as the result when the name contains no 'start:end'.
        new_feature_sizes.append(max([-1] + widths))

    self.feature_sizes = self.feature_sizes + new_feature_sizes
    self.feature_names = self.feature_names + new_feature_names

    print('Identified features: ' + str(len(self.feature_names)) + " | " + str(self.feature_names))
    print('            lengths: ' + str(len(self.feature_sizes)) + " | " + str(self.feature_sizes))
    print('            removed: ' + str(len(self.feature_remove))+ " | " + str(self.feature_remove))

  def prepare_reader(self, filename_queue, batch_size=1024):
    """Creates a single reader thread for pre-aggregated YouTube 8M Examples.

    Args:
      filename_queue: A tensorflow queue of filename locations.
      batch_size: maximum number of records read from the queue per call.

    Returns:
      A tuple of video indexes, features, labels, and padding data.
    """
    if self.decode_zlib:
      opts = tf.python_io.TFRecordOptions(
          tf.python_io.TFRecordCompressionType.ZLIB)
      reader = tf.TFRecordReader(options=opts)
    else:
      reader = tf.TFRecordReader()

    _, serialized_examples = reader.read_up_to(filename_queue, batch_size)

    tf.add_to_collection("serialized_examples", serialized_examples)
    return self.prepare_serialized_examples(serialized_examples)

  @staticmethod
  def _sliced_operands(features, prefix, feature_name):
    """Returns the two column slices named by a '<prefix>a_b_s:e_c_d_s:e' name.

    Args:
      features: dict mapping feature names to 2-D tensors.
      prefix: the derived-feature prefix, e.g. 'c_diff_'.
      feature_name: the full derived-feature name.

    Returns:
      A pair (features['a_b'][:, s:e], features['c_d'][:, s:e]).
    """
    spec = re.findall(
        prefix + r'([^_]+)_([^_]+)_([^_]+)_([^_]+)_([^_]+)_([^_]+)',
        feature_name)[0]
    first_name = spec[0] + '_' + spec[1]
    second_name = spec[3] + '_' + spec[4]
    first_range = [int(v) for v in spec[2].split(':')]
    second_range = [int(v) for v in spec[5].split(':')]
    first = features[first_name][:, first_range[0]:first_range[1]]
    second = features[second_name][:, second_range[0]:second_range[1]]
    return first, second

  def prepare_serialized_examples(self, serialized_examples):
    """Parses a batch of serialized Examples into model inputs.

    Args:
      serialized_examples: a 1-D string tensor of serialized Examples.

    Returns:
      A tuple (video ids, concatenated features, labels, ones), where the
      concatenation follows the order of self.feature_names and skips the
      names listed in self.feature_remove, and `ones` has one entry per
      serialized example.
    """
    # set the mapping from the fields to data types in the proto
    num_features = len(self.feature_names)
    assert num_features > 0, "self.feature_names is empty!"
    assert len(self.feature_names) == len(self.feature_sizes), (
        "length of feature_names (={}) != length of feature_sizes (={})".format(
            len(self.feature_names), len(self.feature_sizes)))

    # 2017-04-28 - now num_frames is float so I had to change it from int64
    feature_map = {"video_id": tf.FixedLenFeature([], tf.string),
                   "labels": tf.VarLenFeature(tf.int64),
                   "num_frames": tf.VarLenFeature(tf.float32)}

    # Only stored features are parsed; 'c_' features are computed below.
    for feature_index, feature_name in enumerate(self.feature_names):
      if not feature_name.startswith('c_'):
        feature_map[feature_name] = tf.FixedLenFeature(
            [self.feature_sizes[feature_index]], tf.float32)

    features = tf.parse_example(serialized_examples, features=feature_map)
    # Labels are expanded to a 4716-wide indicator first and then truncated
    # or padded to num_classes.
    labels = tf.sparse_to_indicator(features["labels"], 4716)
    labels.set_shape([None, 4716])
    labels = resize_axis(labels, 1, self.num_classes)

    # Derived features that apply one element-wise op to a single stored
    # feature; the text after the prefix is the source feature's name.
    unary_ops = (
        ('c_sq_', lambda t: t * t),
        ('c_log_', lambda t: tf.log1p(tf.abs(t))),
        ('c_inv_', lambda t: 1 / t),
        ('c_abs_', tf.abs),
        ('c_sin_', tf.sin),
        ('c_cos_', tf.cos),
        ('c_sqrt_', tf.sqrt),
        ('c_rsqrt_', tf.rsqrt),
    )

    for feature_name in self.feature_names:
      if feature_name.startswith('c_diff_'):
        first, second = self._sliced_operands(
            features, 'c_diff_', feature_name)
        features[feature_name] = tf.subtract(first, second)
      elif feature_name.startswith('c_interaction_'):
        # example: c_interaction_mean_rgb_0:128_mean_audio_0:128
        #          that is mean_rgb*mean_audio for the first 128 coordinates
        #          of both of them
        first, second = self._sliced_operands(
            features, 'c_interaction_', feature_name)
        features[feature_name] = tf.multiply(first, second)
      elif feature_name.startswith('c_over_'):
        # The previous test compared an 8-character slice with the
        # 7-character prefix and therefore never matched a real name.
        spec = re.findall(r'c_over_([^_]+)_([^_]+)_(.+)', feature_name)[0]
        numerator = features[spec[0] + '_' + spec[2]]
        denominator = features[spec[1] + '_' + spec[2]]
        features[feature_name] = tf.divide(numerator, denominator)
      else:
        for prefix, op in unary_ops:
          if feature_name.startswith(prefix):
            features[feature_name] = op(features[feature_name[len(prefix):]])
            break

    # Removed features are still parsed/computed (others may depend on them)
    # but are excluded from the model input.
    concatenated_features = tf.concat([
        features[feature_name] for feature_name in self.feature_names
        if feature_name not in self.feature_remove], 1)
    return (features["video_id"], concatenated_features, labels,
            tf.ones([tf.shape(serialized_examples)[0]]))

class YT8MFrameFeatureReader(BaseReader):
  """Reads TFRecords of SequenceExamples.

  The TFRecords must contain SequenceExamples with the sparse int64 'labels'
  context feature and a fixed length byte-quantized feature vector, obtained
  from the features in 'feature_names'. The quantized features will be mapped
  back into a range between min_quantized_value and max_quantized_value.
  """

  def __init__(self,
               num_classes=4716,
               feature_sizes=[1024],
               feature_names=["inc3"],
               max_frames=300):
    """Construct a YT8MFrameFeatureReader.

    Args:
      num_classes: a positive integer for the number of classes.
      feature_sizes: positive integer(s) for the feature dimensions as a list.
      feature_names: the feature name(s) in the tensorflow record as a list.
      max_frames: the maximum number of frames to process.
    """
    assert len(feature_names) == len(feature_sizes), (
        "length of feature_names (={}) != length of feature_sizes (={})".format(
            len(feature_names), len(feature_sizes)))

    self.num_classes = num_classes
    # Copy the lists so neither the shared default arguments nor the caller's
    # lists are aliased by this instance.
    self.feature_sizes = list(feature_sizes)
    self.feature_names = list(feature_names)
    self.max_frames = max_frames

  def get_video_matrix(self,
                       features,
                       feature_size,
                       max_frames,
                       max_quantized_value,
                       min_quantized_value):
    """Decodes raw byte features and dequantizes them.

    The bytes are decoded as uint8, cast to float32, reshaped to rows of
    `feature_size` values, passed through utils.Dequantize and finally
    truncated or zero-padded to exactly `max_frames` rows.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence, capped at max_frames
    """
    decoded_features = tf.reshape(
        tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
        [-1, feature_size])

    num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
    feature_matrix = utils.Dequantize(decoded_features,
                                      max_quantized_value,
                                      min_quantized_value)
    feature_matrix = resize_axis(feature_matrix, 0, max_frames)
    return feature_matrix, num_frames

  def prepare_reader(self,
                     filename_queue,
                     max_quantized_value=2,
                     min_quantized_value=-2):
    """Creates a single reader thread for YouTube8M SequenceExamples.

    Args:
      filename_queue: A tensorflow queue of filename locations.
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      A tuple of video indexes, video features, labels, and padding data.
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)

    return self.prepare_serialized_examples(serialized_example,
        max_quantized_value, min_quantized_value)

  def prepare_serialized_examples(self, serialized_example,
      max_quantized_value=2, min_quantized_value=-2):
    """Parses one serialized SequenceExample into a batch of size one.

    Args:
      serialized_example: a scalar string tensor holding one SequenceExample.
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      A tuple (video ids, video matrix, labels, number of frames), each with
      a leading batch dimension of size 1. The video matrix concatenates all
      features along its last axis.
    """
    contexts, features = tf.parse_single_sequence_example(
        serialized_example,
        context_features={"video_id": tf.FixedLenFeature(
            [], tf.string),
                          "labels": tf.VarLenFeature(tf.int64)},
        sequence_features={
            feature_name : tf.FixedLenSequenceFeature([], dtype=tf.string)
            for feature_name in self.feature_names
        })

    # read ground truth labels
    labels = (tf.cast(
        tf.sparse_to_dense(contexts["labels"].values, (4716,), 1,
            validate_indices=False),
        tf.bool))
    # Honour num_classes the same way YT8MAggregatedFeatureReader does:
    # build the 4716-wide vector, then truncate or pad it. With the default
    # num_classes this leaves the values unchanged.
    labels = resize_axis(labels, 0, self.num_classes)

    # loads (potentially) different types of features and concatenates them
    num_features = len(self.feature_names)
    assert num_features > 0, "No feature selected: feature_names is empty!"

    assert len(self.feature_names) == len(self.feature_sizes), (
        "length of feature_names (={}) != length of feature_sizes (={})".format(
            len(self.feature_names), len(self.feature_sizes)))

    num_frames = None  # the number of frames in the video
    feature_matrices = []  # one matrix per feature
    frame_count_checks = []  # assert ops comparing the per-feature counts
    for feature_name, feature_size in zip(self.feature_names,
                                          self.feature_sizes):
      feature_matrix, num_frames_in_this_feature = self.get_video_matrix(
          features[feature_name],
          feature_size,
          self.max_frames,
          max_quantized_value,
          min_quantized_value)
      if num_frames is None:
        num_frames = num_frames_in_this_feature
      else:
        frame_count_checks.append(
            tf.assert_equal(num_frames, num_frames_in_this_feature))

      feature_matrices.append(feature_matrix)

    # cap the number of frames at self.max_frames. The op is created under a
    # control dependency on the checks; in graph mode an assert op that
    # nothing depends on is never executed.
    with tf.control_dependencies(frame_count_checks):
      num_frames = tf.minimum(num_frames, self.max_frames)

    # concatenate different features
    video_matrix = tf.concat(feature_matrices, 1)

    # convert to batch format.
    # TODO: Do proper batch reads to remove the IO bottleneck.
    batch_video_ids = tf.expand_dims(contexts["video_id"], 0)
    batch_video_matrix = tf.expand_dims(video_matrix, 0)
    batch_labels = tf.expand_dims(labels, 0)
    batch_frames = tf.expand_dims(num_frames, 0)

    return batch_video_ids, batch_video_matrix, batch_labels, batch_frames