Python tensorflow.data() Examples

The following are 30 code examples of the tensorflow.data module (tf.data). Each example is taken from an open-source project; you can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the tensorflow module.
Example #1
Source File: base_dataset.py    From hierarchical_loc with BSD 3-Clause "New" or "Revised" License
def _get_data(self, dataset, split_name, **config):
        """Reads the dataset splits using the Tensorflow `tf.data` API.

        This method should create a `tf.data.Dataset` object for the given data split,
        with named components defined through a dictionary mapping strings to tensors.

        It typically performs operations such as reading data from a file or from a
        Python generator, shuffling the elements or applying data augmentation to the
        training split. It should however NOT batch the dataset (left to the model).

        Arguments:
            dataset: An object returned by the `_init_dataset` method.
            split_name: A string, the name of the requested split, either `"training"`,
                `"validation"` or `"test"`.
            config: A configuration dictionary, given during the object instantiation.

        Returns:
            An object of type `tf.data.Dataset` corresponding to the requested split.
        """
        raise NotImplementedError 
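
A concrete subclass would typically implement this along the lines of the following minimal sketch, which assumes TensorFlow 1.x and that `_init_dataset` returned a dict mapping split names to lists of file paths (both are assumptions for illustration, not part of the original source):

import tensorflow as tf

def _get_data(self, dataset, split_name, **config):
    # Assumption: `dataset` is a dict {split_name: [file paths]} built by
    # `_init_dataset`. Elements are dicts of named components, per the contract.
    def _generator():
        for path in dataset[split_name]:
            yield {'image_path': path}

    data = tf.data.Dataset.from_generator(
        _generator, output_types={'image_path': tf.string})
    if split_name == 'training':
        data = data.shuffle(buffer_size=config.get('shuffle_buffer', 1000))
    return data  # no batching here; batching is left to the model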
Example #2
Source File: embedding_bert_intent_estimator_classifier.py    From rasa_nlu_gq with Apache License 2.0
def _create_encoded_intents(self, intent_dict):
        """Create matrix with intents encoded in rows as bag of words,
        if intent_tokenization_flag = False this is identity matrix"""

        if self.intent_tokenization_flag:
            intent_token_dict = self._create_intent_token_dict(
                list(intent_dict.keys()), self.intent_split_symbol)

            encoded_all_intents = np.zeros((len(intent_dict),
                                            len(intent_token_dict)))
            for key, idx in intent_dict.items():
                for t in key.split(self.intent_split_symbol):
                    encoded_all_intents[idx, intent_token_dict[t]] = 1

            return encoded_all_intents
        else:
            return np.eye(len(intent_dict))

    # data helpers: 
Example #3
Source File: embedding_bert_intent_estimator_classifier.py    From rasa_nlu_gq with Apache License 2.0
def input_fn(self, features, labels, batch_size, shuffle_num, mode):
        """Build the tf.data input pipeline.

        :param features: dict defining the input feature structure to parse
        :param labels: np.array of input labels
        :param batch_size: int, the batch size
        :param shuffle_num: int, size of the shuffle buffer used to randomize the data
        :param mode: tf.estimator.ModeKeys.TRAIN or tf.estimator.ModeKeys.PREDICT
        :return: a (data, labels) tuple of tensors from the dataset iterator
        """
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))
        if mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.shuffle(shuffle_num).batch(batch_size).repeat(self.epochs)
        else:
            dataset = dataset.batch(batch_size)
        iterator = dataset.make_one_shot_iterator()
        data, labels = iterator.get_next()
        return data, labels 
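
For context, an input function like this is usually handed to a `tf.estimator.Estimator` as a zero-argument callable; a hedged usage sketch (the `classifier`, `estimator`, `X_dict`, and `y` names are placeholders, not from the original project):

estimator.train(
    input_fn=lambda: classifier.input_fn(
        features=X_dict,                       # dict of input arrays
        labels=y,                              # np.array of labels
        batch_size=64,
        shuffle_num=1000,
        mode=tf.estimator.ModeKeys.TRAIN))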
Example #4
Source File: test.py    From seg-mentor with MIT License
def get_data_feed(val_rec_fname, pixels=None):
    '''
        returns a 4-element feed: orig_shape, scale, image, annotation.

        TODO: unify parts with prepare_graph()
    '''
    dataset = data.TFRecordDataset([val_rec_fname]).map(utils.tfrecordify.parse_record)  # .batch(1)
    # note - saving shape before rescale
    dataset = dataset.map(lambda img, ann: (tf.to_float(tf.shape(img)), img, ann))
    if pixels is not None:
        dataset = dataset.map(lambda orig_shape_f, img, ann:
                              (orig_shape_f, tf.reduce_min(pixels/orig_shape_f)) +
                              utils.augmentation.nonrandom_rescale(img, ann, [pixels, pixels]))
    else:
        dataset = dataset.map(lambda shape, img, ann:
                              (shape, 1, img, tf.cast(ann, tf.int32)))

    iterator = dataset.repeat().make_initializable_iterator()
    return iterator 
Example #5
Source File: run.py    From tf-encrypted with Apache License 2.0
def _build_data_pipeline(self):
        """Build a reproducible tf.data iterator."""

        def normalize(image, label):
            image = tf.cast(image, tf.float32) / 255.0
            return image, label

        def flatten(image, label):
            image = tf.reshape(image, shape=[self.FLATTENED_DIM])
            return image, label

        dataset = tf.data.TFRecordDataset([self.local_data_file])
        dataset = dataset.map(decode)
        dataset = dataset.map(normalize)
        dataset = dataset.map(flatten)
        dataset = dataset.repeat()
        dataset = dataset.batch(self.BATCH_SIZE)

        iterator = dataset.make_one_shot_iterator()
        return iterator 
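
In TF 1.x graph mode, the returned one-shot iterator would be consumed roughly as follows (`owner` stands in for an instance of the class above):

iterator = owner._build_data_pipeline()
images, labels = iterator.get_next()

with tf.Session() as sess:
    batch_images, batch_labels = sess.run([images, labels])
    print(batch_images.shape, batch_labels.shape)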
Example #6
Source File: GMVAE.py    From GMVAE with MIT License
def plot_latent_space(self, data, labels, save=False):
      """Plot the latent space learnt by the model

      Args:
          data: (array) corresponding array containing the data
          labels: (array) corresponding array containing the labels
          save: (bool) whether to save the latent space plot

      Returns:
          fig: (figure) plot of the latent space
      """
      # obtain the latent features
      features = self.latent_features(data)
      
      # plot only the first 2 dimensions
      fig = plt.figure(figsize=(8, 6))
      plt.scatter(features[:, 0], features[:, 1], c=labels, marker='o',
              edgecolor='none', cmap=plt.cm.get_cmap('jet', 10), s = 10)
      plt.colorbar()
      if(save):
          fig.savefig('latent_space.png')
      return fig 
Example #7
Source File: inception_v3.py    From training_results_v0.5 with Apache License 2.0
def tensor_transform_fn(data, perm):
  """Transpose function.

  This function is used to transpose an image tensor on the host and then
  perform an inverse transpose on the TPU. The transpose on the TPU gets
  effectively elided thus voiding any associated computational cost.

  NOTE: Eventually the compiler will be able to detect when this kind of
  operation may prove beneficial and perform these types of transformations
  implicitly, voiding the need for user intervention.

  Args:
    data: Tensor to be transposed
    perm: New ordering of dimensions

  Returns:
    Transposed tensor
  """
  if FLAGS.transpose_enabled:
    return tf.transpose(data, perm)
  return data 
Example #8
Source File: data_reader.py    From kfac with Apache License 2.0
def __call__(self, batch_size):
    """Reads `batch_size` data.

    Args:
      batch_size: Tensor of type `int32`, batch size of the data to be
        retrieved from the dataset. `batch_size` should be less than or
        equal to `max_batch_size`.

    Returns:
       The read data: an iterable of tensors with batch size equal to `batch_size`.
    """
    check_size = tf.assert_less_equal(
        batch_size,
        tf.convert_to_tensor(self._max_batch_size, dtype=tf.int32),
        message='Data set read failure, Batch size greater than max allowed.'
    )
    with tf.control_dependencies([check_size]):
      return _slice_data(self._dataset, batch_size) 
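
The `_slice_data` helper is not shown on this page; a plausible minimal version (an assumption for illustration, not necessarily the library's actual code) simply takes the leading `batch_size` rows of each stored tensor:

def _slice_data(dataset, batch_size):
    # Assumed behaviour: `dataset` is an iterable of tensors whose leading
    # dimension is the (max) batch dimension; keep the first `batch_size` rows.
    return [tensor[:batch_size] for tensor in dataset]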
Example #9
Source File: dataset.py    From causal-text-embeddings with MIT License
def make_input_id_masker(tokenizer, seed):
    # (One of) Bert's unsupervised objectives is to mask some fraction of the input words and predict the masked words

    def masker(data):
        token_ids = data['token_ids']
        maybe_masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights = create_masked_lm_predictions(
            token_ids,
            # pre-training defaults from Bert docs
            masked_lm_prob=0.15,
            max_predictions_per_seq=20,
            vocab=tokenizer.vocab,
            seed=seed)
        return {
            **data,
            'maybe_masked_input_ids': maybe_masked_input_ids,
            'masked_lm_positions': masked_lm_positions,
            'masked_lm_ids': masked_lm_ids,
            'masked_lm_weights': masked_lm_weights
        }

    return masker 
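
The returned closure is designed to be applied element-wise with `tf.data.Dataset.map`; a hedged usage sketch (the `dataset` and `tokenizer` objects are assumed to already exist):

masker = make_input_id_masker(tokenizer, seed=0)
dataset = dataset.map(masker)  # assumes elements are dicts containing 'token_ids'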
Example #10
Source File: dataset.py    From causal-text-embeddings with MIT License
def make_extra_feature_cleaning():
    def extra_feature_cleaning(data):
        data['num_authors'] = tf.minimum(data['num_authors'], 6)-1
        data['year'] = data['year']-2007

        # some extras
        equation_referenced = tf.minimum(data['num_ref_to_equations'], 1)
        theorem_referenced = tf.minimum(data['num_ref_to_theorems'], 1)

        # buzzy title
        any_buzz = data["title_contains_deep"] + data["title_contains_neural"] + \
                   data["title_contains_embedding"] + data["title_contains_gan"]
        buzzy_title = tf.cast(tf.not_equal(any_buzz, 0), tf.int32)

        return {**data,
                'equation_referenced': equation_referenced,
                'theorem_referenced': theorem_referenced,
                'buzzy_title': buzzy_title,
                'index': data['id']}
    return extra_feature_cleaning 
Example #11
Source File: tutorial_1.py    From BERT_TF with Apache License 2.0
def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), '0')

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_shapes=shapes,
        output_types=types)
    
    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])
    
    dataset = dataset.padded_batch(params.get('batch_size', 20), shapes, defaults).prefetch(1)

    return dataset

## Global Logic of the model_fn 
Example #12
Source File: data_reader.py    From kfac with Apache License 2.0
def __call__(self, batch_size):
    """Reads `batch_size` data and stores the read batch.

    Args:
      batch_size: Tensor of type `int32`, batch size of the data to be
        retrieved from the dataset. `batch_size` should be less than or
        equal to `max_batch_size`.

    Returns:
       The read data: an iterable of tensors with batch size equal to `batch_size`.
    """
    sliced_data = super(CachedDataReader, self).__call__(batch_size)

    # We need to make sure we read the cached batch before we update it!
    with tf.control_dependencies(self._cached_batch):
      batch_size_assign_op = self._cached_batch_size.assign(batch_size)
      data_assign_ops = [
          prev[:batch_size].assign(cur)  # yes, this actually works
          for prev, cur in zip(self._cached_batch_storage, sliced_data)
      ]
      with tf.control_dependencies(data_assign_ops + [batch_size_assign_op]):
        return [tf.identity(sdata) for sdata in sliced_data] 
Example #13
Source File: base_dataset.py    From hierarchical_loc with BSD 3-Clause "New" or "Revised" License
def _init_dataset(self, **config):
        """Prepare the dataset for reading.

        This method should configure the dataset for later fetching through `_get_data`,
        such as downloading the data if it is not stored locally, or reading the list of
        data files from disk. Ideally, especially in the case of large images, this
        method should NOT read the whole dataset into memory, but rather prepare for
        faster subsequent fetching.

        Arguments:
            config: A configuration dictionary, given during the object instantiation.

        Returns:
            An object subsequently passed to `_get_data`, e.g. a list of file paths and
            set splits.
        """
        raise NotImplementedError 
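
A concrete `_init_dataset` might simply collect the per-split file lists, as in this sketch (the directory layout, `data_dir` config key, and `*.jpg` pattern are assumptions for illustration):

import glob
import os

def _init_dataset(self, **config):
    base_dir = config['data_dir']  # assumed config key
    return {
        split: sorted(glob.glob(os.path.join(base_dir, split, '*.jpg')))
        for split in ('training', 'validation', 'test')
    }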
Example #14
Source File: train.py    From tensorflow-deeplab-v3 with MIT License
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
  """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.

  Returns:
    A tuple of images and labels.
  """
  dataset = tf.data.Dataset.from_tensor_slices(get_filenames(is_training, data_dir))
  dataset = dataset.flat_map(tf.data.TFRecordDataset)

  if is_training:
    # When choosing shuffle buffer sizes, larger sizes result in better
    # randomness, while smaller sizes have better performance.
    # Because this is a relatively small dataset, we choose to shuffle the full epoch.
    dataset = dataset.shuffle(buffer_size=_NUM_IMAGES['train'])

  dataset = dataset.map(parse_record)
  dataset = dataset.map(
      lambda image, label: preprocess_image(image, label, is_training))
  dataset = dataset.prefetch(batch_size)

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)

  iterator = dataset.make_one_shot_iterator()
  images, labels = iterator.get_next()

  return images, labels 
Example #15
Source File: train.py    From tensorflow-deeplab-v3 with MIT License
def get_filenames(is_training, data_dir):
  """Return a list of filenames.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: path to the directory containing the input data.

  Returns:
    A list of file names.
  """
  if is_training:
    return [os.path.join(data_dir, 'voc_train.record')]
  else:
    return [os.path.join(data_dir, 'voc_val.record')] 
Example #16
Source File: GMVAE.py    From GMVAE with MIT License
def generate_data(self, num_elements=1, category=0):
      """Generate data for a specified category

      Args:
          num_elements: (int) number of elements to generate
          category: (int) category from which we will generate data

      Returns:
          generated data according to num_elements
      """
      indices = (np.ones(num_elements)*category).astype(int).tolist()
      
      # category is specified with a one-hot array
      categorical = tf.one_hot(indices, self.num_classes)
      
      # infer the gaussian distribution according to the category
      mean, var = self.network.gaussian_from_categorical(categorical)
      
      # gaussian random sample by using the mean and variance
      gaussian = tf.random_normal(tf.shape(mean), mean, tf.sqrt(var))
      
      # generate new samples with the given gaussian
      _, out = self.network.output_from_gaussian(gaussian, self.output_size)
      
      return self.sess.run(out, feed_dict={self.network.temperature: self.temperature
                                          ,self.learning_rate:self.lr}) 
Example #17
Source File: input_fn.py    From tensorflow-triplet-loss with MIT License
def test_input_fn(data_dir, params):
    """Test input function for the MNIST dataset.

    Args:
        data_dir: (string) path to the data directory
        params: (Params) contains hyperparameters of the model (ex: `params.num_epochs`)
    """
    dataset = mnist_dataset.test(data_dir)
    dataset = dataset.batch(params.batch_size)
    dataset = dataset.prefetch(1)  # make sure you always have one batch ready to serve
    return dataset 
Example #18
Source File: input_fn.py    From tensorflow-triplet-loss with MIT License
def train_input_fn(data_dir, params):
    """Train input function for the MNIST dataset.

    Args:
        data_dir: (string) path to the data directory
        params: (Params) contains hyperparameters of the model (ex: `params.num_epochs`)
    """
    dataset = mnist_dataset.train(data_dir)
    dataset = dataset.shuffle(params.train_size)  # whole dataset into the buffer
    dataset = dataset.repeat(params.num_epochs)  # repeat for multiple epochs
    dataset = dataset.batch(params.batch_size)
    dataset = dataset.prefetch(1)  # make sure you always have one batch ready to serve
    return dataset 
Example #19
Source File: GMVAE.py    From GMVAE with MIT License
def random_generation(self, num_elements=1):
      """Random generation for each category

      Args:
          num_elements: (int) number of elements to generate

      Returns:
          generated data according to num_elements
      """
      # categories for each element
      arr = np.array([])
      for i in range(self.num_classes):
        arr = np.hstack([arr,np.ones(num_elements) * i] )
      indices = arr.astype(int).tolist()
      categorical = tf.one_hot(indices, self.num_classes)
      
      # infer the gaussian distribution according to the category
      mean, var = self.network.gaussian_from_categorical(categorical)
      
      # gaussian random sample by using the mean and variance
      gaussian = tf.random_normal(tf.shape(mean), mean, tf.sqrt(var))
      
      # generate new samples with the given gaussian
      _, out = self.network.output_from_gaussian(gaussian, self.output_size)
      
      return self.sess.run(out, feed_dict={self.network.temperature: self.temperature
                                          ,self.learning_rate:self.lr}) 
Example #20
Source File: model_fn.py    From tensorflow-triplet-loss with MIT License
def build_model(is_training, images, params):
    """Compute outputs of the model (embeddings for triplet loss).

    Args:
        is_training: (bool) whether we are training or not
        images: (dict) contains the inputs of the graph (features)
                this can be `tf.placeholder` or outputs of `tf.data`
        params: (Params) hyperparameters

    Returns:
        output: (tf.Tensor) output of the model
    """
    out = images
    # Define the number of channels of each convolution
    # For each block, we do: 3x3 conv -> batch norm -> relu -> 2x2 maxpool
    num_channels = params.num_channels
    bn_momentum = params.bn_momentum
    channels = [num_channels, num_channels * 2]
    for i, c in enumerate(channels):
        with tf.variable_scope('block_{}'.format(i+1)):
            out = tf.layers.conv2d(out, c, 3, padding='same')
            if params.use_batch_norm:
                out = tf.layers.batch_normalization(out, momentum=bn_momentum, training=is_training)
            out = tf.nn.relu(out)
            out = tf.layers.max_pooling2d(out, 2, 2)

    assert out.shape[1:] == [7, 7, num_channels * 2]

    out = tf.reshape(out, [-1, 7 * 7 * num_channels * 2])
    with tf.variable_scope('fc_1'):
        out = tf.layers.dense(out, params.embedding_size)

    return out 
Example #21
Source File: data_preprocessing.py    From tf-encrypted with Apache License 2.0
def data_prep_from_saved_model(
    graph_def,
    data_filenames,
    batch_size,
    data_prep_start_node="serialized_example:0",
    data_prep_end_node="DatasetToSingleElement:0"
):
  """Main function to extract data processing pipelines."""

  # Trim graph to keep only the nodes related to data pre-processing
  data_prep_end_node_name = data_prep_end_node.split(":")[0]
  gdef_trimmed = extract_sub_graph(
      graph_def,
      dest_nodes=[data_prep_end_node_name],
  )

  # Load TFRecord files then generate a Dataset of batch
  dataset = tf.data.TFRecordDataset(data_filenames)
  dataset = dataset.batch(batch_size)
  iterator = dataset.make_one_shot_iterator()
  dataset_b = iterator.get_next()

  # Preprocess data
  data_out, = tf.import_graph_def(
      gdef_trimmed,
      input_map={data_prep_start_node: dataset_b},
      return_elements=[data_prep_end_node],
  )

  # TFE expects tensors with fully defined shape
  fixed_shape = [batch_size] + data_out.get_shape().as_list()[1:]
  data_out = tf.reshape(data_out, fixed_shape)
  return data_out 
Example #22
Source File: run.py    From tf-encrypted with Apache License 2.0
def provide_input(self) -> tf.Tensor:
        """Prepare input data for prediction."""
        with tf.name_scope("loading"):
            prediction_input, expected_result = self._build_data_pipeline().get_next()
            print_op = tf.print("Expect", expected_result, summarize=self.BATCH_SIZE)
            with tf.control_dependencies([print_op]):
                prediction_input = tf.identity(prediction_input)

        with tf.name_scope("pre-processing"):
            prediction_input = tf.reshape(
                prediction_input, shape=(self.BATCH_SIZE, ModelOwner.FLATTENED_DIM)
            )
        return prediction_input 
Example #23
Source File: run.py    From tf-encrypted with Apache License 2.0
def _build_data_pipeline(self):
        """Build a reproducible tf.data iterator."""

        def normalize(image, label):
            image = tf.cast(image, tf.float32) / 255.0
            return image, label

        dataset = tf.data.TFRecordDataset([self.local_data_file])
        dataset = dataset.map(decode)
        dataset = dataset.map(normalize)
        dataset = dataset.repeat()
        dataset = dataset.batch(self.BATCH_SIZE)

        iterator = dataset.make_one_shot_iterator()
        return iterator 
Example #24
Source File: cifar10_main.py    From yolo_v2 with Apache License 2.0
def get_filenames(is_training, data_dir):
  """Returns a list of filenames."""
  data_dir = os.path.join(data_dir, 'cifar-10-batches-bin')

  assert os.path.exists(data_dir), (
      'Run cifar10_download_and_extract.py first to download and extract the '
      'CIFAR-10 data.')

  if is_training:
    return [
        os.path.join(data_dir, 'data_batch_%d.bin' % i)
        for i in range(1, _NUM_DATA_FILES + 1)
    ]
  else:
    return [os.path.join(data_dir, 'test_batch.bin')] 
Example #25
Source File: run.py    From tf-encrypted with Apache License 2.0
def _build_training_graph(self, training_data):
        """Build a graph for plaintext model training."""

        model = keras.Sequential()
        model.add(keras.layers.Dense(512, input_shape=[self.FLATTENED_DIM]))
        model.add(keras.layers.Activation("relu"))
        model.add(keras.layers.Dense(self.NUM_CLASSES, activation=None))

        # optimizer and data pipeline
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)

        def loss(model, inputs, targets):
            logits = model(inputs)
            per_element_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=targets, logits=logits
            )
            return tf.reduce_mean(per_element_loss)

        def grad(model, inputs, targets):
            loss_value = loss(model, inputs, targets)
            return loss_value, tf.gradients(loss_value, model.trainable_variables)

        def loop_body(i):
            x, y = training_data.get_next()
            _, grads = grad(model, x, y)
            update_op = optimizer.apply_gradients(zip(grads, model.trainable_variables))
            with tf.control_dependencies([update_op]):
                return i + 1

        loop = tf.while_loop(
            lambda i: i < self.ITERATIONS * self.EPOCHS, loop_body, loop_vars=(0,)
        )

        with tf.control_dependencies([loop]):
            print_op = tf.print("Training complete")
        with tf.control_dependencies([print_op]):
            return [tf.identity(x) for x in model.trainable_variables] 
Example #26
Source File: deepkt.py    From Deep-Knowledge-Tracing with MIT License
def evaluate(self,
                 dataset,
                 verbose=1,
                 steps=None,
                 callbacks=None):
        """Returns the loss value & metrics values for the model in test mode.
        Computation is done in batches.
        Arguments:
            dataset: `tf.data` dataset. Should return a
            tuple of `(inputs, (skills, targets))`.
            verbose: 0 or 1. Verbosity mode.
                0 = silent, 1 = progress bar.
            steps: Integer or `None`.
                Total number of steps (batches of samples)
                before declaring the evaluation round finished.
                Ignored with the default value of `None`.
                If x is a `tf.data` dataset and `steps` is
                None, 'evaluate' will run until the dataset is exhausted.
                This argument is not supported with array inputs.
            callbacks: List of `keras.callbacks.Callback` instances.
                List of callbacks to apply during evaluation.
                See [callbacks](/api_docs/python/tf/keras/callbacks).
        Returns:
            Scalar test loss (if the model has a single output and no metrics)
            or list of scalars (if the model has multiple outputs
            and/or metrics). The attribute `model.metrics_names` will give you
            the display labels for the scalar outputs.
        Raises:
            ValueError: in case of invalid arguments.
        """
        return super(DKTModel, self).evaluate(dataset,
                                              verbose=verbose,
                                              steps=steps,
                                              callbacks=callbacks) 
Example #27
Source File: base_model.py    From spl with GNU General Public License v3.0
def step(self, session):
        """Runs one training step by evaluating loss, parameter update, summary and output operations.
        
        The model receives data from the data pipeline automatically. In contrast to `sampled_step`, the model's output is not
        fed back to the model.
        Args:
            session: TF session object.
        Returns:
            loss, summary proto, prediction
        """
        pass 
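
A typical override evaluates the relevant ops in a single `session.run` call, roughly as sketched below (the `loss`, `update_op`, `summary_op`, and `outputs` attribute names are hypothetical, not taken from the spl code):

def step(self, session):
    # One training step; the data pipeline feeds the model, so no feed_dict.
    loss, _, summary, prediction = session.run(
        [self.loss, self.update_op, self.summary_op, self.outputs])
    return loss, summary, prediction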
Example #28
Source File: cifar10_main.py    From yolo_v2 with Apache License 2.0
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
  """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.

  Returns:
    A tuple of images and labels.
  """
  dataset = record_dataset(get_filenames(is_training, data_dir))

  if is_training:
    # When choosing shuffle buffer sizes, larger sizes result in better
    # randomness, while smaller sizes have better performance. Because CIFAR-10
    # is a relatively small dataset, we choose to shuffle the full epoch.
    dataset = dataset.shuffle(buffer_size=_NUM_IMAGES['train'])

  dataset = dataset.map(parse_record)
  dataset = dataset.map(
      lambda image, label: (preprocess_image(image, is_training), label))

  dataset = dataset.prefetch(2 * batch_size)

  # We call repeat after shuffling, rather than before, to prevent separate
  # epochs from blending together.
  dataset = dataset.repeat(num_epochs)

  # Batch results by up to batch_size, and then fetch the tuple from the
  # iterator.
  dataset = dataset.batch(batch_size)
  iterator = dataset.make_one_shot_iterator()
  images, labels = iterator.get_next()

  return images, labels 
Example #29
Source File: inception_v3.py    From training_results_v0.5 with Apache License 2.0
def dataset_iterator(self, batch_size, shuffle):
    """Constructs a real-data iterator over batches for train or eval.

    Args:
      batch_size: The effective batch size.
      shuffle: Whether or not to shuffle the data.

    Returns:
      A tf.data iterator.
    """
    file_pattern = os.path.join(self.data_dir, 'train-*'
                                if self.is_training else 'validation-*')
    dataset = tf.data.Dataset.list_files(file_pattern, shuffle=self.is_training)

    if self.is_training:
      dataset = dataset.repeat()

    def prefetch_dataset(filename):
      dataset = tf.data.TFRecordDataset(
          filename, buffer_size=FLAGS.prefetch_dataset_buffer_size)
      return dataset

    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            prefetch_dataset, cycle_length=FLAGS.num_files_infeed, sloppy=True))

    if shuffle and FLAGS.followup_shuffle_buffer_size > 0:
      dataset = dataset.shuffle(buffer_size=FLAGS.followup_shuffle_buffer_size)

    dataset = dataset.map(
        self.dataset_parser, num_parallel_calls=FLAGS.num_parallel_calls)

    dataset = dataset.prefetch(batch_size)

    dataset = dataset.batch(batch_size, drop_remainder=True)

    dataset = dataset.prefetch(2)  # Prefetch overlaps in-feed with training

    return dataset.make_one_shot_iterator() 
Example #30
Source File: dataset.py    From TransE-Knowledge-Graph-Embedding with MIT License
def _parse(line):
    """Parse train data."""
    cols_types = [[''], [''], ['']]
    return tf.decode_csv(line, record_defaults=cols_types, field_delim='\t')
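
A parser like this is typically mapped over a `tf.data.TextLineDataset`; a minimal usage sketch (the file name is a placeholder):

dataset = tf.data.TextLineDataset('train.txt')  # tab-separated triples
dataset = dataset.map(_parse)                   # -> three string columns per line
dataset = dataset.shuffle(10000).batch(128)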