Python tensorflow.data() Examples
The following are 30 code examples of tensorflow.data().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module tensorflow, or try the search function.
Example #1
Source File: base_dataset.py From hierarchical_loc with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _get_data(self, dataset, split_name, **config):
    """Reads the dataset splits using the Tensorflow `tf.data` API.

    This method should create a `tf.data.Dataset` object for the given data
    split, with named components defined through a dictionary mapping strings
    to tensors.

    It typically performs operations such as reading data from a file or from
    a Python generator, shuffling the elements or applying data augmentation
    to the training split. It should however NOT batch the dataset (left to
    the model).

    Arguments:
        dataset: An object returned by the `_init_dataset` method.
        split_name: A string, the name of the requested split, either
            `"training"`, `"validation"` or `"test"`.
        config: A configuration dictionary, given during the object
            instantiation.

    Returns:
        An object of type `tf.data.Dataset` for the requested split.

    Raises:
        NotImplementedError: Always; this implementation is only a
            placeholder for a concrete one.
    """
    raise NotImplementedError
Example #2
Source File: embedding_bert_intent_estimator_classifier.py From rasa_nlu_gq with Apache License 2.0 | 6 votes |
def _create_encoded_intents(self, intent_dict):
    """Encode every intent as one row of a matrix.

    When `self.intent_tokenization_flag` is falsy the result is an identity
    matrix with one row per intent. Otherwise each intent name is split on
    `self.intent_split_symbol` and the row gets a 1 in the column of every
    resulting token (bag-of-words encoding).

    :param intent_dict: mapping from intent name to its row index
    :return: numpy array with one row per intent
    """
    num_intents = len(intent_dict)
    if not self.intent_tokenization_flag:
        return np.eye(num_intents)

    token_to_column = self._create_intent_token_dict(
        list(intent_dict.keys()), self.intent_split_symbol)

    encoded = np.zeros((num_intents, len(token_to_column)))
    for intent_name, row in intent_dict.items():
        columns = [token_to_column[token]
                   for token in intent_name.split(self.intent_split_symbol)]
        encoded[row, columns] = 1
    return encoded


# data helpers:
Example #3
Source File: embedding_bert_intent_estimator_classifier.py From rasa_nlu_gq with Apache License 2.0 | 6 votes |
def input_fn(self,features, labels, batch_size, shuffle_num, mode):
    """Build the tf.data input pipeline and return one batch of tensors.

    :param features: dict describing the input x structure
    :param labels: np.array of input labels
    :param batch_size: int, number of examples per batch
    :param shuffle_num: int, shuffle buffer size (used in TRAIN mode only)
    :param mode: tf.estimator.ModeKeys.TRAIN enables shuffling and repeating
                 for `self.epochs`; any other mode only batches
    :return: tuple (data, labels) taken from a one-shot iterator
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
    if is_training:
        pipeline = pipeline.shuffle(shuffle_num)
    pipeline = pipeline.batch(batch_size)
    if is_training:
        pipeline = pipeline.repeat(self.epochs)

    batch_data, batch_labels = pipeline.make_one_shot_iterator().get_next()
    return batch_data, batch_labels
Example #4
Source File: test.py From seg-mentor with MIT License | 6 votes |
def get_data_feed(val_rec_fname, pixels=None):
    '''
    Build an initializable iterator yielding a 4-element feed:
    orig_shape, scale, image, annotation.

    When `pixels` is given, image and annotation go through
    utils.augmentation.nonrandom_rescale with target [pixels, pixels];
    otherwise scale is the constant 1 and the annotation is cast to int32.

    TODO: unify parts with prepare_graph()
    '''
    records = data.TFRecordDataset([val_rec_fname])
    feed = records.map(utils.tfrecordify.parse_record)  # .batch(1)

    # The original shape has to be captured before any rescaling happens.
    feed = feed.map(lambda img, ann: (tf.to_float(tf.shape(img)), img, ann))

    if pixels is None:
        feed = feed.map(lambda shape, img, ann:
                        (shape, 1, img, tf.cast(ann, tf.int32)))
    else:
        feed = feed.map(lambda orig_shape_f, img, ann:
                        (orig_shape_f, tf.reduce_min(pixels/orig_shape_f)) +
                        utils.augmentation.nonrandom_rescale(img, ann, [pixels, pixels]))

    return feed.repeat().make_initializable_iterator()
Example #5
Source File: run.py From tf-encrypted with Apache License 2.0 | 6 votes |
def _build_data_pipeline(self):
    """Build a reproducible tf.data iterator.

    Records from `self.local_data_file` are decoded, scaled by 1/255,
    reshaped to `[self.FLATTENED_DIM]`, repeated indefinitely and batched
    with `self.BATCH_SIZE`. Returns a one-shot iterator over the batches.
    """

    def normalize(image, label):
        # Cast to float32 and divide by 255.
        return tf.cast(image, tf.float32) / 255.0, label

    def flatten(image, label):
        # Reshape each image to a single dimension of FLATTENED_DIM entries.
        return tf.reshape(image, shape=[self.FLATTENED_DIM]), label

    pipeline = tf.data.TFRecordDataset([self.local_data_file])
    for transform in (decode, normalize, flatten):
        pipeline = pipeline.map(transform)
    pipeline = pipeline.repeat().batch(self.BATCH_SIZE)

    return pipeline.make_one_shot_iterator()
Example #6
Source File: GMVAE.py From GMVAE with MIT License | 6 votes |
def plot_latent_space(self, data, labels, save=False):
    """Scatter-plot the first two latent dimensions of `data`.

    Args:
        data: (array) data passed to `self.latent_features`
        labels: (array) values used to colour the points
        save: (bool) when true, the figure is written to 'latent_space.png'

    Returns:
        fig: (figure) plot of the latent space
    """
    # Latent representation of the input.
    latent = self.latent_features(data)

    # Only the first two latent dimensions are drawn.
    fig = plt.figure(figsize=(8, 6))
    first_dim = latent[:, 0]
    second_dim = latent[:, 1]
    palette = plt.cm.get_cmap('jet', 10)
    plt.scatter(first_dim, second_dim, c=labels, marker='o',
                edgecolor='none', cmap=palette, s=10)
    plt.colorbar()

    if save:
        fig.savefig('latent_space.png')
    return fig
Example #7
Source File: inception_v3.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def tensor_transform_fn(data, perm):
    """Optionally transpose a tensor.

    This function is used to transpose an image tensor on the host and then
    perform an inverse transpose on the TPU. The transpose on the TPU gets
    effectively elided thus voiding any associated computational cost.

    NOTE: Eventually the compiler will be able to detect when this kind of
    operation may prove beneficial and perform these types of transformations
    implicitly, voiding the need for user intervention

    Args:
        data: Tensor to be transposed
        perm: New ordering of dimensions

    Returns:
        The transposed tensor when `FLAGS.transpose_enabled` is set,
        otherwise `data` unchanged.
    """
    if not FLAGS.transpose_enabled:
        return data
    return tf.transpose(data, perm)
Example #8
Source File: data_reader.py From kfac with Apache License 2.0 | 6 votes |
def __call__(self, batch_size):
    """Reads `batch_size` data.

    Args:
        batch_size: Tensor of type `int32`, batch size of the data to be
            retrieved from the dataset. `batch_size` should be less than or
            equal to `max_batch_size`.

    Returns:
        Read data, An iterable of tensors with batch size equal to
        `batch_size`.
    """
    max_allowed = tf.convert_to_tensor(self._max_batch_size, dtype=tf.int32)
    size_guard = tf.assert_less_equal(
        batch_size,
        max_allowed,
        message='Data set read failure, Batch size greater than max allowed.')

    # The slice is created under the guard so the check runs before the read.
    with tf.control_dependencies([size_guard]):
        return _slice_data(self._dataset, batch_size)
Example #9
Source File: dataset.py From causal-text-embeddings with MIT License | 6 votes |
def make_input_id_masker(tokenizer, seed):
    """Return a function that adds masked-LM fields to a feature dict.

    (One of) Bert's unsupervised objectives is to mask some fraction of the
    input words and predict the masked words. The returned `masker` calls
    `create_masked_lm_predictions` on `data['token_ids']` and returns a new
    dict holding the original entries plus the four masking outputs.
    """

    def masker(data):
        predictions = create_masked_lm_predictions(
            data['token_ids'],
            # pre-training defaults from Bert docs
            masked_lm_prob=0.15,
            max_predictions_per_seq=20,
            vocab=tokenizer.vocab,
            seed=seed)
        (maybe_masked_input_ids, masked_lm_positions,
         masked_lm_ids, masked_lm_weights) = predictions

        augmented = dict(data)
        augmented.update({
            'maybe_masked_input_ids': maybe_masked_input_ids,
            'masked_lm_positions': masked_lm_positions,
            'masked_lm_ids': masked_lm_ids,
            'masked_lm_weights': masked_lm_weights,
        })
        return augmented

    return masker
Example #10
Source File: dataset.py From causal-text-embeddings with MIT License | 6 votes |
def make_extra_feature_cleaning():
    """Return a function that cleans and extends a feature dict.

    The returned function rewrites `num_authors` and `year` in the dict it
    receives (in place) and returns a new dict that additionally carries
    `equation_referenced`, `theorem_referenced`, `buzzy_title` and `index`.
    """

    def extra_feature_cleaning(data):
        # Cap the author count at 6, then shift down by one.
        capped_authors = tf.minimum(data['num_authors'], 6)
        data['num_authors'] = capped_authors - 1
        # Express the year as an offset from 2007.
        data['year'] = data['year'] - 2007

        # some extras: reference counts capped at 1
        equation_flag = tf.minimum(data['num_ref_to_equations'], 1)
        theorem_flag = tf.minimum(data['num_ref_to_theorems'], 1)

        # buzzy title: 1 when the sum of the four title indicators is non-zero
        buzz_total = data["title_contains_deep"] + data["title_contains_neural"] + \
            data["title_contains_embedding"] + data["title_contains_gan"]
        buzz_flag = tf.cast(tf.not_equal(buzz_total, 0), tf.int32)

        cleaned = dict(data)
        cleaned['equation_referenced'] = equation_flag
        cleaned['theorem_referenced'] = theorem_flag
        cleaned['buzzy_title'] = buzz_flag
        cleaned['index'] = data['id']
        return cleaned

    return extra_feature_cleaning
Example #11
Source File: tutorial_1.py From BERT_TF with Apache License 2.0 | 6 votes |
def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    """Build a padded, batched dataset from `generator_fn(words, tags)`.

    Args:
        words: first argument forwarded to `generator_fn`
        tags: second argument forwarded to `generator_fn`
        params: optional dict; `buffer` and `epochs` are read when
            `shuffle_and_repeat` is set, `batch_size` defaults to 20
        shuffle_and_repeat: whether to shuffle and repeat the dataset

    Returns:
        The resulting dataset, prefetching one batch.
    """
    if params is None:
        params = {}

    # Each element is ((tokens, length), tags).
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), '0')

    element_source = functools.partial(generator_fn, words, tags)
    dataset = tf.data.Dataset.from_generator(
        element_source, output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer'])
        dataset = dataset.repeat(params['epochs'])

    batch_size = params.get('batch_size', 20)
    dataset = dataset.padded_batch(batch_size, shapes, defaults)
    return dataset.prefetch(1)


## Global Logic of the model_fn
Example #12
Source File: data_reader.py From kfac with Apache License 2.0 | 6 votes |
def __call__(self, batch_size):
    """Reads `batch_size` data and stores the read batch.

    Args:
        batch_size: Tensor of type `int32`, batch size of the data to be
            retrieved from the dataset. `batch_size` should be less than or
            equal to `max_batch_size`.

    Returns:
        Read data, An iterable of tensors with batch size equal to
        `batch_size`.
    """
    # Delegate the actual read to the parent reader.
    sliced_data = super(CachedDataReader, self).__call__(batch_size)

    # We need to make sure we read the cached batch before we update it!
    with tf.control_dependencies(self._cached_batch):
        # Record the size of the batch that is about to be cached.
        batch_size_assign_op = self._cached_batch_size.assign(batch_size)
        # Write each freshly read tensor into the first `batch_size` rows of
        # its storage.
        data_assign_ops = [
            prev[:batch_size].assign(cur)  # yes, this actually works
            for prev, cur in zip(self._cached_batch_storage, sliced_data)
        ]
        # The returned tensors are created under the assign ops, so
        # evaluating them also updates the cache.
        with tf.control_dependencies(data_assign_ops + [batch_size_assign_op]):
            return [tf.identity(sdata) for sdata in sliced_data]
Example #13
Source File: base_dataset.py From hierarchical_loc with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _init_dataset(self, **config):
    """Prepare the dataset for reading.

    This method should configure the dataset for later fetching through
    `_get_data`, such as downloading the data if it is not stored locally, or
    reading the list of data files from disk. Ideally, especially in the case
    of large images, this method should NOT read all the dataset into memory,
    but rather prepare for faster subsequent fetching.

    Arguments:
        config: A configuration dictionary, given during the object
            instantiation.

    Returns:
        An object subsequently passed to `_get_data`, e.g. a list of file
        paths and set splits.

    Raises:
        NotImplementedError: Always; this implementation is only a
            placeholder for a concrete one.
    """
    raise NotImplementedError
Example #14
Source File: train.py From tensorflow-deeplab-v3 with MIT License | 5 votes |
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    """Input_fn using the tf.data input pipeline.

    Reads the TFRecord files returned by `get_filenames`, parses and
    preprocesses every record, and returns one batch from a one-shot iterator.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.

    Returns:
        A tuple of images and labels.
    """
    filenames = get_filenames(is_training, data_dir)
    records = tf.data.Dataset.from_tensor_slices(filenames)
    records = records.flat_map(tf.data.TFRecordDataset)

    if is_training:
        # When choosing shuffle buffer sizes, larger sizes result in better
        # randomness, while smaller sizes have better performance. Here the
        # buffer holds _NUM_IMAGES['train'] records.
        records = records.shuffle(buffer_size=_NUM_IMAGES['train'])

    examples = records.map(parse_record)
    examples = examples.map(
        lambda image, label: preprocess_image(image, label, is_training))
    examples = examples.prefetch(batch_size)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    batches = examples.repeat(num_epochs).batch(batch_size)

    images, labels = batches.make_one_shot_iterator().get_next()
    return images, labels
Example #15
Source File: train.py From tensorflow-deeplab-v3 with MIT License | 5 votes |
def get_filenames(is_training, data_dir):
    """Return a list of filenames.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: path to the directory containing the input data.

    Returns:
        A one-element list holding the path of the training record file when
        `is_training` is true, otherwise the validation record file.
    """
    record_name = 'voc_train.record' if is_training else 'voc_val.record'
    return [os.path.join(data_dir, record_name)]
Example #16
Source File: GMVAE.py From GMVAE with MIT License | 5 votes |
def generate_data(self, num_elements=1, category=0):
    """Generate data for a specified category

    Args:
        num_elements: (int) number of elements to generate
        category: (int) category from which we will generate data

    Returns:
        generated data according to num_elements
    """
    # One copy of the category id per requested element.
    category_ids = np.ones(num_elements) * category
    indices = category_ids.astype(int).tolist()

    # The category is given to the network as a one-hot array.
    one_hot_category = tf.one_hot(indices, self.num_classes)

    # Gaussian parameters associated with the category.
    mean, var = self.network.gaussian_from_categorical(one_hot_category)

    # Draw a sample using the mean and the standard deviation sqrt(var).
    sample = tf.random_normal(tf.shape(mean), mean, tf.sqrt(var))

    # Produce new samples from the gaussian draw.
    _, generated = self.network.output_from_gaussian(sample, self.output_size)

    feed = {self.network.temperature: self.temperature,
            self.learning_rate: self.lr}
    return self.sess.run(generated, feed_dict=feed)
Example #17
Source File: input_fn.py From tensorflow-triplet-loss with MIT License | 5 votes |
def test_input_fn(data_dir, params):
    """Test input function for the MNIST dataset.

    Args:
        data_dir: (string) path to the data directory
        params: (Params) contains hyperparameters of the model
            (only `params.batch_size` is read here)

    Returns:
        The batched test dataset, prefetching one batch.
    """
    batches = mnist_dataset.test(data_dir).batch(params.batch_size)
    # Prefetch so that one batch is always ready to serve.
    return batches.prefetch(1)
Example #18
Source File: input_fn.py From tensorflow-triplet-loss with MIT License | 5 votes |
def train_input_fn(data_dir, params):
    """Train input function for the MNIST dataset.

    Args:
        data_dir: (string) path to the data directory
        params: (Params) contains hyperparameters of the model
            (`train_size`, `num_epochs` and `batch_size` are read here)

    Returns:
        The shuffled, repeated and batched training dataset.
    """
    examples = mnist_dataset.train(data_dir)
    # The shuffle buffer is sized with params.train_size.
    examples = examples.shuffle(params.train_size)
    # Repeat for multiple epochs, then group into batches.
    batches = examples.repeat(params.num_epochs).batch(params.batch_size)
    # Prefetch so that one batch is always ready to serve.
    return batches.prefetch(1)
Example #19
Source File: GMVAE.py From GMVAE with MIT License | 5 votes |
def random_generation(self, num_elements=1):
    """Random generation for each category

    Generates `num_elements` samples for every one of the `self.num_classes`
    categories, ordered by category.

    Args:
        num_elements: (int) number of elements to generate per category

    Returns:
        generated data according to num_elements
    """
    # Category id of each element: num_elements zeros, then num_elements
    # ones, and so on. np.repeat builds this in a single call instead of
    # re-allocating a growing array once per class.
    indices = np.repeat(np.arange(self.num_classes), num_elements).tolist()
    categorical = tf.one_hot(indices, self.num_classes)

    # infer the gaussian distribution according to the category
    mean, var = self.network.gaussian_from_categorical(categorical)

    # gaussian random sample by using the mean and variance
    gaussian = tf.random_normal(tf.shape(mean), mean, tf.sqrt(var))

    # generate new samples with the given gaussian
    _, out = self.network.output_from_gaussian(gaussian, self.output_size)

    return self.sess.run(out, feed_dict={self.network.temperature: self.temperature,
                                         self.learning_rate: self.lr})
Example #20
Source File: model_fn.py From tensorflow-triplet-loss with MIT License | 5 votes |
def build_model(is_training, images, params):
    """Compute outputs of the model (embeddings for triplet loss).

    Args:
        is_training: (bool) whether we are training or not
        images: (dict) contains the inputs of the graph (features)
            this can be `tf.placeholder` or outputs of `tf.data`
        params: (Params) hyperparameters

    Returns:
        output: (tf.Tensor) output of the model
    """
    base_channels = params.num_channels
    momentum = params.bn_momentum

    # Two blocks, each: 3x3 conv -> batch norm -> relu -> 2x2 maxpool.
    # The second block doubles the number of channels.
    features = images
    block_widths = [base_channels, base_channels * 2]
    for block_number, width in enumerate(block_widths, start=1):
        with tf.variable_scope('block_{}'.format(block_number)):
            features = tf.layers.conv2d(features, width, 3, padding='same')
            if params.use_batch_norm:
                features = tf.layers.batch_normalization(
                    features, momentum=momentum, training=is_training)
            features = tf.nn.relu(features)
            features = tf.layers.max_pooling2d(features, 2, 2)

    # The flattening below relies on a 7x7 spatial size after both poolings.
    assert features.shape[1:] == [7, 7, base_channels * 2]

    flat = tf.reshape(features, [-1, 7 * 7 * base_channels * 2])
    with tf.variable_scope('fc_1'):
        embeddings = tf.layers.dense(flat, params.embedding_size)

    return embeddings
Example #21
Source File: data_preprocessing.py From tf-encrypted with Apache License 2.0 | 5 votes |
def data_prep_from_saved_model(
        graph_def,
        data_filenames,
        batch_size,
        data_prep_start_node="serialized_example:0",
        data_prep_end_node="DatasetToSingleElement:0"
):
    """Main function to extract data processing pipelines.

    Keeps only the part of `graph_def` needed to compute
    `data_prep_end_node`, feeds it batches of records read from
    `data_filenames` through `data_prep_start_node`, and returns the
    resulting tensor reshaped so its first dimension is `batch_size`.
    """
    # The node name is the tensor name without its ":<index>" suffix.
    end_node_name = data_prep_end_node.partition(":")[0]
    trimmed_graph = extract_sub_graph(
        graph_def,
        dest_nodes=[end_node_name],
    )

    # Read the TFRecord files and take one batch of records.
    records = tf.data.TFRecordDataset(data_filenames).batch(batch_size)
    record_batch = records.make_one_shot_iterator().get_next()

    # Run the batch through the imported pre-processing sub-graph.
    (data_out,) = tf.import_graph_def(
        trimmed_graph,
        input_map={data_prep_start_node: record_batch},
        return_elements=[data_prep_end_node],
    )

    # TFE expects tensors with fully defined shape
    trailing_dims = data_out.get_shape().as_list()[1:]
    return tf.reshape(data_out, [batch_size] + trailing_dims)
Example #22
Source File: run.py From tf-encrypted with Apache License 2.0 | 5 votes |
def provide_input(self) -> tf.Tensor:
    """Prepare input data for prediction.

    Takes one batch from the data pipeline, prints the expected result when
    the input is evaluated, and reshapes the input to
    `(self.BATCH_SIZE, ModelOwner.FLATTENED_DIM)`.
    """
    with tf.name_scope("loading"):
        batch_iterator = self._build_data_pipeline()
        features, expected = batch_iterator.get_next()

        # Tie the print to the features so it runs whenever they are fetched.
        show_expected = tf.print("Expect", expected, summarize=self.BATCH_SIZE)
        with tf.control_dependencies([show_expected]):
            features = tf.identity(features)

    with tf.name_scope("pre-processing"):
        target_shape = (self.BATCH_SIZE, ModelOwner.FLATTENED_DIM)
        features = tf.reshape(features, shape=target_shape)

    return features
Example #23
Source File: run.py From tf-encrypted with Apache License 2.0 | 5 votes |
def _build_data_pipeline(self):
    """Build a reproducible tf.data iterator.

    Records from `self.local_data_file` are decoded, scaled by 1/255,
    repeated indefinitely and batched with `self.BATCH_SIZE`. Returns a
    one-shot iterator over the batches.
    """

    def normalize(image, label):
        # Cast to float32 and divide by 255.
        return tf.cast(image, tf.float32) / 255.0, label

    pipeline = tf.data.TFRecordDataset([self.local_data_file])
    for transform in (decode, normalize):
        pipeline = pipeline.map(transform)
    pipeline = pipeline.repeat().batch(self.BATCH_SIZE)

    return pipeline.make_one_shot_iterator()
Example #24
Source File: cifar10_main.py From yolo_v2 with Apache License 2.0 | 5 votes |
def get_filenames(is_training, data_dir):
    """Returns a list of filenames.

    The files are looked up in the 'cifar-10-batches-bin' sub-directory of
    `data_dir`, which must already exist. Training uses the numbered data
    batch files 1.._NUM_DATA_FILES; otherwise the single test batch file is
    returned.
    """
    batches_dir = os.path.join(data_dir, 'cifar-10-batches-bin')

    assert os.path.exists(batches_dir), (
        'Run cifar10_download_and_extract.py first to download and extract the '
        'CIFAR-10 data.')

    if not is_training:
        return [os.path.join(batches_dir, 'test_batch.bin')]

    training_files = []
    for batch_number in range(1, _NUM_DATA_FILES + 1):
        training_files.append(
            os.path.join(batches_dir, 'data_batch_%d.bin' % batch_number))
    return training_files
Example #25
Source File: run.py From tf-encrypted with Apache License 2.0 | 5 votes |
def _build_training_graph(self, training_data):
    """Build a graph for plaintext model training.

    Args:
        training_data: object whose `get_next()` returns an
            `(inputs, targets)` pair; it is called inside the loop body.

    Returns:
        A list with one `tf.identity` tensor per trainable variable of the
        model, created under control dependencies on the training loop and
        the completion print.
    """
    # Two dense layers with a ReLU in between; the last layer outputs logits.
    model = keras.Sequential()
    model.add(keras.layers.Dense(512, input_shape=[self.FLATTENED_DIM]))
    model.add(keras.layers.Activation("relu"))
    model.add(keras.layers.Dense(self.NUM_CLASSES, activation=None))

    # optimizer and data pipeline
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)

    def loss(model, inputs, targets):
        # Sparse softmax cross-entropy between the model logits and targets.
        logits = model(inputs)
        per_element_loss = tf.losses.sparse_softmax_cross_entropy(
            labels=targets, logits=logits
        )
        return tf.reduce_mean(per_element_loss)

    def grad(model, inputs, targets):
        # Loss value together with its gradients w.r.t. the model variables.
        loss_value = loss(model, inputs, targets)
        return loss_value, tf.gradients(loss_value, model.trainable_variables)

    def loop_body(i):
        # One training step: fetch a batch, apply the gradients, then bump
        # the counter only after the update has run.
        x, y = training_data.get_next()
        _, grads = grad(model, x, y)
        update_op = optimizer.apply_gradients(zip(grads, model.trainable_variables))
        with tf.control_dependencies([update_op]):
            return i + 1

    # Run ITERATIONS * EPOCHS training steps inside the graph.
    loop = tf.while_loop(
        lambda i: i < self.ITERATIONS * self.EPOCHS, loop_body, loop_vars=(0,)
    )

    # The print is ordered after the loop ...
    with tf.control_dependencies([loop]):
        print_op = tf.print("Training complete")

    # ... and the returned tensors are ordered after the print.
    with tf.control_dependencies([print_op]):
        return [tf.identity(x) for x in model.trainable_variables]
Example #26
Source File: deepkt.py From Deep-Knowledge-Tracing with MIT License | 5 votes |
def evaluate(self, dataset, verbose=1, steps=None, callbacks=None):
    """Returns the loss value & metrics values for the model in test mode.

    Computation is done in batches. This method forwards all of its
    arguments to the parent class' `evaluate`.

    Arguments:
        dataset: `tf.data` dataset. Should return a tuple of
            `(inputs, (skills, targets))`.
        verbose: 0 or 1. Verbosity mode. 0 = silent, 1 = progress bar.
        steps: Integer or `None`. Total number of steps (batches of samples)
            before declaring the evaluation round finished. Ignored with the
            default value of `None`. If `dataset` is a `tf.data` dataset and
            `steps` is None, 'evaluate' will run until the dataset is
            exhausted. This argument is not supported with array inputs.
        callbacks: List of `keras.callbacks.Callback` instances. List of
            callbacks to apply during evaluation.
            See [callbacks](/api_docs/python/tf/keras/callbacks).

    Returns:
        Scalar test loss (if the model has a single output and no metrics)
        or list of scalars (if the model has multiple outputs and/or
        metrics). The attribute `model.metrics_names` will give you the
        display labels for the scalar outputs.

    Raises:
        ValueError: in case of invalid arguments.
    """
    return super(DKTModel, self).evaluate(dataset, verbose=verbose, steps=steps, callbacks=callbacks)
Example #27
Source File: base_model.py From spl with GNU General Public License v3.0 | 5 votes |
def step(self, session):
    """Runs one training step by evaluating loss, parameter update, summary
    and output operations.

    Model receives data from the data pipeline automatically. In contrast to
    `sampled_step`, model's output is not fed back to the model.

    Args:
        session: TF session object.

    Returns:
        loss, summary proto, prediction
    """
    # No implementation here: this body does nothing and returns None.
    pass
Example #28
Source File: cifar10_main.py From yolo_v2 with Apache License 2.0 | 5 votes |
def input_fn(is_training, data_dir, batch_size, num_epochs=1):
    """Input_fn using the tf.data input pipeline for CIFAR-10 dataset.

    Args:
        is_training: A boolean denoting whether the input is for training.
        data_dir: The directory containing the input data.
        batch_size: The number of samples per batch.
        num_epochs: The number of epochs to repeat the dataset.

    Returns:
        A tuple of images and labels.
    """
    filenames = get_filenames(is_training, data_dir)
    records = record_dataset(filenames)

    if is_training:
        # When choosing shuffle buffer sizes, larger sizes result in better
        # randomness, while smaller sizes have better performance. Because
        # CIFAR-10 is a relatively small dataset, we choose to shuffle the
        # full epoch.
        records = records.shuffle(buffer_size=_NUM_IMAGES['train'])

    examples = records.map(parse_record)
    examples = examples.map(
        lambda image, label: (preprocess_image(image, is_training), label))
    examples = examples.prefetch(2 * batch_size)

    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    examples = examples.repeat(num_epochs)

    # Batch results by up to batch_size, and then fetch the tuple from the
    # iterator.
    batches = examples.batch(batch_size)
    images, labels = batches.make_one_shot_iterator().get_next()

    return images, labels
Example #29
Source File: inception_v3.py From training_results_v0.5 with Apache License 2.0 | 5 votes |
def dataset_iterator(self, batch_size, shuffle):
    """Constructs a real-data iterator over batches for train or eval.

    Args:
        batch_size: The effective batch size.
        shuffle: Whether or not to shuffle the data.

    Returns:
        A tf.data iterator.
    """
    # Training reads the 'train-*' files, evaluation the 'validation-*' ones.
    if self.is_training:
        shard_glob = 'train-*'
    else:
        shard_glob = 'validation-*'
    file_pattern = os.path.join(self.data_dir, shard_glob)

    files = tf.data.Dataset.list_files(file_pattern, shuffle=self.is_training)
    if self.is_training:
        files = files.repeat()

    def prefetch_dataset(filename):
        # Open one record file with the configured read buffer.
        return tf.data.TFRecordDataset(
            filename, buffer_size=FLAGS.prefetch_dataset_buffer_size)

    # Read several files concurrently.
    interleave = tf.contrib.data.parallel_interleave(
        prefetch_dataset, cycle_length=FLAGS.num_files_infeed, sloppy=True)
    records = files.apply(interleave)

    if shuffle and FLAGS.followup_shuffle_buffer_size > 0:
        records = records.shuffle(
            buffer_size=FLAGS.followup_shuffle_buffer_size)

    examples = records.map(
        self.dataset_parser, num_parallel_calls=FLAGS.num_parallel_calls)
    examples = examples.prefetch(batch_size)

    batches = examples.batch(batch_size, drop_remainder=True)
    batches = batches.prefetch(2)  # Prefetch overlaps in-feed with training

    return batches.make_one_shot_iterator()
Example #30
Source File: dataset.py From TransE-Knowledge-Graph-Embedding with MIT License | 5 votes |
def _parse(line):
    """Parse train data.

    Decodes a tab-separated line into three columns, each defaulting to the
    empty string.
    """
    string_defaults = [[''] for _ in range(3)]
    return tf.decode_csv(
        line, record_defaults=string_defaults, field_delim='\t')