Python tensorflow.record() Examples

The following are 11 code examples of tensorflow.record(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow , or try the search function

Example #1

Source File: musdb_to_tfrecord.py From vimss with GNU General Public License v3.0

5 votes

def _process_dataset(filenames,
                     output_directory,
                     prefix,
                     num_shards):
    """Processes and saves list of audio files as TFRecords.
    Args:
    filenames: list of strings; each string is a path to an audio file
    channel_names: list of strings; each string is a channel name (vocals, bass, drums etc)
    labels: map of string to integer; id for all channel name
    output_directory: path where output files should be created
    prefix: string; prefix for each file
    num_shards: number of chucks to split the filenames into
    Returns:
    files: list of tf-record filepaths created from processing the dataset.
    """
    _check_or_create_dir(output_directory)
    chunksize = int(math.ceil(len(filenames) / num_shards))

    pool = Pool(multiprocessing.cpu_count()-1)

    def output_file(shard_idx):
        return os.path.join(output_directory, '%s-%.5d-of-%.5d' % (prefix, shard_idx, num_shards))

    # chunk data consists of chunk_filenames and output_file
    chunk_data = [(filenames[shard * chunksize: (shard + 1) * chunksize],
                   output_file(shard)) for shard in range(num_shards)]

    files = pool.map(_process_audio_files_batch, chunk_data)

    return files

Example #2

Source File: urmp_to_tfrecords.py From vimss with GNU General Public License v3.0

5 votes

def _process_dataset(filenames,
                     output_directory,
                     prefix,
                     num_shards):
    """Processes and saves list of audio files as TFRecords.
    Args:
    filenames: list of strings; each string is a path to an audio file
    channel_names: list of strings; each string is a channel name (vocals, bass, drums etc)
    labels: map of string to integer; id for all channel name
    output_directory: path where output files should be created
    prefix: string; prefix for each file
    num_shards: number of chucks to split the filenames into
    Returns:
    files: list of tf-record filepaths created from processing the dataset.
    """
    _check_or_create_dir(output_directory)
    chunksize = int(math.ceil(len(filenames) / float(num_shards)))

    pool = Pool(multiprocessing.cpu_count()-1)

    def output_file(shard_idx):
        return os.path.join(output_directory, '%s-%.5d-of-%.5d' % (prefix, shard_idx, num_shards))

    # chunk data consists of chunk_filenames and output_file
    chunk_data = [(filenames[shard * chunksize: (shard + 1) * chunksize],
                  output_file(shard)) for shard in range(num_shards)]

    files = pool.map(_process_audio_files_batch, chunk_data)

    return files

Example #3

Source File: musdb_input.py From vimss with GNU General Public License v3.0

5 votes

def dataset_parser(self, value):
        """Parse an audio example record from a serialized string Tensor."""
        keys_to_features = {
            'audio/file_basename':
                tf.FixedLenFeature([], tf.string, ''),
            'audio/encoded':
                tf.VarLenFeature(tf.float32),
            'audio/sample_rate':
                tf.FixedLenFeature([], tf.int64, SAMPLE_RATE),
            'audio/sample_idx':
                tf.FixedLenFeature([], tf.int64, -1),
            'audio/num_samples':
                tf.FixedLenFeature([], tf.int64, NUM_SAMPLES),
            'audio/channels':
                tf.FixedLenFeature([], tf.int64, CHANNELS),
            'audio/num_sources':
                tf.FixedLenFeature([], tf.int64, NUM_SOURCES)
        }

        parsed = tf.parse_single_example(value, keys_to_features)
        audio_data = tf.sparse_tensor_to_dense(parsed['audio/encoded'], default_value=0)
        audio_shape = tf.stack([MIX_WITH_PADDING + NUM_SOURCES*NUM_SAMPLES])
        audio_data = tf.reshape(audio_data, audio_shape)
        mix, sources = tf.reshape(audio_data[:MIX_WITH_PADDING], tf.stack([MIX_WITH_PADDING, CHANNELS])), \
                       tf.reshape(audio_data[MIX_WITH_PADDING:], tf.stack([NUM_SOURCES, NUM_SAMPLES, CHANNELS]))
        mix = tf.cast(mix, tf.bfloat16)
        sources = tf.cast(sources, tf.bfloat16)
        if self.is_training:
            features = {'mix': mix}
        else:
            features = {'mix': mix, 'filename': parsed['audio/file_basename'], 'sample_id': parsed['audio/sample_idx']}
        return features, sources

Example #4

Source File: data_mani.py From self_driving_pi_car with MIT License

5 votes

def create_record(record_path,
                  data,
                  labels,
                  height,
                  width,
                  channels):
    """
    Fuction to create one tf.record using two numpy arrays.
    The array in data is expected to be flat.

    :param record_path: path to save the tf.record
    :type record_path: str
    :param data: dataset
    :type data: np.array
    :param label: labels
    :type label: np.array
    :param height: image height
    :type height: int
    :param width: image width
    :type width: int
    :param channels: image channels
    :type channels: int
    """
    assert data.shape[1] == height * width * channels
    writer = tf.python_io.TFRecordWriter(record_path)
    for i, e in enumerate(data):
        img_str = data[i].tostring()
        label_str = labels[i].tostring()
        example = tf.train.Example(features=tf.train.Features(feature={
            'height': _int64_feature(height),
            'width': _int64_feature(width),
            'channels': _int64_feature(channels),
            'image_raw': _bytes_feature(img_str),
            'labels_raw': _bytes_feature(label_str)}))

        writer.write(example.SerializeToString())
    writer.close()

Example #5

Source File: run_ner.py From KBQA-BERT with MIT License

5 votes

def filed_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file, mode=None
):
    """
    将数据转化为TF_Record 结构，作为模型数据输入
    :param examples:  样本
    :param label_list:标签list
    :param max_seq_length: 预先设定的最大序列长度
    :param tokenizer: tokenizer 对象
    :param output_file: tf.record 输出路径
    :param mode:
    :return:
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    # 遍历训练数据
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        # 对于每一个训练样本,
        feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature(feature.label_ids)
        # features["label_mask"] = create_int_feature(feature.label_mask)
        # tf.train.Example/Feature 是一种协议，方便序列化？？？
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())

Example #6

Source File: run_ner.py From KBQA-BERT with MIT License

5 votes

def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
        # "label_ids":tf.VarLenFeature(tf.int64),
        # "label_mask": tf.FixedLenFeature([seq_length], tf.int64),
    }

    def _decode_record(record, name_to_features):
        example = tf.parse_single_example(record, name_to_features)
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        return example

    def input_fn(params):
        batch_size = params["batch_size"]
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)
        d = d.apply(tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder
        ))
        return d

    return input_fn

Example #7

Source File: parse_sdf_utils.py From deep-molecular-massspec with Apache License 2.0

5 votes

def validate_spectra_array_contents(record_path_name, hparams,
                                    spectra_array_path_name):
  """Checks that np.array containing spectra matches contents of record.

  Args:
    record_path_name: pathname to tf.Record file matching np.array
    hparams: See get_dataset_from_record
    spectra_array_path_name : pathname to spectra np.array.
  Raises:
    ValueError: if values in np.array stored at spectra_array_path_name
       does not match the spectra values in the TFRecord stored in the
       record_path_name.
  """
  dataset = get_dataset_from_record(
      [record_path_name],
      hparams,
      mode=tf.estimator.ModeKeys.EVAL,
      all_data_in_one_batch=True)

  feature_names = [fmap_constants.DENSE_MASS_SPEC]
  label_names = [fmap_constants.INDEX_TO_GROUND_TRUTH_ARRAY]

  features, labels = make_features_and_labels(
      dataset, feature_names, label_names, mode=tf.estimator.ModeKeys.EVAL)

  with tf.Session() as sess:
    feature_values, label_values = sess.run([features, labels])

  spectra_array = load_training_spectra_array(spectra_array_path_name)

  for i in range(np.shape(spectra_array)[0]):
    test_idx = label_values[fmap_constants.INDEX_TO_GROUND_TRUTH_ARRAY][i]
    spectra_from_dataset = feature_values[fmap_constants.DENSE_MASS_SPEC][
        test_idx, :]
    spectra_from_array = spectra_array[test_idx, :]

    if not all(spectra_from_dataset.flatten() == spectra_from_array.flatten()):
      raise ValueError('np.array of spectra stored at {} does not match spectra'
                       ' values in tf.Record {}'.format(spectra_array_path_name,
                                                        record_path_name))
  return

Example #8

Source File: bert_lstm_ner.py From pynlp with MIT License

5 votes

def filed_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file, mode=None
):
    """
    将数据转化为TF_Record 结构，作为模型数据输入
    :param examples:  样本
    :param label_list:标签list
    :param max_seq_length: 预先设定的最大序列长度
    :param tokenizer: tokenizer 对象
    :param output_file: tf.record 输出路径
    :param mode:
    :return:
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    # 遍历训练数据
    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        # 对于每一个训练样本,
        feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, mode)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["label_ids"] = create_int_feature(feature.label_ids)
        # features["label_mask"] = create_int_feature(feature.label_mask)
        # tf.train.Example/Feature 是一种协议，方便序列化？？？
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())

Example #9

Source File: bert_lstm_ner.py From pynlp with MIT License

5 votes

def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
        # "label_ids":tf.VarLenFeature(tf.int64),
        # "label_mask": tf.FixedLenFeature([seq_length], tf.int64),
    }

    def _decode_record(record, name_to_features):
        example = tf.parse_single_example(record, name_to_features)
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t
        return example

    def input_fn(params):
        batch_size = params["batch_size"]
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)
        d = d.apply(tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder
        ))
        return d

    return input_fn

Example #10

Source File: urmp_input.py From vimss with GNU General Public License v3.0

4 votes

def dataset_parser(self, value):
        """Parse an audio example record from a serialized string Tensor."""
        keys_to_features = {
            'audio/file_basename':
                tf.FixedLenFeature([], tf.int64, -1),
            'audio/encoded':
                tf.VarLenFeature(tf.float32),
            'audio/sample_rate':
                tf.FixedLenFeature([], tf.int64, SAMPLE_RATE),
            'audio/sample_idx':
                tf.FixedLenFeature([], tf.int64, -1),
            'audio/num_samples':
                tf.FixedLenFeature([], tf.int64, NUM_SAMPLES),
            'audio/channels':
                tf.FixedLenFeature([], tf.int64, CHANNELS),
            'audio/labels':
                tf.VarLenFeature(tf.int64),
            'audio/num_sources':
                tf.FixedLenFeature([], tf.int64, NUM_SOURCES),
            'audio/source_names':
                tf.FixedLenFeature([], tf.string, ''),
        }

        parsed = tf.parse_single_example(value, keys_to_features)
        audio_data = tf.sparse_tensor_to_dense(parsed['audio/encoded'], default_value=0)
        audio_shape = tf.stack([MIX_WITH_PADDING + NUM_SOURCES*NUM_SAMPLES])
        audio_data = tf.reshape(audio_data, audio_shape)
        mix, sources = tf.reshape(audio_data[:MIX_WITH_PADDING], tf.stack([MIX_WITH_PADDING, CHANNELS])),tf.reshape(audio_data[MIX_WITH_PADDING:], tf.stack([NUM_SOURCES, NUM_SAMPLES, CHANNELS]))
        labels = tf.sparse_tensor_to_dense(parsed['audio/labels'])
        labels = tf.reshape(labels, tf.stack([NUM_SOURCES]))

        if self.use_bfloat16:
            mix = tf.cast(mix, tf.bfloat16)
            labels = tf.cast(labels, tf.bfloat16)
            sources = tf.cast(sources, tf.bfloat16)
        if self.mode == 'train':
            features = {'mix': mix,
                        'labels': labels}
        elif self.mode == 'eval':
            features = {'mix': mix,
                        'labels': labels}
        else:
            features = {'mix': mix, 'filename': parsed['audio/file_basename'],
                        'sample_id': parsed['audio/sample_idx'], 'labels': labels}
        return features, sources

Example #11

Source File: parse_sdf_utils.py From deep-molecular-massspec with Apache License 2.0

4 votes

def write_dicts_to_example(mol_list,
                           record_path_name,
                           max_atoms,
                           max_mass_spec_peak_loc,
                           true_library_array_path_name=None):
  """Helper function for writing tf.record from all examples.

  Uses dict_to_tfexample to write the actual tf.example

  Args:
    mol_list : list of rdkit.Mol objects
    record_path_name : file name for storing tf record
    max_atoms : max. number of atoms to consider in a molecule.
    max_mass_spec_peak_loc : largest mass/charge ratio to allow in a spectra
    true_library_array_path_name: path for storing np.array of true spectra

  Returns:
    - Writes tf.Record of an example for each eligible molecule
    (i.e. # atoms < max_atoms)
    - Writes np.array (len(mol_list), max_mass_spec_peak_loc) to
      true_library_array_path_name if it is defined.
  """
  options = tf.python_io.TFRecordOptions(
      tf.python_io.TFRecordCompressionType.ZLIB)

  # Wrapper function to add index value to dictionary
  if true_library_array_path_name:
    spectra_matrix = np.zeros((len(mol_list), max_mass_spec_peak_loc))

    def make_mol_dict_with_saved_array(idx, mol):
      mol_dict = make_mol_dict(mol, max_atoms, max_mass_spec_peak_loc)
      mol_dict[fmap_constants.INDEX_TO_GROUND_TRUTH_ARRAY] = idx
      spectra_matrix[idx, :] = mol_dict[fmap_constants.DENSE_MASS_SPEC]
      return mol_dict

    make_mol_dict_fn = make_mol_dict_with_saved_array

  else:

    def make_mol_dict_without_saved_array(idx, mol):
      del idx
      return make_mol_dict(mol, max_atoms, max_mass_spec_peak_loc)

    make_mol_dict_fn = make_mol_dict_without_saved_array

  with tf.python_io.TFRecordWriter(record_path_name, options) as writer:
    for idx, mol in enumerate(mol_list):
      mol_dict = make_mol_dict_fn(idx, mol)
      example = dict_to_tfexample(mol_dict)
      writer.write(example.SerializeToString())

  if true_library_array_path_name:
    with tf.gfile.Open(true_library_array_path_name, 'w') as f:
      np.save(f, spectra_matrix)