python source code of urmp_to

import math
import os
import random
import multiprocessing
from multiprocessing import Pool
from absl import flags
import tensorflow as tf
# options for reading wav file
# from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
# import soundfile as sf

import librosa
from google.cloud import storage


# Command-line flags.
# The project id defaults to the PROJECT_NAME environment variable. It is read
# with .get() so that a missing variable leaves the flag as None (which main()
# reports with a ValueError) instead of raising KeyError at import time.
flags.DEFINE_string(
    'project', os.environ.get("PROJECT_NAME"), 'Google cloud project id for uploading the dataset.')
flags.DEFINE_string(
    'gcs_output_path', 'gs://vimsstfrecords/urmpv2-labels', 'GCS path for uploading the dataset.')
flags.DEFINE_string(
    'local_scratch_dir', '/dev/tfrecords/urmpv2', 'Scratch directory path for temporary files.')
flags.DEFINE_string(
    'raw_data_dir', '/home/olga/urmpv2', 'Directory path for raw URMP dataset. '
    'Should have train and test subdirectories inside it.')


"""
This script converts wav files of URMP dataset to TF records.
An important difference between ImageNet and MusDB is that we should split individual recordings into small chunks of
size input_size. Perhaps, if we want to use a bigger input_size later, it should be a multiple of input_size.

"""

# Handle to the command-line flag values.
FLAGS = flags.FLAGS

# Names of the split subdirectories. They are joined onto both the raw data
# directory and the scratch directory, and also serve as record-file prefixes.
TRAINING_DIRECTORY = 'train'
TEST_DIRECTORY = 'test'

# Number of record files (shards) written for each split.
TRAINING_SHARDS = 6
TEST_SHARDS = 3

# File-name suffix of every stem: the mix first, then the 13 instruments.
CHANNEL_NAMES = list(map('.stem_{}.wav'.format, (
    'mix', 'bn', 'cl', 'db', 'fl', 'hn', 'ob',
    'sax', 'tba', 'tbn', 'tpt', 'va', 'vc', 'vn')))

# Samples taken from the mix per segment: NUM_SAMPLES plus context on both sides.
MIX_WITH_PADDING = 147443
# Every file is resampled to this fixed rate.
SAMPLE_RATE = 22050
# Samples taken from each source per segment; comes from the model parameters.
NUM_SAMPLES = 16384
# Audio is always handled as mono.
CHANNELS = 1
# 13 instrument sources, presumably not counting the mix -- TODO confirm usage.
NUM_SOURCES = 13
# Presumably the number of audio files cached before shuffling -- TODO confirm usage.
CACHE_SIZE = 16

# Maps a stem abbreviation to its slot in a track's source list; 'mix' is slot 0
# and the 13 instruments follow in the order listed here.
source_map = {
    stem_name: slot
    for slot, stem_name in enumerate((
        'mix', 'bn', 'cl', 'db', 'fl', 'hn', 'ob',
        'sax', 'tba', 'tbn', 'tpt', 'va', 'vc', 'vn'))
}

def make_shuffle_idx(n):
    """Return the indices 0..n-1 as a list in random order."""
    indices = [*range(n)]
    random.shuffle(indices)
    return indices

def _check_or_create_dir(directory):
    """Create `directory` through tf.gfile unless it already exists."""
    if tf.gfile.Exists(directory):
        return
    tf.gfile.MakeDirs(directory)


def _int64_feature(value):
    """Wrap an int, or a list of ints, as an int64 Feature of an Example proto.

    A value that is not a list is wrapped in a one-element list first.
    """
    values = value if isinstance(value, list) else [value]
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)


def _bytes_feature(value):
    """Wrap a single string as a bytes Feature of an Example proto.

    Args:
        value: bytes, or text; text is encoded as UTF-8 before being stored.

    Returns:
        A tf.train.Feature holding a one-element BytesList.
    """
    if not isinstance(value, bytes):
        # BytesList accepts only bytes; the callers in this file pass text
        # assembled with str.join, which is not bytes on Python 3.
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _floatlist_feature(value):
    """Wrap a sequence of floats as a float-list Feature of an Example proto."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)


def _sources_floatlist_feature(value):
    """Wrap a list of per-source sample sequences as one float-list Feature.

    The sequences are concatenated in order into a single flat list.
    """
    flat_samples = []
    for source_samples in value:
        flat_samples.extend(source_samples)
    return tf.train.Feature(float_list=tf.train.FloatList(value=flat_samples))


def _convert_to_example(filename, sample_idx, data_buffer, num_sources, labels,
                        sample_rate=SAMPLE_RATE, channels=CHANNELS, num_samples=NUM_SAMPLES):
    """Build the tf.train.Example for one audio segment.

    Args:
        filename: sequence of paths; only the base name of its first element is used.
            The first three "_"-separated tokens of that name become the file basename;
            the tokens from the fourth up to (not including) the extension become the
            comma-separated source names.
        sample_idx: index of the segment, stored as-is.
        data_buffer: list of per-source sample sequences, flattened into one float list.
        num_sources: number of sources, stored as-is.
        labels: list of ints, stored as-is.
        sample_rate: sample rate stored in the example.
        channels: channel count stored in the example.
        num_samples: sample count stored in the example.

    Returns:
        The populated tf.train.Example.
    """
    mix_basename = os.path.basename(filename[0])
    track_id = "_".join(mix_basename.split("_")[:3])
    # Turning "." into "_" makes the extension the last token, which [3:-1] drops.
    source_names = ",".join(mix_basename.replace(".", "_").split("_")[3:-1])

    feature = {
        'audio/file_basename': _bytes_feature(track_id),
        'audio/sample_rate': _int64_feature(sample_rate),
        'audio/sample_idx': _int64_feature(sample_idx),
        'audio/num_samples': _int64_feature(num_samples),
        'audio/channels': _int64_feature(channels),
        'audio/num_sources': _int64_feature(num_sources),
        'audio/labels': _int64_feature(labels),
        'audio/source_names': _bytes_feature(source_names),
        'audio/encoded': _sources_floatlist_feature(data_buffer),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def _get_segments_from_audio_cache(file_data_cache):
    """Cut one track into consecutive fixed-size segments.

    Args:
        file_data_cache: list of [filename, len(data), data] entries, one per audio
            file of the track; the first entry is the mix.
    Returns:
        segments: list of [filename, sample_idx, segment_data, number_of_non_mix_entries].
            segment_data[0] is a MIX_WITH_PADDING-sample window of the mix; the remaining
            items are the NUM_SAMPLES-sample windows of the other entries, aligned with
            the centre of the mix window.
    """
    # Context kept on each side of the central NUM_SAMPLES samples of the mix window.
    pad = (MIX_WITH_PADDING - NUM_SAMPLES) // 2
    mix_entry = file_data_cache[0]
    other_entries = file_data_cache[1:]
    last_idx = mix_entry[1] - pad - 1
    # Only whole segments that leave `pad` samples of mix on both sides are kept.
    segment_count = (last_idx - pad) // NUM_SAMPLES
    other_count = len(file_data_cache) - 1

    segments = []
    for sample_idx in range(segment_count):
        lo = pad + sample_idx * NUM_SAMPLES
        hi = lo + NUM_SAMPLES
        mix_window = mix_entry[2][lo - pad:hi + pad + 1]
        assert len(mix_window) == MIX_WITH_PADDING
        windows = [mix_window]
        # NOTE(review): an entry shorter than the mix yields a short or empty slice
        # here -- confirm every file of a track is at least as long as the mix.
        windows.extend(entry[2][lo:hi] for entry in other_entries)
        segments.append([mix_entry[0], sample_idx, windows, other_count])
    return segments


def _process_audio_files_batch(chunk_data):
    """Processes a list of tracks and saves their segments as one TFRecord file.

    Args:
        chunk_data: tuple of (chunk_files, output_file)
            chunk_files: list of tracks; each track is a list of paths to wav files
            output_file: string, path of the TFRecord file to write
    Returns:
        output_file, so that callers mapping this function over shards get back
        the paths of the files that were written.
    """
    chunk_files, output_file = chunk_data[0], chunk_data[1]

    chunk_segments = []
    for track in chunk_files:
        # load every wav file of the track into memory, resampled to SAMPLE_RATE mono
        file_data_cache = []
        for source in track:
            data, _ = librosa.core.load(source, sr=SAMPLE_RATE, mono=True)
            file_data_cache.append([track, len(data), data])
        chunk_segments.extend(_get_segments_from_audio_cache(file_data_cache))

    # shuffle all segments of the shard before writing
    shuffle_idx = make_shuffle_idx(len(chunk_segments))
    chunk_segments = [chunk_segments[i] for i in shuffle_idx]

    # The writer is opened only once all audio is loaded and is used as a context
    # manager, so it is closed even when building or writing an example fails.
    with tf.python_io.TFRecordWriter(output_file) as writer:
        for track, sample_idx, segment_data, num_sources in chunk_segments:
            example = _convert_to_example(filename=track, sample_idx=sample_idx,
                                          data_buffer=segment_data, num_sources=num_sources,
                                          labels=get_labels_from_filename(track))
            writer.write(example.SerializeToString())

    tf.logging.info('Finished writing file: %s' % output_file)
    return output_file


def get_labels_from_filename(filename):
    """Build a 0/1 label list from the name of a track's first file.

    Args:
        filename: sequence of paths; only the base name of its first element is used.
    Returns:
        A list with one entry per non-mix key of source_map. An entry is 1 when its
        abbreviation appears among the "_"-separated tokens between the third token
        and the extension of the file name.
    """
    name_tokens = os.path.basename(filename[0]).replace(".", "_").split("_")
    labels = [0] * (len(source_map) - 1)
    for instrument in name_tokens[3:-1]:
        # source_map counts the mix as slot 0, hence the shift by one
        labels[source_map[instrument] - 1] = 1
    return labels


def _process_dataset(filenames,
                     output_directory,
                     prefix,
                     num_shards):
    """Processes and saves list of audio tracks as sharded TFRecords.
    Args:
    filenames: list of tracks; every track is passed on unchanged to the shard worker
    output_directory: path where output files should be created
    prefix: string; prefix for each file
    num_shards: number of chunks to split the filenames into
    Returns:
    files: list of tf-record filepaths created from processing the dataset.
    """
    _check_or_create_dir(output_directory)
    chunksize = int(math.ceil(len(filenames) / float(num_shards)))

    def output_file(shard_idx):
        return os.path.join(output_directory, '%s-%.5d-of-%.5d' % (prefix, shard_idx, num_shards))

    # chunk data consists of chunk_filenames and output_file
    chunk_data = [(filenames[shard * chunksize: (shard + 1) * chunksize],
                   output_file(shard)) for shard in range(num_shards)]

    # cpu_count() - 1 is 0 on a single-core machine, which Pool rejects
    pool = Pool(max(1, multiprocessing.cpu_count() - 1))
    try:
        pool.map(_process_audio_files_batch, chunk_data)
    finally:
        # release the worker processes even when a shard fails
        pool.close()
        pool.join()

    # Report the shard paths themselves rather than the workers' return values,
    # so the result is a list of file paths as documented.
    return [shard_file for _, shard_file in chunk_data]


def get_wav(database_path):
    """ Collect the .wav files of every track folder of the URMP dataset.

    Args:
        database_path: directory containing one sub-folder per track.
    Returns:
        data_list: List[List[path_to_wavefiles]], one 14-element list per track.
        Slot 0 holds the file whose name starts with "AuMix"; every other .wav file
        goes to the slot that source_map gives for the third "_"-separated token of
        its name. Slots without a file hold the path of the silence file.
    """
    silence_path = '../silence.wav'
    track_list = []

    # Sorted so that the track order, and with it the shard contents, is reproducible.
    for folder in sorted(os.listdir(database_path)):
        folder_path = os.path.join(database_path, folder)
        if not os.path.isdir(folder_path):
            # a stray file next to the track folders would make os.listdir raise
            continue

        # 1st index must be mix source + 13 individual sources
        track_sources = [0] * 14

        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".wav"):
                continue
            file_path = os.path.join(folder_path, filename)
            if filename.startswith("AuMix"):
                # Place mix source to the first index
                track_sources[0] = file_path
            else:
                # Place the source at the index of its instrument
                print(filename, filename.split('_'))
                source_name = filename.split('_')[2]
                track_sources[source_map[source_name]] = file_path

        # Fill the slots of instruments not present in the track with silence
        track_list.append([path if path != 0 else silence_path for path in track_sources])

    return track_list

def convert_to_tf_records(raw_data_dir):
    """Convert the URMP dataset into TF-Record dumps.

    Args:
        raw_data_dir: directory holding the train and test subdirectories.
    Returns:
        (training_records, test_records): the results of _process_dataset for the
        two splits.
    """
    train_tracks = get_wav(os.path.join(raw_data_dir, TRAINING_DIRECTORY))
    test_tracks = get_wav(os.path.join(raw_data_dir, TEST_DIRECTORY))

    # Training split
    tf.logging.info('Processing the training data.')
    train_output_dir = os.path.join(FLAGS.local_scratch_dir, TRAINING_DIRECTORY)
    training_records = _process_dataset(filenames=train_tracks,
                                        output_directory=train_output_dir,
                                        prefix=TRAINING_DIRECTORY,
                                        num_shards=TRAINING_SHARDS)

    # Test split (the log message calls it validation data)
    tf.logging.info('Processing the validation data.')
    test_output_dir = os.path.join(FLAGS.local_scratch_dir, TEST_DIRECTORY)
    test_records = _process_dataset(filenames=test_tracks,
                                    output_directory=test_output_dir,
                                    prefix=TEST_DIRECTORY,
                                    num_shards=TEST_SHARDS)

    return training_records, test_records


def upload_to_gcs(training_records, test_records):
    """Upload TF-Record files to GCS, at the path given by the gcs_output_path flag.

    Args:
        training_records: list of local file paths of the training records.
        test_records: list of local file paths of the test records.
    """
    # Drop the 5-character scheme prefix, then split "bucket/key/prefix" at the first "/".
    bucket_name, separator, key_prefix = FLAGS.gcs_output_path[5:].partition('/')
    if separator and not key_prefix.endswith('/'):
        key_prefix += '/'

    client = storage.Client(project=FLAGS.project)
    bucket = client.get_bucket(bucket_name)

    def _upload_files(filenames):
        """Upload each file under key_prefix, keeping only its base name."""
        for position, local_path in enumerate(filenames):
            blob = bucket.blob(key_prefix + os.path.basename(local_path))
            blob.upload_from_filename(local_path)
            # progress is logged for every fifth file only
            if position % 5 == 0:
                tf.logging.info('Finished uploading file: %s' % local_path)

    # Training records
    tf.logging.info('Uploading the training data.')
    _upload_files(training_records)

    # Test records (the log message calls them validation data)
    tf.logging.info('Uploading the validation data.')
    _upload_files(test_records)


def main(argv):  # pylint: disable=unused-argument
    """Validate the flags, then convert the raw dataset into TF-Record files.

    Raises:
        ValueError: if the project, GCS output path or scratch directory flag is
            missing, or the GCS output path does not start with gs://.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.project is None:
        raise ValueError('GCS Project must be provided.')

    gcs_path = FLAGS.gcs_output_path
    if gcs_path is None:
        raise ValueError('GCS output path must be provided.')
    if not gcs_path.startswith('gs://'):
        raise ValueError('GCS output path must start with gs://')

    if FLAGS.local_scratch_dir is None:
        raise ValueError('Scratch directory path must be provided.')

    # Convert the raw data into tf-records
    training_records, test_records = convert_to_tf_records(FLAGS.raw_data_dir)

    # Uploading to GCS is currently disabled:
    # upload_to_gcs(training_records, test_records)


# Run main() through TensorFlow's app runner when executed as a script.
if __name__ == '__main__':
    tf.app.run()