""" Copyright (c) 2019 Microsoft Corporation. All rights reserved. MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import sys import torch.utils.data as data import glob import numpy as np from multiprocessing import Queue import zipfile import io import json from simulation import SimpleSimulator, imagesc from simulation.freq_analysis import stft import reader import os def _utt2seg(data, seg_len, seg_shift): """ Cut an utterance (MxN matrix) to segments. """ if data.ndim == 1: data = np.reshape(data, (1, data.size)) dim, n_fr = data.shape n_seg = int(np.floor((n_fr - seg_len) / seg_shift)) + 1 seg = [] for i in range(n_seg): start = i * seg_shift stop = start + seg_len seg.append(data[:, start:stop]) return seg class DataBuffer: """This is the class that generate data for neural network training and used with dynamic data simulation. The job is to prepare speech corpus into individual training samples. E.g. feature-label pairs. It may also need to call speech simulation to simulated distorted multi-channel array data. This class generates samples and put them in a buffer (a FIFO queue). """ def __init__(self, data_generator, buffer_size=1000, preload_size=100, randomize=True): self.data_generator = data_generator self.buffer_size = buffer_size self.preload_size = preload_size self.buffer = [] self.randomize = randomize def get(self): """Generate required number of training samples. """ if len(self.buffer) < self.buffer_size: # maintain minimum number of entries in the buffer. while len(self.buffer) < self.buffer_size + self.preload_size: tmp_data = self.data_generator.generate() self.buffer += tmp_data if self.randomize: return_idx = np.random.randint(len(self.buffer)) else: return_idx = 0 data = self.buffer.pop(return_idx) return data def get_len(self): return self.data_generator.get_len() class SpeechDataset(data.Dataset): def __init__(self, config): self.transform=None self.sequence_mode = config["data_config"]["sequence_mode"] # load the three types of source data dir_noise_streams = None if "dir_noise_paths" in config: dir_noise_streams = self._load_streams(config["dir_noise_paths"], config['data_path'], is_speech=False) rir_streams = None if "rir_paths" in config: rir_streams = self._load_streams(config["rir_paths"], config['data_path'], is_speech=False, is_rir=True) source_streams = self._load_streams(config["source_paths"], config['data_path'], is_speech=True) self.source_stream_sizes = [i.get_number_of_data() for i in source_streams] if self.sequence_mode: self.source_stream_cum_sizes = [self.source_stream_sizes[0]] for i in range(1, len(self.source_stream_sizes)): self.source_stream_cum_sizes.append(self.source_stream_cum_sizes[-1] + self.source_stream_sizes[i]) if 'simulation_prob' in config['data_config']: simulation_prob = config['data_config']['simulation_prob'] else: simulation_prob = 0 generator_config = DataGeneratorSequenceConfig( use_reverb=config["data_config"]["use_reverb"], use_noise=config["data_config"]["use_dir_noise"], snr_range=[config["data_config"]["snr_min"], config["data_config"]["snr_max"]], n_hour_per_epoch=config["sweep_size"], sequence_mode=self.sequence_mode, load_label=config["data_config"]["load_label"], seglen=config["data_config"]["seg_len"], segshift=config["data_config"]["seg_shift"], use_cmn=config["data_config"]["use_cmn"], simulation_prob=simulation_prob ) data_generator = DataGeneratorTrain(source_streams, dir_noise_streams, rir_streams, generator_config, DEBUG=False) if self.sequence_mode: self.data_buffer = data_generator else: self.data_buffer = DataBuffer(data_generator, buffer_size=20000, preload_size=200, randomize=True) self.sample_len_seconds = config["data_config"]["seg_len"] * 0.01 # default sampling rate: 100Hz self.stream_idx_for_transform = [0] def _load_streams(self, source_list, data_path, is_speech=True, is_rir=False): source_streams = list() for i in range(len(source_list)): corpus_type = source_list[i]['type'] corpus_wav_path = data_path+source_list[i]['wav'] label_paths = [] label_names = [] if 'label' in source_list[i]: label_paths.append(data_path+source_list[i]['label']) label_names.append('label') else: corpus_label_path = None if 'aux_label' in source_list[i]: label_paths.append(data_path+source_list[i]['aux_label']) label_names.append('aux_label') else: corpus_label_path = None print("%s::_load_streams: loading %s from %s..." % (self.__class__.__name__, corpus_type, corpus_wav_path)) curr_stream = reader.stream.gen_stream_from_zip(corpus_wav_path, label_files=label_paths, label_names=label_names, is_speech_corpus=is_speech, is_rir=is_rir, get_duration=False, corpus_name=corpus_type, file_extension='wav') source_streams.append(curr_stream) return source_streams def __getitem__(self, index): if self.sequence_mode: # find the stream index and utterance index corresponding to the given index stream_idx = -1 for i in range(len(self.source_stream_cum_sizes)): if index < self.source_stream_cum_sizes[i]: stream_idx = i break if stream_idx == -1: raise Exception('index larger than available number of sentences. ') if stream_idx==0: utt_idx = index else: utt_idx = index - self.source_stream_cum_sizes[stream_idx-1] data = self.data_buffer.generate((stream_idx, utt_idx))[0] # data = self.data_buffer.get((stream_idx, utt_idx)) else: data = self.data_buffer.get() if self.transform is not None: data = self.transform.apply(data, stream_keys=self.stream_idx_for_transform) return data def sample_in_seconds(self): return self.sample_len_seconds def __len__(self): if self.sequence_mode: return np.sum(self.source_stream_sizes) else: return self.data_buffer.get_len() def __repr__(self): fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) tmp = 'train' if self.train is True else 'test' fmt_str += ' Split: {}\n'.format(tmp) fmt_str += ' Root Location: {}\n'.format(self.root) tmp = ' Transforms (if any): ' fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) tmp = ' Target Transforms (if any): ' fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) return fmt_str class DataGeneratorSequenceConfig: """ Define the configurations of data generation. """ def __init__(self, use_reverb, use_noise, snr_range, n_hour_per_epoch=10, sequence_mode=False, load_label=True, min_seglen=0, seglen=500, segshift=500, use_cmn=False, gain_norm=False, simulation_prob=0.5): self.n_hour_per_epoch = n_hour_per_epoch self.load_label = load_label self.segment_config = {} self.use_reverb = use_reverb self.use_noise = use_noise self.snr_range = snr_range self.segment_config['sequence_mode'] = sequence_mode self.segment_config['seglen'] = seglen # length of segments in terms of frames self.segment_config['segshift'] = segshift self.segment_config['min_seglen'] = min_seglen self.n_segment_per_epoch = int(3600 * n_hour_per_epoch / self.segment_config['seglen'] * 100) self.gain_norm = gain_norm self.use_cmn = use_cmn self.simulation_prob = simulation_prob class DataGeneratorTrain: """ Generate simulated speech utterances from clean speech streams, noise streams, and rir streams. Responsible for sampling of the data from the streams, and call SimpleSimulator to do the simulation. Also responsible for extract features and make training samples. """ _window_file = 'mel80_window.txt' # the file that stores the Mel scale window coefficients def __init__(self, source_streams, noise_streams, rir_streams, config, DEBUG=False): """ :param source_streams: a list of SpeechDataStream objects, containing the clean speech source files names and meta-data such as label, utterance ID, and speaker ID. :param noise_streams: a list of DataStream objects, containing noise file names. :param rir_streams: a list of RIRDataStream objects, containing RIR file names and meta data information. :param config: an object of type DataGeneratorSequenceConfig :param DEBUG: if set to DEBUG mode, will plot the filterbanks and label. """ self._source_streams = source_streams self._source_streams_prior = self._get_streams_prior(source_streams) self._rir_streams = rir_streams self._rir_streams_prior = self._get_streams_prior(rir_streams) self._noise_streams = noise_streams self._noise_streams_prior = self._get_streams_prior(noise_streams) self._data_len = config.n_segment_per_epoch self._single_source_simulator = SimpleSimulator(use_rir=config.use_reverb, use_noise=config.use_noise, snr_range=config.snr_range) self._config = config self._DEBUG = DEBUG self._gen_window() def _get_streams_prior(self, streams): if streams is None: return None else: n_entrys = np.asarray([stream.get_number_of_data() for stream in streams]) return n_entrys / np.sum(n_entrys) def _gen_window(self): # load the pre-computed window coefficients for 80D log filterbanks used in typical acoustic modeling. # the window is computed by the following code # import librosa # self._window = librosa.filters.mel(16000, 512, n_mels=80, fmax=7690, htk=True) mel_file = os.path.join(os.path.dirname(__file__), self._window_file) with open(mel_file) as file: lines = [line.rstrip('\n') for line in file] self._window = np.vstack([np.asarray([np.float32(j) for j in i.split(",")]) for i in lines]) def _logfbank_extractor(self, wav): # typical log fbank extraction for 16kHz speech data preemphasis = 0.96 t1 = np.sum(self._window, 0) t1[t1 == 0] = -1 inv = np.diag(1 / t1) mel = self._window.dot(inv).T wav = wav[1:] - preemphasis * wav[:-1] S = stft(wav, n_fft=512, hop_length=160, win_length=400, window=np.hamming(400), center=False).T spec_mag = np.abs(S) spec_power = spec_mag ** 2 fbank_power = spec_power.T.dot(mel * 32768 ** 2) + 1 log_fbank = np.log(fbank_power) return log_fbank def generate(self, index=None): """ :param index: a tuple of 2 entries (source_stream_idx, utt_idx) that specifies which clean source file to use for simulation. If not provided, will randomly choose one clean source file from the clean source streams. :return: a list of training samples """ seg_len = self._config.segment_config['seglen'] seg_shift = self._config.segment_config['segshift'] if index is None: # if no index is given, let the simulator do random sampling # sample a clean speech stream source_stream_idx = np.random.choice(np.arange(len(self._source_streams)), replace=True, p=self._source_streams_prior) # sample a clean speech utterance _, utt_id, source_wav, _ = self._source_streams[source_stream_idx].sample_spk_and_utt(n_spk=1, n_utt_per_spk=1, load_data=True) else: # if index is given, use the specified sentence assert len(index) == 2 source_stream_idx = index[0] utt_id = [self._source_streams[source_stream_idx].utt_id[index[1]]] _, _, source_wav, _ = self._source_streams[source_stream_idx].read_utt_with_id(utt_id, load_data=True) if np.random.random() > self._config.simulation_prob: simulated_wav = source_wav[0] else: if self._noise_streams is None: noise_wavs = None else: noise_stream_idx = np.random.choice(np.arange(len(self._noise_streams)), replace=True, p=self._noise_streams_prior) noise_wavs, noise_files = self._noise_streams[noise_stream_idx].sample_data() if self._rir_streams is None: source_rir = None noise_rirs = None else: rir_stream_idx = np.random.choice(np.arange(len(self._rir_streams)), replace=True, p=self._rir_streams_prior) n_rir = 1 if noise_wavs is None else 1+len(noise_wavs) rir_wav, room_size, array_position, positions, t60 = self._rir_streams[rir_stream_idx].sample_rir(n_rir) source_rir = rir_wav[0] noise_rirs = rir_wav[1:] simulated_wav, _, mask, config = self._single_source_simulator(source_wav[0], dir_noise_wavs=noise_wavs, source_rir=source_rir, dir_noise_rirs=noise_rirs, gen_mask=False, normalize_gain=self._config.gain_norm) fbank = self._logfbank_extractor(simulated_wav[:,0]) if self._config.load_label: _, label = self._source_streams[source_stream_idx].read_label_with_id(utt_id) frame_label = label['label'][0].T if 'aux_label' in label: aux_label = label['aux_label'] else: aux_label = np.zeros((1,1)) if np.abs(frame_label.shape[0] - fbank.shape[0])>5: print("DataGeneratorTrain::generate: Warning: filterbank and label have significantly different number of frames. ") n_fr = np.minimum(frame_label.shape[0], fbank.shape[0]) frame_label = frame_label[:n_fr,:] fbank = fbank[:n_fr,:] if self._config.use_cmn: fbank = reader.preprocess.cmn(fbank, axis=0) if self._config.segment_config['sequence_mode']: if self._config.load_label: train_samples = [(fbank, utt_id, frame_label, aux_label)] else: train_samples = [(fbank, utt_id)] else: fbank_seg = _utt2seg(fbank.T, seg_len, seg_shift) if len(fbank_seg) == 0: return [] if self._config.load_label: label_seg = _utt2seg(frame_label.T, seg_len, seg_shift) train_samples = [(fbank_seg[i].T, utt_id, label_seg[i].T) for i in range(len(label_seg))] else: train_samples = [(fbank_seg[i].T, utt_id) for i in range(len(fbank_seg))] if self._DEBUG: import matplotlib.pyplot as plt n_sample = len(train_samples) for i in range(n_sample): plt.subplot(n_sample,2,i*2+1) imagesc(train_samples[i][0].T) plt.subplot(n_sample,2,i*2+2) plt.plot(train_samples[i][2]) return train_samples def get_len(self): return self._data_len