# -*- coding: utf-8 -*- ''' By kyubyong park. kbpark.linguist@gmail.com. https://www.github.com/kyubyong/dc_tts ''' from __future__ import print_function, division from hyperparams import Hyperparams as hp import numpy as np import tensorflow as tf import librosa import copy import matplotlib matplotlib.use('pdf') import matplotlib.pyplot as plt from scipy import signal import os def get_spectrograms(fpath): '''Returns normalized log(melspectrogram) and log(magnitude) from `sound_file`. Args: sound_file: A string. The full path of a sound file. Returns: mel: A 2d array of shape (T, n_mels) <- Transposed mag: A 2d array of shape (T, 1+n_fft/2) <- Transposed ''' # num = np.random.randn() # if num < .2: # y, sr = librosa.load(fpath, sr=hp.sr) # else: # if num < .4: # tempo = 1.1 # elif num < .6: # tempo = 1.2 # elif num < .8: # tempo = 0.9 # else: # tempo = 0.8 # cmd = "ffmpeg -i {} -y ar {} -hide_banner -loglevel panic -ac 1 -filter:a atempo={} -vn temp.wav".format(fpath, hp.sr, tempo) # os.system(cmd) # y, sr = librosa.load('temp.wav', sr=hp.sr) # Loading sound file y, sr = librosa.load(fpath, sr=hp.sr) # Trimming y, _ = librosa.effects.trim(y) # Preemphasis y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1]) # stft linear = librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length) # magnitude spectrogram mag = np.abs(linear) # (1+n_fft//2, T) # mel spectrogram mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels) # (n_mels, 1+n_fft//2) mel = np.dot(mel_basis, mag) # (n_mels, t) # to decibel mel = 20 * np.log10(np.maximum(1e-5, mel)) mag = 20 * np.log10(np.maximum(1e-5, mag)) # normalize mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1) # Transpose mel = mel.T.astype(np.float32) # (T, n_mels) mag = mag.T.astype(np.float32) # (T, 1+n_fft//2) return mel, mag def spectrogram2wav(mag): '''# Generate wave file from spectrogram''' # transpose mag = mag.T # de-noramlize mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db # to amplitude mag = np.power(10.0, mag * 0.05) # wav reconstruction wav = griffin_lim(mag) # de-preemphasis wav = signal.lfilter([1], [1, -hp.preemphasis], wav) # trim wav, _ = librosa.effects.trim(wav) return wav.astype(np.float32) def griffin_lim(spectrogram): '''Applies Griffin-Lim's raw. ''' X_best = copy.deepcopy(spectrogram) for i in range(hp.n_iter): X_t = invert_spectrogram(X_best) est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length) phase = est / np.maximum(1e-8, np.abs(est)) X_best = spectrogram * phase X_t = invert_spectrogram(X_best) y = np.real(X_t) return y def invert_spectrogram(spectrogram): ''' spectrogram: [f, t] ''' return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann") def plot_alignment(alignment, gs): """Plots the alignment alignments: A list of (numpy) matrix of shape (encoder_steps, decoder_steps) gs : (int) global step """ fig, ax = plt.subplots() im = ax.imshow(alignment) # cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) fig.colorbar(im) plt.title('{} Steps'.format(gs)) plt.savefig('{}/alignment_{}k.png'.format(hp.logdir, gs//1000), format='png') def learning_rate_decay(init_lr, global_step, warmup_steps=4000.): '''Noam scheme from tensor2tensor''' step = tf.cast(global_step + 1, dtype=tf.float32) return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5) def load_spectrograms(fpath): fname = os.path.basename(fpath) mel, mag = get_spectrograms(fpath) t = mel.shape[0] num_paddings = hp.r - (t % hp.r) if t % hp.r != 0 else 0 # for reduction mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant") mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant") return fname, mel.reshape((-1, hp.n_mels*hp.r)), mag