import numpy as np import librosa import os from tensorflow.python.framework import tensor_util import tensorflow as tf from tensorflow.python.platform import gfile from tensorflow.core.framework import graph_pb2 import operator import itertools from numpy import inf #### Load Data #### # windowing fft/ifft function def stft(data, fft_size=512, step_size=160,padding=True): # short time fourier transform if padding == True: # for 16K sample rate data, 48192-192 = 48000 pad = np.zeros(192,) data = np.concatenate((data,pad),axis=0) # padding hanning window 512-400 = 112 window = np.concatenate((np.zeros((56,)),np.hanning(fft_size-112),np.zeros((56,))),axis=0) win_num = (len(data) - fft_size) // step_size out = np.ndarray((win_num, fft_size), dtype=data.dtype) for i in range(win_num): left = int(i * step_size) right = int(left + fft_size) out[i] = data[left: right] * window F = np.fft.rfft(out, axis=1) return F def istft(F, fft_size=512, step_size=160,padding=True): # inverse short time fourier transform data = np.fft.irfft(F, axis=-1) # padding hanning window 512-400 = 112 window = np.concatenate((np.zeros((56,)),np.hanning(fft_size-112),np.zeros((56,))),axis=0) number_windows = F.shape[0] T = np.zeros((number_windows * step_size + fft_size)) for i in range(number_windows): head = int(i * step_size) tail = int(head + fft_size) T[head:tail] = T[head:tail] + data[i, :] * window if padding == True: T = T[:48000] return T # combine FFT bins to mel frequency bins def mel2freq(mel_data,sr,fft_size,n_mel,fmax=8000): matrix= librosa.filters.mel(sr, fft_size, n_mel, fmax=fmax) return np.dot(mel_data,matrix) def freq2mel(f_data,sr,fft_size,n_mel,fmax=8000): pre_matrix = librosa.filters.mel(sr, fft_size, n_mel, fmax=fmax) matrix = pre_matrix.T / np.sum(pre_matrix.T,axis=0) return np.dot(f_data,matrix) # directly time to mel domain transformation def time_to_mel(data,sr,fft_size,n_mel,step_size,fmax=8000): F = stft(data,fft_size,step_size) M = freq2mel(F,sr,fft_size,n_mel,fmax=8000) return M def mel_to_time(M,sr,fft_size,n_mel,step_size,fmax=8000): F = mel2freq(M,sr,fft_size,n_mel) T = istft(F,fft_size,step_size) return T def real_imag_expand(c_data,dim='new'): # dim = 'new' or 'same' # expand the complex data to 2X data with true real and image number if dim == 'new': D = np.zeros((c_data.shape[0],c_data.shape[1],2)) D[:,:,0] = np.real(c_data) D[:,:,1] = np.imag(c_data) return D if dim =='same': D = np.zeros((c_data.shape[0],c_data.shape[1]*2)) D[:,::2] = np.real(c_data) D[:,1::2] = np.imag(c_data) return D def real_imag_shrink(F,dim='new'): # dim = 'new' or 'same' # shrink the complex data to combine real and imag number F_shrink = np.zeros((F.shape[0], F.shape[1])) if dim =='new': F_shrink = F[:,:,0] + F[:,:,1]*1j if dim =='same': F_shrink = F[:,::2] + F[:,1::2]*1j return F_shrink def power_law(data,power=0.6): # assume input has negative value mask = np.zeros(data.shape) mask[data>=0] = 1 mask[data<0] = -1 data = np.power(np.abs(data),power) data = data*mask return data def fast_stft(data,power=False,**kwargs): # directly transform the wav to the input # power law = A**0.3 , to prevent loud audio from overwhelming soft audio if power: data = power_law(data) return real_imag_expand(stft(data)) def fast_istft(F,power=False,**kwargs): # directly transform the frequency domain data to time domain data # apply power law T = istft(real_imag_shrink(F)) if power: T = power_law(T,(1.0/0.6)) return T def generate_cRM(Y,S): ''' :param Y: mixed/noisy stft :param S: clean stft :return: structed cRM ''' M = np.zeros(Y.shape) epsilon = 1e-8 # real part M_real = np.multiply(Y[:,:,0],S[:,:,0])+np.multiply(Y[:,:,1],S[:,:,1]) square_real = np.square(Y[:,:,0])+np.square(Y[:,:,1]) M_real = np.divide(M_real,square_real+epsilon) M[:,:,0] = M_real # imaginary part M_img = np.multiply(Y[:,:,0],S[:,:,1])-np.multiply(Y[:,:,1],S[:,:,0]) square_img = np.square(Y[:,:,0])+np.square(Y[:,:,1]) M_img = np.divide(M_img,square_img+epsilon) M[:,:,1] = M_img return M def cRM_tanh_compress(M,K=10,C=0.1): ''' Recall that the irm takes on vlaues in the range[0,1],compress the cRM with hyperbolic tangent :param M: crm (298,257,2) :param K: parameter to control the compression :param C: parameter to control the compression :return crm: compressed crm ''' numerator = 1-np.exp(-C*M) numerator[numerator == inf] = 1 numerator[numerator == -inf] = -1 denominator = 1+np.exp(-C*M) denominator[denominator == inf] = 1 denominator[denominator == -inf] = -1 crm = K * np.divide(numerator,denominator) return crm def cRM_tanh_recover(O,K=10,C=0.1): ''' :param O: predicted compressed crm :param K: parameter to control the compression :param C: parameter to control the compression :return M : uncompressed crm ''' numerator = K-O denominator = K+O M = -np.multiply((1.0/C),np.log(np.divide(numerator,denominator))) return M def fast_cRM(Fclean,Fmix,K=10,C=0.1): ''' :param Fmix: mixed/noisy stft :param Fclean: clean stft :param K: parameter to control the compression :param C: parameter to control the compression :return crm: compressed crm ''' M = generate_cRM(Fmix,Fclean) crm = cRM_tanh_compress(M,K,C) return crm def fast_icRM(Y,crm,K=10,C=0.1): ''' :param Y: mixed/noised stft :param crm: DNN output of compressed crm :param K: parameter to control the compression :param C: parameter to control the compression :return S: clean stft ''' M = cRM_tanh_recover(crm,K,C) S = np.zeros(np.shape(M)) S[:,:,0] = np.multiply(M[:,:,0],Y[:,:,0])-np.multiply(M[:,:,1],Y[:,:,1]) S[:,:,1] = np.multiply(M[:,:,0],Y[:,:,1])+np.multiply(M[:,:,1],Y[:,:,0]) return S def generate_one_sample(audio_path_list,num_speaker,fix_sr=16000,verbose=0): ''' generate one sample from audios in the list :param audio_path_list: list contains path of the wav audio file :param num_speaker: specify the task for speech separation :param fix_sr: fix sample rate :return X_sample , y_sample ''' # check path is exist for path in audio_path_list: if not os.path.exists(path): if verbose == 1: print('%s is not exist!'% path) return # initiate variables data_list = [] sr_list = [] F_list = [] # STFT list for each sample X_sample = np.empty(shape=(298,257,2)) y_sample = np.empty(shape=(298,257,2,num_speaker)) # import data for i in range(num_speaker): data, sr = librosa.load(audio_path_list[i],sr=fix_sr) data_list.append(data) sr_list.append(sr) # create mix audio according to mix rate mix_rate = 1.0 / float(num_speaker) mix = np.zeros(shape=data_list[0].shape) for data in data_list: np.add(mix,data*mix_rate) # transfrom data via STFT and several preprocessing function for i in range(num_speaker): F = fast_stft(data_list[i]) F_list.append(F) F_mix = fast_stft(mix) X_sample = F_mix # create cRM for each speaker and fill into y_sample for i in range(num_speaker): cRM = np.divide(F_list[i], F_mix, out=np.zeros_like(F_list[i]), where=F_mix!=0) y_sample[:,:,:,i] = cRM # return values if verbose == 1: print('shape of X: ',X_sample.shape) print('shape of y: ',y_sample.shape) return X_sample, y_sample def generate_dataset(sample_range,repo_path,num_speaker=2,**kwargs): ''' A function to generate dataset :param sample_range: range of the sample to create the dataset :param repo_path: audio repository :param num_speaker: number of speaker to separate :return: X_data, y_data ''' audio_path_list = [] X_data = [] y_data = [] num_data = 0 for i in range(sample_range[0],sample_range[1]): path = repo_path + '/trim_audio_train%d.wav'%i if os.path.exists(path): audio_path_list.append(path) combinations = itertools.combinations(audio_path_list,num_speaker) for combo in combinations: num_data += 1 X_sample,y_sample = generate_one_sample(combo,num_speaker) X_data.append(X_sample) y_data.append(y_sample) X_data = np.asarray(X_data) y_data = np.asarray(y_data) print('number of the data generated: ',num_data) return X_data, y_data #### normalization function #### def min_max_norm(x): # x should be numpy M*N matrix , normalize the N axis return (x-np.min(x,axis=0)) / (np.max(x,axis=0)-np.min(x,axis=0)) #### load pre-trained model #### def load_graph(graph_path,tensorboard=False,**kwargs): ''' :param graph_filename: the path of the pb file :return: tensorflow graph ''' with gfile.FastGFile(graph_path,'rb') as f: graph_def = graph_pb2.GraphDef() graph_def.ParseFromString(f.read()) with tf.Graph().as_default() as graph: tf.import_graph_def(graph_def,name="") if tensorboard: writer = tf.summary.FileWriter("log/") writer.add_graph(graph) return graph def inspect_operation(graph_path,output_txt_file): ''' :param graph_path: the path of the pb file :param output_txt_file: the path of the txt outputfile for inspect the model :return: ''' graph = load_graph(graph_path) with tf.Session(graph=graph) as sess: operations = sess.graph.get_operations() ops_dict = {} with open(output_txt_file,'w') as f: for i, op in enumerate(operations): f.write('---------------------------------------------------------------------------------------------\n') f.write("{}: op name = {}, op type = ( {} ), inputs = {}, outputs = {}\n".\ format(i, op.name, op.type, ", ".join([x.name for x in op.inputs]), ", ".join([x.name for x in op.outputs]))) f.write('@input shapes:\n') for x in op.inputs: f.write("name = {} : {}\n".format(x.name, x.get_shape())) f.write('@output shapes:\n') for x in op.outputs: f.write("name = {} : {}\n".format(x.name, x.get_shape())) if op.type in ops_dict: ops_dict[op.type] += 1 else: ops_dict[op.type] = 1 f.write('---------------------------------------------------------------------------------------------\n') sorted_ops_count = sorted(ops_dict.items(), key=operator.itemgetter(1)) print('OPS counts:') for i in sorted_ops_count: print("{} : {}".format(i[0], i[1])) def SNR(true_file,pred_file): T_true,_ = librosa.load(true_file,sr=16000) F_true = fast_stft(T_true) T_pred, _ = librosa.load(pred_file,sr=16000) F_pred = fast_stft(T_pred) F_inter = F_true - F_pred P_true = np.sum(np.square(F_true[:,:,0])+np.square(F_true[:,:,1])) P_inter = np.sum(np.square(F_inter[:,:,0])+np.square(F_inter[:,:,1])) return 10*np.log10(np.divide(P_true,P_inter)) def phase_enhance_pred(mix_STFT,pred_file, mode='STFT'): if mode=='wav': T_pred, _ = librosa.load(pred_file,sr=16000) F_pred = fast_stft(T_pred) if mode =='STFT': F_pred = pred_file M = np.sqrt(np.square(F_pred[:,:,0])+np.square(F_pred[:,:,1])) #magnitude print('shape M:',M.shape) P = np.arctan(np.divide(mix_STFT[:,:,0],mix_STFT[:,:,1])) #phase print('shape p:',P.shape) F_enhance = np.zeros_like(F_pred) print('shape enhance',F_enhance.shape) F_enhance[:,:,0] = np.multiply(M,np.cos(P)) F_enhance[:,:,1] = np.multiply(M,np.sin(P)) print('shape enhance', F_enhance.shape) T_enhance = fast_istft(F_enhance) return T_enhance ## test code part if __name__ == '__main__': # check data generative function CHECK1 = 0 CHECK2 = 1 if CHECK1: sample_range = (0,20) repo_path = os.path.expanduser('../../data/audio/audio_train') X_data,y_data = generate_dataset(sample_range,repo_path,num_speaker=2,verbose=1) print('shape of the X data: ',X_data.shape) print('shape of the y data: ',y_data.shape) if CHECK2: # check SNR calculation original_audio_path = '../../data/audio/audio_train' predict_audio_path1 = '../model_v1/pred_with_enhance' predict_audio_path2 = '../model_v1/pred_without_enhance' print(SNR(original_audio_path + '/trim_audio_train24.wav', original_audio_path + '/trim_audio_train24.wav')) #print(SNR(original_audio_path+'/trim_audio_train340.wav',predict_audio_path1+'/00009-00340-00340.wav')) print(SNR(original_audio_path + '/trim_audio_train340.wav', predict_audio_path2 + '/00009-00340-00340.wav'))