Python librosa.magphase() Examples

The following are 30 code examples of librosa.magphase(), collected from open-source projects. The source file, project, and license are listed above each example.
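As background, librosa.magphase(D) factors a complex spectrogram D into a real-valued magnitude and a unit-modulus complex phase, so that D == mag * phase elementwise. A minimal sketch of that round trip on a synthetic tone:

import numpy as np
import librosa

# a short synthetic tone stands in for real audio
sr = 22050
y = np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr).astype(np.float32)

D = librosa.stft(y)                      # complex STFT
mag, phase = librosa.magphase(D)         # mag = |D|, phase = exp(1j * angle(D))

assert np.allclose(D, mag * phase)       # exact factorization
assert np.allclose(np.abs(phase), 1.0)   # phase is unit-modulus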
Example #1
Source File: audio.py    From Speech_emotion_recognition_BLSTM with MIT License
def split_vocal(self, y):
        S_full, phase = librosa.magphase(librosa.stft(y))

        # To avoid being biased by local continuity, we constrain similar frames to be
        # separated by at least 1.2 seconds.
        S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric='cosine',
                                               width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))

        S_filter = np.minimum(S_full, S_filter)

        margin_v = 10
        power = 2

        mask_v = librosa.util.softmask(S_full - S_filter,
                                       margin_v * S_filter,
                                       power=power)

        S_foreground = mask_v * S_full

        foreground = griffinlim(S_foreground)

        return foreground 
Example #2
Source File: irm_dataset.py    From IRM-based-Speech-Enhancement-using-LSTM with MIT License
def __getitem__(self, idx):
        clean_y, _ = librosa.load(self.clean_f_paths[idx], sr=16000)
        snr = random.choice(self.snr_list)

        noise_data = random.choice(self.all_noise_data)
        noise_name = noise_data["name"]
        noise_y = noise_data["y"]

        name = f"{str(idx).zfill(5)}_{noise_name}_{snr}"
        clean_y, noise_y, noisy_y = synthesis_noisy_y(clean_y, noise_y, snr)

        if self.mode == "train":
            clean_mag, _ = librosa.magphase(librosa.stft(clean_y, n_fft=320, hop_length=160, win_length=320))
            noise_mag, _ = librosa.magphase(librosa.stft(noise_y, n_fft=320, hop_length=160, win_length=320))
            noisy_mag, _ = librosa.magphase(librosa.stft(noisy_y, n_fft=320, hop_length=160, win_length=320))
            # ideal ratio mask; the expression simplifies to clean_mag / (clean_mag + noise_mag)
            mask = np.sqrt(clean_mag ** 2 / (clean_mag + noise_mag) ** 2)
            n_frames = clean_mag.shape[-1]
            return noisy_mag, clean_mag, mask, n_frames
        elif self.mode == "validation":
            return noisy_y, clean_y, name
        else:
            return noisy_y, name 
Example #3
Source File: text2speech.py    From OpenSeq2Seq with Apache License 2.0
def griffin_lim(magnitudes, n_iters=50, n_fft=1024):
  """
  Griffin-Lim algorithm to convert magnitude spectrograms to audio signals
  """

  phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
  complex_spec = magnitudes * phase
  signal = librosa.istft(complex_spec)
  if not np.isfinite(signal).all():
    print("WARNING: audio was not finite, skipping audio saving")
    return np.array([0])

  for _ in range(n_iters):
    _, phase = librosa.magphase(librosa.stft(signal, n_fft=n_fft))
    complex_spec = magnitudes * phase
    signal = librosa.istft(complex_spec)
  return signal 
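A hypothetical call, assuming y is a waveform loaded elsewhere: keep only the magnitude of its STFT and let Griffin-Lim re-estimate the phase.

magnitudes = np.abs(librosa.stft(y, n_fft=1024))  # magnitude only; phase is discarded
audio = griffin_lim(magnitudes, n_iters=50, n_fft=1024)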
Example #4
Source File: tts_infer.py    From NeMo with Apache License 2.0
def griffin_lim(magnitudes, n_iters=50, n_fft=1024):
    """
    Griffin-Lim algorithm to convert magnitude spectrograms to audio signals
    """
    phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
    complex_spec = magnitudes * phase
    signal = librosa.istft(complex_spec)
    if not np.isfinite(signal).all():
        logging.warning("audio was not finite, skipping audio saving")
        return np.array([0])

    for _ in range(n_iters):
        _, phase = librosa.magphase(librosa.stft(signal, n_fft=n_fft))
        complex_spec = magnitudes * phase
        signal = librosa.istft(complex_spec)
    return signal 
Example #5
Source File: helpers.py    From NeMo with Apache License 2.0
def griffin_lim(magnitudes, n_iters=50, n_fft=1024):
    """
    Griffin-Lim algorithm to convert magnitude spectrograms to audio signals
    """
    phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
    complex_spec = magnitudes * phase
    signal = librosa.istft(complex_spec)
    if not np.isfinite(signal).all():
        logging.warning("audio was not finite, skipping audio saving")
        return np.array([0])

    for _ in range(n_iters):
        _, phase = librosa.magphase(librosa.stft(signal, n_fft=n_fft))
        complex_spec = magnitudes * phase
        signal = librosa.istft(complex_spec)
    return signal 
Example #6
Source File: utils.py    From magenta with Apache License 2.0
def griffin_lim(mag, phase_angle, n_fft, hop, num_iters):
  """Iterative algorithm for phase retrieval from a magnitude spectrogram.

  Args:
    mag: Magnitude spectrogram.
    phase_angle: Initial condition for phase.
    n_fft: Size of the FFT.
    hop: Stride of FFT. Defaults to n_fft/2.
    num_iters: Griffin-Lim iterations to perform.

  Returns:
    audio: 1-D array of float32 sound samples.
  """
  fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop, center=True)
  ifft_config = dict(win_length=n_fft, hop_length=hop, center=True)
  complex_specgram = inv_magphase(mag, phase_angle)
  for i in range(num_iters):
    audio = librosa.istft(complex_specgram, **ifft_config)
    if i != num_iters - 1:
      complex_specgram = librosa.stft(audio, **fft_config)
      _, phase = librosa.magphase(complex_specgram)
      phase_angle = np.angle(phase)
      complex_specgram = inv_magphase(mag, phase_angle)
  return audio 
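inv_magphase is defined elsewhere in the same module and is not shown here; a minimal sketch, assuming it simply recombines a magnitude and a phase angle into a complex spectrogram:

def inv_magphase(mag, phase_angle):
  # complex spectrogram from magnitude and phase angle (sketch)
  phase = np.cos(phase_angle) + 1j * np.sin(phase_angle)
  return mag * phase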
Example #7
Source File: vocoder.py    From DeepPavlov with Apache License 2.0
def griffin_lim(self, magnitudes):
        """Griffin-Lim algorithm to convert magnitude spectrograms to audio signals."""
        phase = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
        complex_spec = magnitudes * phase
        signal = librosa.istft(complex_spec)

        for _ in range(self.n_iters):
            _, phase = librosa.magphase(librosa.stft(signal, n_fft=self.n_fft))
            complex_spec = magnitudes * phase
            signal = librosa.istft(complex_spec)
        return signal 
Example #8
Source File: audio.py    From Speech_emotion_recognition_BLSTM with MIT License
def split_vocal_to_wav(self, filename, fp_foreground, fp_background=None):
        print(filename.split('/')[-1])

        y, sr = librosa.load(filename, sr=self._sr)

        S_full, phase = librosa.magphase(librosa.stft(y))

        # To avoid being biased by local continuity, we constrain similar frames to be
        # separated by at least 1.2 seconds.
        S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric='cosine',
                                               width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))

        S_filter = np.minimum(S_full, S_filter)

        margin_i, margin_v = 2, 10
        power = 2

        mask_i = librosa.util.softmask(S_filter,
                                       margin_i * (S_full - S_filter),
                                       power=power)

        mask_v = librosa.util.softmask(S_full - S_filter,
                                       margin_v * S_filter,
                                       power=power)

        S_foreground = mask_v * S_full
        S_background = mask_i * S_full

        foreground = griffinlim(S_foreground)
        fp_foreground += filename.split('/')[-1]
        sf.write(fp_foreground, foreground, sr, 'PCM_16')

        if fp_background is not None:
            background = griffinlim(S_background)
            fp_background += filename.split('/')[-1]
            sf.write(fp_background, background, sr, 'PCM_16') 
Example #9
Source File: data_loader.py    From pytorch-nlp with MIT License
def parse_audio(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)
        if self.noiseInjector:
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, phase = librosa.magphase(D)
        # S = log(S+1)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)

        return spect 
Example #10
Source File: helpers.py    From NeMo with Apache License 2.0
def waveglow_log_to_tb_func(
    swriter,
    tensors,
    step,
    tag="train",
    log_images=False,
    log_images_freq=1,
    n_fft=1024,
    hop_length=256,
    window="hann",
    mel_fb=None,
):
    loss, audio_pred, spec_target, mel_length = tensors
    if loss:
        swriter.add_scalar("loss", loss, step)
    if log_images and step % log_images_freq == 0:
        mel_length = mel_length[0]
        spec_target = spec_target[0].data.cpu().numpy()[:, :mel_length]
        swriter.add_image(
            f"{tag}_mel_target", plot_spectrogram_to_numpy(spec_target), step, dataformats="HWC",
        )
        if mel_fb is not None:
            mag, _ = librosa.core.magphase(
                librosa.core.stft(
                    np.nan_to_num(audio_pred[0].cpu().detach().numpy()),
                    n_fft=n_fft,
                    hop_length=hop_length,
                    window=window,
                )
            )
            mel_pred = np.matmul(mel_fb.cpu().numpy(), mag).squeeze()
            log_mel_pred = np.log(np.clip(mel_pred, a_min=1e-5, a_max=None))
            swriter.add_image(
                f"{tag}_mel_predicted",
                plot_spectrogram_to_numpy(log_mel_pred[:, :mel_length]),
                step,
                dataformats="HWC",
            ) 
Example #11
Source File: datautils.py    From panotti with MIT License
def make_phase_gram(mono_sig, sr, n_bins=128):
    stft = librosa.stft(mono_sig)#, n_fft = (2*n_bins)-1)
    magnitude, phase = librosa.magphase(stft)   # we don't need magnitude

    # resample the phase array to match n_bins
    phase = np.resize(phase, (n_bins, phase.shape[1]))[np.newaxis,:,:,np.newaxis]
    return phase



# turn multichannel audio into multiple melgram layers
Example #12
Source File: spectrogram.py    From cocktail-party with MIT License
def griffin_lim(magnitude, n_fft, hop_length, n_iterations):
	"""Iterative algorithm for phase retrival from a magnitude spectrogram."""
	phase_angle = np.pi * np.random.rand(*magnitude.shape)
	D = invert_magnitude_phase(magnitude, phase_angle)
	signal = librosa.istft(D, hop_length=hop_length)

	for i in range(n_iterations):
		D = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
		_, phase = librosa.magphase(D)
		phase_angle = np.angle(phase)

		D = invert_magnitude_phase(magnitude, phase_angle)
		signal = librosa.istft(D, hop_length=hop_length)

	return signal 
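invert_magnitude_phase is not shown in this snippet; it presumably plays the same role as inv_magphase in Example #6, returning magnitude * np.exp(1j * phase_angle).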
Example #13
Source File: data_loader.py    From training with Apache License 2.0
def parse_audio(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)
        if self.noiseInjector:
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, phase = librosa.magphase(D)
        # S = log(S+1)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)

        return spect 
Example #14
Source File: data_tools.py    From Speech-enhancement with MIT License
def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
    """This function takes an audio and convert into spectrogram,
       it returns the magnitude in dB and the phase"""

    stftaudio = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
    stftaudio_magnitude, stftaudio_phase = librosa.magphase(stftaudio)

    stftaudio_magnitude_db = librosa.amplitude_to_db(
        stftaudio_magnitude, ref=np.max)

    return stftaudio_magnitude_db, stftaudio_phase 
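For reference, a hypothetical inverse of this function could look as follows. It is only a sketch: amplitude_to_db above uses ref=np.max, so an exact round trip would also require the original reference amplitude.

def magnitude_db_and_phase_to_audio(hop_length_fft, magnitude_db, phase, ref=1.0):
    # dB magnitude back to linear amplitude, reattach phase, invert the STFT
    magnitude = librosa.db_to_amplitude(magnitude_db, ref=ref)
    return librosa.istft(magnitude * phase, hop_length=hop_length_fft)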
Example #15
Source File: audio_dataset.py    From OpenNMT-py with MIT License
def extract_features(self, audio_path):
        # torchaudio loading options recently changed. It's probably
        # straightforward to rewrite the audio handling to make use of
        # up-to-date torchaudio, but in the meantime there is a legacy
        # method which uses the old defaults
        sound, sample_rate_ = torchaudio.legacy.load(audio_path)
        if self.truncate and self.truncate > 0:
            if sound.size(0) > self.truncate:
                sound = sound[:self.truncate]

        assert sample_rate_ == self.sample_rate, \
            'Sample rate of %s != -sample_rate (%d vs %d)' \
            % (audio_path, sample_rate_, self.sample_rate)

        sound = sound.numpy()
        if len(sound.shape) > 1:
            if sound.shape[1] == 1:
                sound = sound.squeeze()
            else:
                sound = sound.mean(axis=1)  # average multiple channels

        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, _ = librosa.magphase(d)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize_audio:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)
        return spect 
Example #16
Source File: utils.py    From Speaker-Diarization with Apache License 2.0
def load_data(path, win_length=400, sr=16000, hop_length=160, n_fft=512, spec_len=250, mode='train'):
    wav = load_wav(path, sr=sr, mode=mode)
    linear_spect = lin_spectogram_from_wav(wav, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    if mode == 'train':
        randtime = np.random.randint(0, time-spec_len)
        spec_mag = mag_T[:, randtime:randtime+spec_len]
    else:
        spec_mag = mag_T
    # preprocessing: subtract mean, divide by std (along axis 0)
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    return (spec_mag - mu) / (std + 1e-5) 
Example #17
Source File: preprocess.py    From Speaker-Diarization with Apache License 2.0
def load_data(path, split=False, win_length=400, sr=16000, hop_length=160, n_fft=512, min_slice=720):
    wav = load_wav(path, sr=sr)
    linear_spect = lin_spectogram_from_wav(wav, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    spec_mag = mag_T

    utterances_spec = []

    if(split):
        minSpec = min_slice//(1000//(sr//hop_length)) # The minimum timestep of each slice in spectrum
        randStarts = np.random.randint(0,time, 10)   # generate 10 slices at most.
        for start in randStarts:
            if(time-start<=minSpec):
                continue
            randDuration = np.random.randint(minSpec, time-start)
            spec_mag = mag_T[:, start:start+randDuration]

            # preprocessing: subtract mean, divide by std (along axis 0)
            mu = np.mean(spec_mag, 0, keepdims=True)
            std = np.std(spec_mag, 0, keepdims=True)
            spec_mag = (spec_mag - mu) / (std + 1e-5)
            utterances_spec.append(spec_mag)

    else:
        # preprocessing: subtract mean, divide by std (along axis 0)
        mu = np.mean(spec_mag, 0, keepdims=True)
        std = np.std(spec_mag, 0, keepdims=True)
        spec_mag = (spec_mag - mu) / (std + 1e-5)
        utterances_spec.append(spec_mag)

    return utterances_spec 
Example #18
Source File: speakerDiarization.py    From Speaker-Diarization with Apache License 2.0
def load_data(path, win_length=400, sr=16000, hop_length=160, n_fft=512, embedding_per_second=0.5, overlap_rate=0.5):
    wav, intervals = load_wav(path, sr=sr)
    linear_spect = lin_spectogram_from_wav(wav, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    spec_mag = mag_T

    spec_len = sr/hop_length/embedding_per_second
    spec_hop_len = spec_len*(1-overlap_rate)

    cur_slide = 0.0
    utterances_spec = []

    while(True):  # slide window.
        if(cur_slide + spec_len > time):
            break
        spec_mag = mag_T[:, int(cur_slide+0.5) : int(cur_slide+spec_len+0.5)]
        
        # preprocessing: subtract mean, divide by std (along axis 0)
        mu = np.mean(spec_mag, 0, keepdims=True)
        std = np.std(spec_mag, 0, keepdims=True)
        spec_mag = (spec_mag - mu) / (std + 1e-5)
        utterances_spec.append(spec_mag)

        cur_slide += spec_hop_len

    return utterances_spec, intervals 
Example #19
Source File: pre_processing.py    From audio-source-separation with MIT License
def process(file_path, direc, destination_path, phase_bool, destination_phase_path):
	regex = re.compile(r'\d+')
	index = regex.findall(direc)
	for start in range(30, 200):
		# load a 0.3 s segment starting at start * 0.3 s
		wave_array, fs = librosa.load(file_path, sr=44100, offset=start * 0.3, duration=0.3)
		mag, phase = librosa.magphase(librosa.stft(wave_array, n_fft=1024, hop_length=256, window='hann', center=True))
		if not os.path.exists(destination_path):
			os.makedirs(destination_path)
		# magnitude stored as a tensor, phase as a numpy array
		torch.save(torch.from_numpy(np.expand_dims(mag, axis=0)), os.path.join(destination_path, index[0] + "_" + str(start) + '_m.pt'))
		if phase_bool:
			if not os.path.exists(destination_phase_path):
				os.makedirs(destination_phase_path)
			np.save(os.path.join(destination_phase_path, index[0] + "_" + str(start) + '_p.npy'), phase)
	return

#--------- training data------------------------------------- 
Example #20
Source File: audio_dataset.py    From OpenNMT-kpg-release with MIT License
def extract_features(self, audio_path):
        # torchaudio loading options recently changed. It's probably
        # straightforward to rewrite the audio handling to make use of
        # up-to-date torchaudio, but in the meantime there is a legacy
        # method which uses the old defaults
        sound, sample_rate_ = torchaudio.legacy.load(audio_path)
        if self.truncate and self.truncate > 0:
            if sound.size(0) > self.truncate:
                sound = sound[:self.truncate]

        assert sample_rate_ == self.sample_rate, \
            'Sample rate of %s != -sample_rate (%d vs %d)' \
            % (audio_path, sample_rate_, self.sample_rate)

        sound = sound.numpy()
        if len(sound.shape) > 1:
            if sound.shape[1] == 1:
                sound = sound.squeeze()
            else:
                sound = sound.mean(axis=1)  # average multiple channels

        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, _ = librosa.magphase(d)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize_audio:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)
        return spect 
Example #21
Source File: spectrogram.py    From cocktail-party with MIT License
def signal_to_mel_spectrogram(self, audio_signal, log=True, get_phase=False):
		signal = audio_signal.get_data(channel_index=0)
		D = librosa.core.stft(signal, n_fft=self._N_FFT, hop_length=self._HOP_LENGTH)
		magnitude, phase = librosa.core.magphase(D)

		mel_spectrogram = np.dot(self._MEL_FILTER, magnitude)

		mel_spectrogram = mel_spectrogram ** 2
		if log:
			mel_spectrogram = librosa.power_to_db(mel_spectrogram)

		if get_phase:
			return mel_spectrogram, phase
		else:
			return mel_spectrogram 
Example #22
Source File: audio_dataset.py    From encoder-agnostic-adaptation with MIT License
def extract_features(self, audio_path):
        # torchaudio loading options recently changed. It's probably
        # straightforward to rewrite the audio handling to make use of
        # up-to-date torchaudio, but in the meantime there is a legacy
        # method which uses the old defaults
        sound, sample_rate_ = torchaudio.legacy.load(audio_path)
        if self.truncate and self.truncate > 0:
            if sound.size(0) > self.truncate:
                sound = sound[:self.truncate]

        assert sample_rate_ == self.sample_rate, \
            'Sample rate of %s != -sample_rate (%d vs %d)' \
            % (audio_path, sample_rate_, self.sample_rate)

        sound = sound.numpy()
        if len(sound.shape) > 1:
            if sound.shape[1] == 1:
                sound = sound.squeeze()
            else:
                sound = sound.mean(axis=1)  # average multiple channels

        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, _ = librosa.magphase(d)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize_audio:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)
        return spect 
Example #23
Source File: data_loader.py    From LipReading with MIT License
def parse_audio(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)
        if self.noiseInjector:
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)
        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)
        # STFT
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, phase = librosa.magphase(D)
        # S = log(S+1)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)

        return spect 
Example #24
Source File: audio_dataset.py    From ITDD with MIT License
def extract_features(audio_path, sample_rate, truncate, window_size,
                         window_stride, window, normalize_audio):
        global torchaudio, librosa, np
        import torchaudio
        import librosa
        import numpy as np

        sound, sample_rate_ = torchaudio.load(audio_path)
        if truncate and truncate > 0:
            if sound.size(0) > truncate:
                sound = sound[:truncate]

        assert sample_rate_ == sample_rate, \
            'Sample rate of %s != -sample_rate (%d vs %d)' \
            % (audio_path, sample_rate_, sample_rate)

        sound = sound.numpy()
        if len(sound.shape) > 1:
            if sound.shape[1] == 1:
                sound = sound.squeeze()
            else:
                sound = sound.mean(axis=1)  # average multiple channels

        n_fft = int(sample_rate * window_size)
        win_length = n_fft
        hop_length = int(sample_rate * window_stride)
        # STFT
        d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=window)
        spect, _ = librosa.magphase(d)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)
        if normalize_audio:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)
        return spect 
Example #25
Source File: compute_features.py    From laughter-detection with MIT License
def compute_mfcc_features(y,sr):
    mfcc_feat = librosa.feature.mfcc(y,sr,n_mfcc=12,n_mels=12,hop_length=int(sr/100), n_fft=int(sr/40)).T
    S, phase = librosa.magphase(librosa.stft(y,hop_length=int(sr/100)))
    rms = librosa.feature.rms(S=S).T
    return np.hstack([mfcc_feat,rms]) 
Example #26
Source File: data_loader.py    From end2end-asr-pytorch with MIT License
def parse_audio(self, audio_path):
        if self.augment:
            y = load_randomly_augmented_audio(audio_path, self.sample_rate)
        else:
            y = load_audio(audio_path)

        if self.noiseInjector:
            logging.info("inject noise")
            add_noise = np.random.binomial(1, self.noise_prob)
            if add_noise:
                y = self.noiseInjector.inject_noise(y)

        n_fft = int(self.sample_rate * self.window_size)
        win_length = n_fft
        hop_length = int(self.sample_rate * self.window_stride)

        # Short-time Fourier transform (STFT)
        D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                         win_length=win_length, window=self.window)
        spect, phase = librosa.magphase(D)

        # S = log(S+1)
        spect = np.log1p(spect)
        spect = torch.FloatTensor(spect)

        if self.normalize:
            mean = spect.mean()
            std = spect.std()
            spect.add_(-mean)
            spect.div_(std)

        return spect 
Example #27
Source File: fft.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the STFT magnitude and phase.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT magnitude

            data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        D = stft(y, hop_length=self.hop_length,
                 n_fft=self.n_fft)

        D = fix_length(D, n_frames)

        mag, phase = magphase(D)
        if self.log:
            mag = amplitude_to_db(mag, ref=np.max)

        return {'mag': to_dtype(mag.T[self.idx], self.dtype),
                'phase': to_dtype(np.angle(phase.T)[self.idx], self.dtype)} 
Example #28
Source File: fft.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the STFT magnitude and phase differential.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT magnitude

            data['dphase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT phase differential
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        D = stft(y, hop_length=self.hop_length,
                 n_fft=self.n_fft)

        D = fix_length(D, n_frames)

        mag, phase = magphase(D)
        if self.log:
            mag = amplitude_to_db(mag, ref=np.max)

        phase = phase_diff(np.angle(phase.T)[self.idx], self.conv)

        return {'mag': to_dtype(mag.T[self.idx], self.dtype),
                'dphase': to_dtype(phase, self.dtype)} 
Example #29
Source File: cqt.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the CQT

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape = (n_frames, n_bins)
                The CQT magnitude

            data['phase']: np.ndarray, shape = mag.shape
                The CQT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
                fmin=self.fmin,
                n_bins=(self.n_octaves * self.over_sample * 12),
                bins_per_octave=(self.over_sample * 12))

        C = fix_length(C, n_frames)

        cqtm, phase = magphase(C)
        if self.log:
            cqtm = amplitude_to_db(cqtm, ref=np.max)

        return {'mag': to_dtype(cqtm.T[self.idx], self.dtype),
                'phase': to_dtype(np.angle(phase).T[self.idx], self.dtype)} 
Example #30
Source File: cqt.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the CQT

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape = (n_frames, n_bins)
                The CQT magnitude

            data['dphase']: np.ndarray, shape = mag.shape
                The CQT phase differential
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
                fmin=self.fmin,
                n_bins=(self.n_octaves * self.over_sample * 12),
                bins_per_octave=(self.over_sample * 12))

        C = fix_length(C, n_frames)

        cqtm, phase = magphase(C)
        if self.log:
            cqtm = amplitude_to_db(cqtm, ref=np.max)

        dphase = phase_diff(np.angle(phase).T[self.idx], self.conv)

        return {'mag': to_dtype(cqtm.T[self.idx], self.dtype),
                'dphase': to_dtype(dphase, self.dtype)}