Python librosa.load() Examples

The following are 28 code examples of librosa.load(), collected from open-source projects. The source file, project, and license are listed above each example. You may also want to check out the other available functions and classes of the librosa module.
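For orientation, librosa.load() reads an audio file and returns a (waveform, sample_rate) tuple, resampling to 22050 Hz and downmixing to mono by default. A minimal sketch (the file name is a placeholder):

import librosa

y, sr = librosa.load('example.wav')                   # default: sr=22050, mono=True
y_raw, sr_raw = librosa.load('example.wav', sr=None)  # sr=None keeps the file's native rate
print(y.shape, sr)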
Example #1
Source File: data_augmentation.py    From Sound-Recognition-Tutorial with Apache License 2.0
def demo_plot():
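    # assumes module-level imports: librosa, matplotlib.pyplot as plt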
    audio = './data/esc10/audio/Dog/1-30226-A.ogg'
    y, sr = librosa.load(audio, sr=44100)
    y_ps = librosa.effects.pitch_shift(y, sr=sr, n_steps=6)   # n_steps controls how far the pitch is shifted
    y_ts = librosa.effects.time_stretch(y, rate=1.2)   # rate controls how much the time axis is stretched
    plt.subplot(311)
    plt.plot(y)
    plt.title('Original waveform')
    plt.axis([0, 200000, -0.4, 0.4])
    # plt.axis([88000, 94000, -0.4, 0.4])
    plt.subplot(312)
    plt.plot(y_ts)
    plt.title('Time Stretch transformed waveform')
    plt.axis([0, 200000, -0.4, 0.4])
    plt.subplot(313)
    plt.plot(y_ps)
    plt.title('Pitch Shift transformed waveform')
    plt.axis([0, 200000, -0.4, 0.4])
    # plt.axis([88000, 94000, -0.4, 0.4])
    plt.tight_layout()
    plt.show() 
Example #2
Source File: train_data.py    From subsync with Apache License 2.0
def extract_features(files=None):
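    # FREQ, HOP_LEN and N_MFCC are module constants; transcode_audio() and extract_labels() are defined elsewhere in the project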
    if files is None:
        files = transcode_audio()

    audio = []
    labels = []

    for (wav, srt) in files:
        print("Processing audio:", wav)
        y, sr = librosa.load(wav, sr=FREQ)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=int(HOP_LEN), n_mfcc=int(N_MFCC))
        label = extract_labels(srt, len(mfcc[0]))
        audio.append(mfcc)
        labels.append(label)

    return audio, labels 
Example #3
Source File: test_rythm.py    From audiomate with MIT License
def test_compute_cleanup_after_one_utterance(self):
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)
        frames = librosa.util.frame(y, frame_length=2048, hop_length=1024).T

        # EXPECTED
        S = np.abs(librosa.stft(y, center=False, n_fft=2048, hop_length=1024)) ** 2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        onsets = librosa.onset.onset_strength(S=S, center=False)
        exp_tgram = librosa.feature.tempogram(onset_envelope=onsets, sr=sr, win_length=11, center=True).T

        # ACTUAL
        tgram_step = pipeline.Tempogram(win_length=11)

        # FIRST RUN
        tgrams = tgram_step.process_frames(frames, sr, last=True)

        assert np.allclose(tgrams, exp_tgram)

        # SECOND RUN
        tgrams = tgram_step.process_frames(frames, sr, last=True)

        assert np.allclose(tgrams, exp_tgram) 
Example #4
Source File: singlelayer.py    From EUSIPCO2017 with GNU Affero General Public License v3.0
def compute_spectrograms(filename):
    out_rate = 12000
    N_FFT = 512
    HOP_LEN = 256
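    # N_MEL_BANDS and SEGMENT_DUR are module-level constants defined elsewhere in the project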

    frames, rate = librosa.load(filename, sr=out_rate, mono=True)
    if len(frames) < out_rate*3:
        # shorter than 3 seconds - can't process
        raise Exception("Audio duration is too short")

    logam = librosa.power_to_db
    melgram = librosa.feature.melspectrogram
    x = logam(melgram(y=frames, sr=out_rate, hop_length=HOP_LEN,
                      n_fft=N_FFT, n_mels=N_MEL_BANDS) ** 2,
              ref=1.0)

    # now going through spectrogram with the stride of the segment duration
    for start_idx in range(0, x.shape[1] - SEGMENT_DUR + 1, SEGMENT_DUR):
        yield x[:, start_idx:start_idx + SEGMENT_DUR] 
Example #5
Source File: transforms.py    From pase with MIT License
def load_filter(self, filt_file, filt_fmt):

        filt_file = os.path.join(self.data_root, filt_file)

        if filt_fmt == 'mat':
            filt_coeff = loadmat(filt_file, squeeze_me=True, struct_as_record=False)
            filt_coeff = filt_coeff['filt_coeff']

        elif filt_fmt == 'imp' or filt_fmt == 'txt':
            filt_coeff = np.loadtxt(filt_file)
        elif filt_fmt == 'npy':
            filt_coeff = np.load(filt_file)
        else:
            raise TypeError('Unrecognized filter format: ', filt_fmt)

        filt_coeff = filt_coeff / np.abs(np.max(filt_coeff))

        return filt_coeff 
Example #6
Source File: utils.py    From speech_separation with MIT License
def phase_enhance_pred(mix_STFT,pred_file, mode='STFT'):
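    # fast_stft() / fast_istft() are STFT helpers defined elsewhere in this module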
    if mode == 'wav':
        T_pred, _ = librosa.load(pred_file, sr=16000)
        F_pred = fast_stft(T_pred)
    elif mode == 'STFT':
        F_pred = pred_file
    M = np.sqrt(np.square(F_pred[:,:,0])+np.square(F_pred[:,:,1]))     #magnitude
    print('shape M:',M.shape)
    P = np.arctan(np.divide(mix_STFT[:,:,0],mix_STFT[:,:,1]))          #phase
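    # note: with channel 0 = real and channel 1 = imaginary, the conventional phase would be np.arctan2(mix_STFT[:,:,1], mix_STFT[:,:,0])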
    print('shape p:',P.shape)
    F_enhance = np.zeros_like(F_pred)
    print('shape enhance',F_enhance.shape)
    F_enhance[:,:,0] = np.multiply(M,np.cos(P))
    F_enhance[:,:,1] = np.multiply(M,np.sin(P))
    print('shape enhance', F_enhance.shape)
    T_enhance = fast_istft(F_enhance)
    return T_enhance

## test code part 
Example #7
Source File: transforms.py    From pase with MIT License
def load_IR(self, ir_file, ir_fmt):
        ir_file = os.path.join(self.data_root, ir_file)
        # print('loading ir_file: ', ir_file)
        if hasattr(self, 'cache') and ir_file in self.cache:
            return self.cache[ir_file]
        else:
            if ir_fmt == 'mat':
                IR = loadmat(ir_file, squeeze_me=True, struct_as_record=False)
                IR = IR['risp_imp']
            elif ir_fmt == 'imp' or ir_fmt == 'txt':
                IR = np.loadtxt(ir_file)
            elif ir_fmt == 'npy':
                IR = np.load(ir_file)
            elif ir_fmt == 'wav':
                IR, _ = sf.read(ir_file)
            else:
                raise TypeError('Unrecognized IR format: ', ir_fmt)
            IR = IR[:self.max_reverb_len]
            if np.max(IR)>0:
                IR = IR / np.abs(np.max(IR))
            p_max = np.argmax(np.abs(IR))
            if hasattr(self, 'cache'):
                self.cache[ir_file] = (IR, p_max)
            return IR, p_max 
Example #8
Source File: transforms.py    From pase with MIT License
def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        y = wav.data.numpy()
        max_frames = y.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            plp = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            plp = plp[:, beg_i:end_i]
            pkg[self.name] = plp
        else:
            # print(y.dtype)
            feats = self.__execute_command__(y, self.cmd)
            pkg[self.name] = torch.tensor(feats[:,:max_frames].astype(np.float32))
        
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg 
Example #9
Source File: inputs.py    From EnglishSpeechUpsampler with MIT License
def read_file_pair(filename_pair, mono=True):
    """
    given a pair of file names, read in both waveforms and upsample (through
    librosa's default interpolation) the downsampled waveform
    assumes the file name pair is of the form ("original", "downsampled")
    mono selects whether to read in mono or stereo formatted waveforms

    returns a pair of numpy arrays representing the original and upsampled
    waveform
    """
    channel = 1 if mono else 2
    true_waveform, true_br = librosa.load(filename_pair[0], sr=None,
                                          mono=mono)
    ds_waveform, _ = librosa.load(filename_pair[1], sr=true_br, mono=mono)
    # truth, example
    return true_waveform.reshape((-1, channel)), \
        ds_waveform.reshape((-1, channel)) 
Example #10
Source File: melspec.py    From Deep-Music-Tagger with MIT License
def __extract_melspec(audio_fpath, audio_fname):
    """
    Using librosa to calculate log mel spectrogram values
    and scipy.misc to draw and store them (in grayscale).

    :param audio_fpath:
    :param audio_fname:
    :return:
    """
    # Load sound file
    y, sr = librosa.load(audio_fpath, sr=12000)

    # Make a mel-scaled power (energy-squared) spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=256, n_mels=96)

    # Convert to log scale (dB), using the peak power as reference.
    # (librosa.logamplitude was removed in librosa 0.6; power_to_db is its replacement.)
    log_S = librosa.power_to_db(S, ref=np.max)

    spectr_fname = audio_fname + '.png'
    subdir_path = __get_subdir(spectr_fname)

    # Draw log values matrix in grayscale
    scipy.misc.toimage(log_S).save(subdir_path.format(spectr_fname)) 
Example #11
Source File: audio_reader.py    From tensorflow-wavenet with MIT License
def load_generic_audio(directory, sample_rate):
    '''Generator that yields audio waveforms from the directory.'''
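    # find_files(), randomize_files() and FILE_PATTERN are helpers defined elsewhere in this module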
    files = find_files(directory)
    id_reg_exp = re.compile(FILE_PATTERN)
    print("files length: {}".format(len(files)))
    randomized_files = randomize_files(files)
    for filename in randomized_files:
        ids = id_reg_exp.findall(filename)
        if not ids:
            # The file name does not match the pattern containing ids, so
            # there is no id.
            category_id = None
        else:
            # The file name matches the pattern for containing ids.
            category_id = int(ids[0][0])
        audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
        audio = audio.reshape(-1, 1)
        yield audio, filename, category_id 
Example #12
Source File: test_rythm.py    From audiomate with MIT License
def test_compute(self):
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)
        frames = librosa.util.frame(y, frame_length=2048, hop_length=1024).T

        # EXPECTED
        S = np.abs(librosa.stft(y, center=False, n_fft=2048, hop_length=1024)) ** 2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        onsets = librosa.onset.onset_strength(S=S, center=False)
        exp_tgram = librosa.feature.tempogram(onset_envelope=onsets, sr=sr, win_length=11, center=True).T

        # ACTUAL
        tgram_step = pipeline.Tempogram(win_length=11)
        tgrams = tgram_step.process_frames(frames, sr, last=True)

        assert np.allclose(tgrams, exp_tgram) 
Example #13
Source File: test_rythm.py    From audiomate with MIT License
def test_compute_online(self):
        # Data: 41523 samples, 16 kHz
        # yields 40 frames with frame-size 2048 and hop-size 1024
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)

        # EXPECTED
        y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
        S = np.abs(librosa.stft(y_pad, center=False, n_fft=2048, hop_length=1024)) ** 2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        onsets = librosa.onset.onset_strength(S=S, center=False)
        exp_tgram = librosa.feature.tempogram(onset_envelope=onsets, sr=sr, win_length=4, center=True).T

        # ACTUAL
        test_file = tracks.FileTrack('idx', test_file_path)
        tgram_step = pipeline.Tempogram(win_length=4)
        tgram_gen = tgram_step.process_track_online(test_file, 2048, 1024, chunk_size=5)

        chunks = list(tgram_gen)
        tgrams = np.vstack(chunks)

        assert np.allclose(tgrams, exp_tgram) 
Example #14
Source File: test_onset.py    From audiomate with MIT License
def test_compute(self):
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)
        frames = librosa.util.frame(y, frame_length=2048, hop_length=1024).T

        # EXPECTED
        S = np.abs(librosa.stft(y, center=False, n_fft=2048, hop_length=1024)) ** 2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        exp_onsets = librosa.onset.onset_strength(S=S, center=False).T
        exp_onsets = exp_onsets.reshape(exp_onsets.shape[0], 1)

        # ACTUAL
        onset = pipeline.OnsetStrength()
        onsets = onset.process_frames(frames, sr, last=True)

        assert np.allclose(onsets, exp_onsets) 
Example #15
Source File: test_onset.py    From audiomate with MIT License
def test_compute_online(self):
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)

        # EXPECTED
        y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
        S = np.abs(librosa.stft(y_pad, center=False, n_fft=2048, hop_length=1024)) ** 2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        exp_onsets = librosa.onset.onset_strength(S=S, center=False).T
        exp_onsets = exp_onsets.reshape(exp_onsets.shape[0], 1)

        # ACTUAL
        test_file = tracks.FileTrack('idx', test_file_path)
        onset = pipeline.OnsetStrength()
        onset_gen = onset.process_track_online(test_file, 2048, 1024, chunk_size=5)

        chunks = list(onset_gen)
        onsets = np.vstack(chunks)

        print(onsets.shape, exp_onsets.shape)

        assert np.allclose(onsets, exp_onsets) 
Example #16
Source File: audio.py    From argus-freesound with MIT License
def read_audio(file_path):
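    # config (min_seconds, sampling_rate) comes from the project's configuration module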
    min_samples = int(config.min_seconds * config.sampling_rate)
    try:
        y, sr = librosa.load(file_path, sr=config.sampling_rate)
        trim_y, trim_idx = librosa.effects.trim(y)  # trim, top_db=default(60)

        if len(trim_y) < min_samples:
            center = (trim_idx[1] - trim_idx[0]) // 2
            left_idx = max(0, center - min_samples // 2)
            right_idx = min(len(y), center + min_samples // 2)
            trim_y = y[left_idx:right_idx]

            if len(trim_y) < min_samples:
                padding = min_samples - len(trim_y)
                offset = padding // 2
                trim_y = np.pad(trim_y, (offset, padding - offset), 'constant')
        return trim_y
    except BaseException as e:
        print(f"Exception while reading file {e}")
        return np.zeros(min_samples, dtype=np.float32) 
Example #17
Source File: audio.py    From signaltrain with GNU General Public License v3.0
def triangle(t, randfunc=np.random.rand, t0_fac=None): # ramp up then down
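    # pinknoise() is a noise generator defined elsewhere in this module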
    height = (0.4 * randfunc() + 0.4) * np.random.choice([-1,1])
    width = randfunc()/4 * t[-1]     # half-width actually
    t0 = 2*width + 0.4 * randfunc()*t[-1] if t0_fac is None else t0_fac*t[-1]
    x = height * (1 - np.abs(t-t0)/width)
    x[np.where(t < (t0-width))] = 0
    x[np.where(t > (t0+width))] = 0
    amp_n = (0.1*randfunc()+0.02)   # add noise
    return x + amp_n*pinknoise(t.shape[0])


# Prelude to read_audio_file
# Tried lots of ways of doing this.. most are slow.
#signal, rate = librosa.load(filename, sr=sr, mono=True, res_type='kaiser_fast') # Librosa's reader is incredibly slow. do not use
#signal, rate = torchaudio.load(filename)#, normalization=True)   # Torchaudio's reader is pretty fast but normalization is a problem
#signal = signal.numpy().flatten()
#reader = io_methods.AudioIO   # Stylios' file reader. Haven't gotten it working yet
#signal, rate = reader.audioRead(filename, mono=True)
#signal, rate = sf.read('existing_file.wav') 
Example #18
Source File: transforms.py    From pase with MIT License
def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        if torch.is_tensor(wav):
            wav = wav.data.numpy().astype(np.float32)
        max_frames = wav.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            X = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            X = X[:, beg_i:end_i]
            pkg[self.name] = X
        else:
            wav = self.frame_signal(wav, self.window)
            #print('wav shape: ', wav.shape)
            lpc = pysptk.sptk.lpc(wav, order=self.order)
            #print('lpc: ', lpc.shape)
            pkg[self.name] = torch.FloatTensor(lpc)
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg 
Example #19
Source File: transforms.py    From pase with MIT License
def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        y = wav.data.numpy()
        max_frames = y.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            mfcc = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            mfcc = mfcc[:, beg_i:end_i]
            pkg[self.name] = mfcc
        else:
            # print(y.dtype)
            mfcc = librosa.feature.mfcc(y=y, sr=self.sr,
                                        n_mfcc=self.order,
                                        n_fft=self.n_fft,
                                        hop_length=self.hop,
                                        #win_length=self.win,
                                        )[:, :max_frames]
            if self.der_order > 0 :
                deltas=[mfcc]
                for n in range(1,self.der_order+1):
                    deltas.append(librosa.feature.delta(mfcc,order=n))
                mfcc=np.concatenate(deltas)
    
            pkg[self.name] = torch.tensor(mfcc.astype(np.float32))
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg 
Example #20
Source File: utils.py    From pase with MIT License
def compute_utterances_durs(files, data_root):
    durs = []
    for file_ in files:
        wav, rate = librosa.load(os.path.join(data_root, file_), sr=None)
        durs.append(wav.shape[0])
    return durs, rate 
Example #21
Source File: prepare_data.py    From Transformer-TTS with MIT License
def load_wav(self, filename):
        return librosa.load(filename, sr=hp.sample_rate) 
Example #22
Source File: preprocess.py    From Transformer-TTS with MIT License
def load_wav(self, filename):
        return librosa.load(filename, sr=hp.sample_rate) 
Example #23
Source File: preprocess.py    From Transformer-TTS with MIT License
def __getitem__(self, idx):
        # pandas removed the .ix indexer in 1.0; .iloc is the positional equivalent
        wav_name = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0]) + '.wav'
        text = self.landmarks_frame.iloc[idx, 1]

        text = np.asarray(text_to_sequence(text, [hp.cleaners]), dtype=np.int32)
        mel = np.load(wav_name[:-4] + '.pt.npy')
        mel_input = np.concatenate([np.zeros([1,hp.num_mels], np.float32), mel[:-1,:]], axis=0)
        text_length = len(text)
        pos_text = np.arange(1, text_length + 1)
        pos_mel = np.arange(1, mel.shape[0] + 1)

        sample = {'text': text, 'mel': mel, 'text_length':text_length, 'mel_input':mel_input, 'pos_mel':pos_mel, 'pos_text':pos_text}

        return sample 
Example #24
Source File: transforms.py    From pase with MIT License
def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        y = wav.data.numpy()
        max_frames = y.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            mfcc = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            mfcc = mfcc[:, beg_i:end_i]
            pkg[self.name] = mfcc
        else:
            # print(y.dtype)
            mfcc = librosa.feature.mfcc(y=y, sr=self.sr,
                                        n_mfcc=self.order,
                                        n_fft=self.n_fft,
                                        hop_length=self.hop,
                                        #win_length=self.win,
                                        n_mels=self.n_mels,
                                        htk=self.htk,
                                        )[:, :max_frames]
            if self.der_order > 0 :
                deltas=[mfcc]
                for n in range(1,self.der_order+1):
                    deltas.append(librosa.feature.delta(mfcc,order=n))
                mfcc=np.concatenate(deltas)

            pkg[self.name] = torch.tensor(mfcc.astype(np.float32))
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg 
Example #25
Source File: transforms.py    From pase with MIT License
def __init__(self, stats):
        self.stats_name = stats
        with open(stats, 'rb') as stats_f:
            self.stats = pickle.load(stats_f)

    # @profile 
Example #26
Source File: transforms.py    From pase with MIT License
def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        if torch.is_tensor(wav):
            wav = wav.data.numpy().astype(np.float32)
        max_frames = wav.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            X = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            X = X[:, beg_i:end_i]
            pkg[self.name] = X
        else:
            windowtime = float(self.win) / self.rate
            windowhop = float(self.hop) / self.rate
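            # gtgram computes a gammatone-filterbank spectrogram (third-party 'gammatone' package)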
            gtn = gammatone.gtgram.gtgram(wav, self.rate, 
                                          windowtime, windowhop,
                                          self.n_channels,
                                          self.f_min)
            gtn = np.log(gtn + 1e-10)
 
            if self.der_order > 0 :
                deltas=[gtn]
                for n in range(1,self.der_order+1):
                    deltas.append(librosa.feature.delta(gtn,order=n))
                gtn=np.concatenate(deltas)

            expected_frames = len(wav) // self.hop
            gtn = torch.FloatTensor(gtn)
            if gtn.shape[1] < expected_frames:
                P = expected_frames - gtn.shape[1]
                # pad repeating borders
                gtn = F.pad(gtn.unsqueeze(0), (0, P), mode='replicate')
                gtn = gtn.squeeze(0)
            #pkg['gtn'] = torch.FloatTensor(gtn[:, :total_frames])

            pkg[self.name] = torch.FloatTensor(gtn)
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg 
Example #27
Source File: transforms.py    From pase with MIT License
def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        if torch.is_tensor(wav):
            wav = wav.data.numpy().astype(np.float32)
        max_frames = wav.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            X = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            X = X[:, beg_i:end_i]
            pkg[self.name] = X
        else:
            winlen = (float(self.win) / self.rate)
            winstep = (float(self.hop) / self.rate)
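            # logfbank: log mel filterbank features (from the python_speech_features package)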
            X = logfbank(wav, self.rate, winlen, winstep,
                         self.n_filters, self.n_fft).T
            expected_frames = len(wav) // self.hop

            if self.der_order > 0 :
                deltas=[X]
                for n in range(1,self.der_order+1):
                    deltas.append(librosa.feature.delta(X,order=n))
                X=np.concatenate(deltas)

            fbank = torch.FloatTensor(X)
            if fbank.shape[1] < expected_frames:
                P = expected_frames - fbank.shape[1]
                # pad repeating borders
                fbank = F.pad(fbank.unsqueeze(0), (0, P), mode='replicate')
                fbank = fbank.squeeze(0)
            pkg[self.name] = fbank
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg 
Example #28
Source File: prep_voxceleb.py    From pase with MIT License
def prep_rec(input_rec_path, out_rec_path, sr=16000, out_length_seconds=10):

	try:

		y, s = librosa.load(input_rec_path, sr=sr)

		n_samples = sr*out_length_seconds

		try:
			ridx = np.random.randint(0, len(y)-n_samples)
			y = y[ridx:(ridx+n_samples)]

		except ValueError:

			mul = int(np.ceil(n_samples/len(y)))
			y = np.tile(y, (mul))[:n_samples]

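		# note: librosa.output.write_wav was removed in librosa 0.8; soundfile.write is the modern replacement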
		librosa.output.write_wav(out_rec_path, y, sr=sr)

		return True

	except Exception:

		return False