Python librosa.get_duration() Examples

The following are 30 code examples of librosa.get_duration(), drawn from open-source projects. Each example lists its source file, originating project, and license. You may also want to check out the other available functions and classes of the librosa module.
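As a quick orientation before the examples: get_duration() can measure an in-memory audio buffer or a file on disk. A minimal sketch of the two most common calling styles (the file path is a placeholder, and note that recent librosa releases renamed the filename= keyword to path=):

import librosa

# From an audio buffer: the duration is len(y) / sr.
y, sr = librosa.load("example.wav", sr=None)  # placeholder path
print(librosa.get_duration(y=y, sr=sr))

# Directly from a file, without decoding the whole signal first.
# On recent librosa versions this keyword is path= instead of filename=.
print(librosa.get_duration(filename="example.wav"))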
Example #1
Source File: Audio.py    From mugen with MIT License
def __init__(self, file: str, *, sample_rate: int = 44100):
        """        
        Parameters
        ----------
        file
            Audio file to load
        """

        self.file = file
        self.samples, self.sample_rate = librosa.load(file, sr=sample_rate)
        self.duration = librosa.get_duration(y=self.samples, sr=self.sample_rate) 
Example #2
Source File: data_tools.py    From Speech-enhancement with MIT License
def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
    """This function takes the audio files of a directory and merges them
    into a numpy matrix of size (nb_frame, frame_length), using a sliding window of size hop_length_frame"""

    list_sound_array = []

    for file in list_audio_files:
        # open the audio file
        y, sr = librosa.load(os.path.join(audio_dir, file), sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sr)

        if (total_duration >= min_duration):
            list_sound_array.append(audio_to_audio_frame_stack(
                y, frame_length, hop_length_frame))
        else:
            print(
                f"The following file {os.path.join(audio_dir,file)} is below the min duration")

    return np.vstack(list_sound_array) 
Example #3
Source File: test_core.py    From muda with ISC License
def test_save(jam_in, audio_file, strict, fmt):

    jam = muda.load_jam_audio(jam_in, audio_file)

    _, jamfile = tempfile.mkstemp(suffix='.jams')
    _, audfile = tempfile.mkstemp(suffix='.wav')

    muda.save(audfile, jamfile, jam, strict=strict, fmt=fmt)

    jam2 = muda.load_jam_audio(jamfile, audfile, fmt=fmt)
    jam2_raw = jams.load(jamfile, fmt=fmt)

    os.unlink(audfile)
    os.unlink(jamfile)

    assert hasattr(jam2.sandbox, 'muda')
    assert '_audio' in jam2.sandbox.muda
    assert '_audio' not in jam2_raw.sandbox.muda

    duration = librosa.get_duration(**jam2.sandbox.muda['_audio'])

    assert jam2.file_metadata.duration == duration 
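A note on the ** unpacking seen here: muda stores the loaded audio under jam.sandbox.muda['_audio'] as a dict of get_duration()'s keyword arguments, so the dict can be splatted straight into the call. A minimal sketch of the same pattern, independent of muda (the buffer contents are made up for illustration):

import numpy as np
import librosa

# The shape of what muda keeps in jam.sandbox.muda['_audio']:
_audio = {'y': np.zeros(2 * 22050), 'sr': 22050}  # two seconds of silence

duration = librosa.get_duration(**_audio)  # equivalent to get_duration(y=..., sr=...)
assert duration == 2.0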
Example #4
Source File: test_deformers.py    From muda with ISC License
def test_ir_convolution(ir_files, jam_fixture, n_fft, rolloff_value):
    D = muda.deformers.IRConvolution(ir_files=ir_files, n_fft=n_fft, rolloff_value=rolloff_value)

    jam_orig = deepcopy(jam_fixture)
    orig_duration = librosa.get_duration(**jam_orig.sandbox.muda['_audio'])

    for jam_new in D.transform(jam_orig):
        # Verify that the original jam reference hasn't changed
        assert jam_new is not jam_orig

        # Testing with shifted impulse
        __test_shifted_impulse(jam_orig, jam_new, ir_files, orig_duration, n_fft=n_fft, rolloff_value=rolloff_value)

        # Verify that the state and history objects are intact
        __test_deformer_history(D, jam_new.sandbox.muda.history[-1])

    # Serialization test
    D2 = muda.deserialize(muda.serialize(D))
    __test_params(D, D2) 
Example #5
Source File: rhythm.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the tempogram

        Parameters
        ----------
        y : np.ndarray
            Audio buffer

        Returns
        -------
        data : dict
            data['tempogram'] : np.ndarray, shape=(n_frames, win_length)
                The tempogram
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        tgram = tempogram(y=y, sr=self.sr,
                          hop_length=self.hop_length,
                          win_length=self.win_length)

        tgram = to_dtype(fix_length(tgram, n_frames), self.dtype)
        return {'tempogram': tgram.T[self.idx]} 
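Several pumpp examples on this page (#5, #20 through #25, and #30) share one pattern: get_duration() fixes the expected number of analysis frames up front, and fix_length() pads or trims the computed feature so its time axis matches that count exactly. A rough sketch of the idea (expected_frames below is a stand-in for pumpp's n_frames(); its ceiling-based formula is an assumption):

import numpy as np
import librosa

def expected_frames(duration, sr, hop_length):
    # Assumed approximation of pumpp's n_frames(): one frame per hop.
    return int(np.ceil(duration * sr / hop_length))

y, sr = np.zeros(22050), 22050
n = expected_frames(librosa.get_duration(y=y, sr=sr), sr, hop_length=512)
S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
S = librosa.util.fix_length(S, size=n, axis=-1)  # pad or trim the time axis to n frames
assert S.shape[1] == n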
Example #6
Source File: test_deformers.py    From muda with ISC License
def test_colorednoise(n_samples, color, weight_min, weight_max, jam_test_silence):

    D = muda.deformers.ColoredNoise(n_samples=n_samples,
                                    color=color,
                                    weight_min=weight_min,
                                    weight_max=weight_max,
                                    rng=0)
    jam_orig = deepcopy(jam_test_silence)

    orig_duration = librosa.get_duration(**jam_orig.sandbox.muda['_audio'])

    n_out = 0
    for jam_new in D.transform(jam_orig):
        assert jam_new is not jam_test_silence
        __test_effect(jam_orig, jam_test_silence)

        assert not np.allclose(jam_orig.sandbox.muda['_audio']['y'],
                               jam_new.sandbox.muda['_audio']['y'])
        # verify that duration hasn't changed
        assert librosa.get_duration(**jam_new.sandbox.muda['_audio']) == orig_duration

        # Verify that the state and history objects are intact
        __test_deformer_history(D, jam_new.sandbox.muda.history[-1])

        __test_effect(jam_orig, jam_new)

        # Verify the colored noise has desired slope for its log-log
        # scale power spectrum
        color = jam_new.sandbox.muda.history[-1]['state']['color']
        __test_color_slope(jam_orig, jam_new, color)

        n_out += 1
    assert n_out == n_samples
    # Serialization test
    D2 = muda.deserialize(muda.serialize(D))
    __test_params(D, D2) 
Example #7
Source File: labeled_example.py    From speechless with MIT License
def duration_in_s(self) -> float:
        try:
            return librosa.get_duration(filename=str(self.audio_file))
        except Exception as e:
            log("Failed to get duration of {}: {}".format(self.audio_file, e))
            return 0 
Example #8
Source File: speech_cls_task.py    From delta with Apache License 2.0
def get_duration(self, filename, sr):  #pylint: disable=invalid-name
    ''' duration in seconds '''
    if filename.endswith('.npy'):
      nframe = np.load(filename).shape[0]
      return librosa.frames_to_time(
          nframe, hop_length=self._winstep * sr, sr=sr)

    if filename.endswith('.wav'):
      return librosa.get_duration(filename=filename)

    raise ValueError("filename suffix not .npy or .wav: {}".format(
        os.path.splitext(filename)[-1])) 
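For the .npy branch above, frames_to_time() reduces to simple arithmetic: time = frames * hop_length / sr. Since hop_length is winstep * sr here, the sr terms cancel and the duration is just nframe * winstep. A quick check with assumed values:

import librosa

winstep, sr, nframe = 0.01, 16000, 500  # assumed values for illustration
t = librosa.frames_to_time(nframe, sr=sr, hop_length=int(winstep * sr))
assert abs(t - nframe * winstep) < 1e-9  # 500 frames * 10 ms = 5.0 s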
Example #9
Source File: get_hi-mia_data.py    From NeMo with Apache License 2.0
def process_single_line(line: str):
    line = line.strip()
    y, sr = librosa.load(line, sr=None)
    if sr != 16000:
        y, sr = librosa.load(line, sr=16000)
        librosa.output.write_wav(line, y, sr)
    dur = librosa.get_duration(y=y, sr=sr)
    if 'test' in line.split("/"):
        speaker = line.split('/')[-1].split('.')[0].split('_')[0]
    else:
        speaker = line.split('/')[-2]
    speaker = list(speaker)
    speaker = ''.join(speaker)
    meta = {"audio_filepath": line, "duration": float(dur), "label": speaker}
    return meta 
Example #10
Source File: scp_to_manifest.py    From NeMo with Apache License 2.0
def main(scp, id, out, split=False):
    if os.path.exists(out):
        os.remove(out)
    scp_file = open(scp, 'r').readlines()

    lines = []
    speakers = []
    with open(out, 'w') as outfile:
        for line in tqdm(scp_file):
            line = line.strip()
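            # 'l' is this script's import alias for librosa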
            y, sr = l.load(line, sr=None)
            dur = l.get_duration(y=y, sr=sr)
            speaker = line.split('/')[id]
            speaker = list(speaker)
            speaker = ''.join(speaker)
            speakers.append(speaker)
            meta = {"audio_filepath": line, "duration": float(dur), "label": speaker}
            lines.append(meta)
            json.dump(meta, outfile)
            outfile.write("\n")

    path = os.path.dirname(out)
    if split:
        sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
        for train_idx, test_idx in sss.split(speakers, speakers):
            logging.info(len(train_idx))

        out = os.path.join(path, 'train.json')
        write_file(out, lines, train_idx)
        out = os.path.join(path, 'dev.json')
        write_file(out, lines, test_idx) 
Example #11
Source File: get_databaker_data.py    From NeMo with Apache License 2.0
def __convert_waves(wavedir, converted_wavedir, wavename, sr):
    """
    Converts a wav file to target sample rate.
    """
    wavepath = os.path.join(wavedir, wavename)
    converted_wavepath = os.path.join(converted_wavedir, wavename)
    y, sr = librosa.load(wavepath, sr=sr)
    duration = librosa.get_duration(y=y, sr=sr)
    librosa.output.write_wav(converted_wavepath, y, sr)
    return wavename, round(duration, 2) 
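Note that librosa.output.write_wav (used here and in Example #9) was removed in librosa 0.8; on newer releases the equivalent write is usually done with the soundfile package instead (assumed available here):

import numpy as np
import soundfile as sf

y, sr = np.zeros(16000, dtype=np.float32), 16000  # one second of silence
# Replaces librosa.output.write_wav('converted.wav', y, sr) on librosa >= 0.8:
sf.write('converted.wav', y, sr)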
Example #12
Source File: test_deformers.py    From muda with ISC License
def __test_time(jam_orig, jam_new, rate):

    # Test the track length
    ap_(librosa.get_duration(**jam_orig.sandbox.muda['_audio']),
        rate * librosa.get_duration(**jam_new.sandbox.muda['_audio']))

    # Test the metadata
    ap_(jam_orig.file_metadata.duration,
        rate * jam_new.file_metadata.duration)

    # Test each annotation
    for ann_orig, ann_new in zip(jam_orig.annotations, jam_new.annotations):
        # JAMS 0.2.1 support
        if hasattr(ann_orig, 'time'):
            ap_(ann_orig.time, rate * ann_new.time)
            ap_(ann_orig.duration, rate * ann_new.duration)

        assert len(ann_orig.data) == len(ann_new.data)

        for obs1, obs2 in zip(ann_orig, ann_new):

            ap_(obs1.time, rate * obs2.time)
            ap_(obs1.duration, rate * obs2.duration)

            if ann_orig.namespace == 'tempo':
                ap_(rate * obs1.value, obs2.value) 
Example #13
Source File: test_deformers.py    From muda with ISC License
def test_background(noise, n_samples, weight_min, weight_max, jam_fixture):

    D = muda.deformers.BackgroundNoise(files=noise,
                                       n_samples=n_samples,
                                       weight_min=weight_min,
                                       weight_max=weight_max)

    jam_orig = deepcopy(jam_fixture)
    orig_duration = librosa.get_duration(**jam_orig.sandbox.muda['_audio'])

    n_out = 0
    for jam_new in D.transform(jam_orig):

        assert jam_new is not jam_fixture
        __test_effect(jam_orig, jam_fixture)

        assert not np.allclose(jam_orig.sandbox.muda['_audio']['y'],
                               jam_new.sandbox.muda['_audio']['y'])

        d_state = jam_new.sandbox.muda.history[-1]['state']
        filename = d_state['filename']
        start = d_state['start']
        stop = d_state['stop']

        with psf.SoundFile(str(filename), mode='r') as soundf:
            max_index = len(soundf)
            noise_sr = soundf.samplerate

        assert 0 <= start < stop
        assert start < stop <= max_index
        assert ((stop - start) / float(noise_sr)) == orig_duration

        __test_effect(jam_orig, jam_new)
        n_out += 1

    assert n_out == n_samples
    # Serialization test
    D2 = muda.deserialize(muda.serialize(D))
    __test_params(D, D2) 
Example #14
Source File: base.py    From crema with BSD 2-Clause "Simplified" License
def predict(self, filename=None, y=None, sr=None, outputs=None):
        '''Predict annotations

        Parameters
        ----------
        filename : str (optional)
            Path to audio file

        y, sr : (optional)
            Audio buffer and sample rate

        outputs : (optional)
            Pre-computed model outputs as produced by `CremaModel.outputs`.
            If provided, then predictions are derived from these instead of
            `filename` or `(y, sr)`.


        .. note:: At least one of `filename`, `y, sr` must be provided.

        Returns
        -------
        jams.Annotation
            The predicted annotation
        '''

        # Pump the input features
        output_key = self.model.output_names[0]

        if outputs is None:
            outputs = self.outputs(filename=filename, y=y, sr=sr)

        # Invert the prediction.  This is always the first output layer.
        ann = self.pump[output_key].inverse(outputs[output_key])

        # Populate the metadata
        ann.annotation_metadata.version = self.version
        ann.annotation_metadata.annotation_tools = 'CREMA {}'.format(version)
        ann.annotation_metadata.data_source = 'program'
        ann.duration = librosa.get_duration(y=y, sr=sr, filename=filename)

        return ann 
Example #15
Source File: test_deformers.py    From muda with ISC License
def __test_duration(jam_orig, jam_shifted, orig_duration):
    # Verify the total duration hasn't changed
    assert librosa.get_duration(**jam_shifted.sandbox.muda['_audio']) == orig_duration

    # Verify the duration of the last delayed annotation is in the valid range:
    # expected duration of the last annotation = total duration - onset of the last annotation
    shifted_data = jam_shifted.search(namespace='chord')[0].data
    ref_duration = orig_duration - shifted_data[-1][0]  # [-1][0] is the 'time' of the last observation
    deformed_duration = shifted_data[-1][1]  # [-1][1] is the 'duration' of the last observation
    isclose_(ref_duration, deformed_duration, rtol=1e-5, atol=1e-1)
Example #16
Source File: pre_processing.py    From audio-source-separation with MIT License
def process(file_path, direc, destination_path, phase_bool, destination_phase_path):
	y, sr = librosa.load(file_path, sr=None)
	duration = librosa.get_duration(y=y, sr=sr)
	regex = re.compile(r'\d+')
	index = regex.findall(direc)
	for start in range(30, 200):

		wave_array, fs = librosa.load(file_path, sr=44100, offset=start*0.3, duration=0.3)

		mag, phase = librosa.magphase(librosa.stft(wave_array, n_fft=1024, hop_length=256, window='hann', center=True))
		if not os.path.exists(destination_path):
			os.makedirs(destination_path)

		# magnitude stored as tensor, phase as np array
		torch.save(torch.from_numpy(np.expand_dims(mag, axis=0)), os.path.join(destination_path, (index[0] + "_" + str(start) + '_m.pt')))
		if phase_bool:
			if not os.path.exists(destination_phase_path):
				os.makedirs(destination_phase_path)
			np.save(os.path.join(destination_phase_path, (index[0] + "_" + str(start) + '_p.npy')), phase)
	return

#--------- training data------------------------------------- 
Example #17
Source File: audio.py    From Multilingual_Text_to_Speech with MIT License
def duration(data):
    """Return duration of an audio signal in seconds."""
    return librosa.get_duration(y=data, sr=hp.sample_rate)
Example #18
Source File: utils.py    From vadnet with GNU Lesser General Public License v3.0
def audio_dur(path, ext='', root=''):
    path = os.path.join(root, '{}{}'.format(path, ext))
    try:
        return lr.get_duration(filename=path)
    except Exception as ex:        
        print_err('could not read {}\n{}'.format(path, ex))
        return 0 
Example #19
Source File: eda_vlsp.py    From automatic_speech_recognition with GNU General Public License v3.0
def stat_acoustic():
    print("\nAcoustic Data:")
    wav_folder = join(ROOT_FOLDER, "data", "vlsp", "wav")
    files = listdir(wav_folder)
    files = [join(wav_folder, file) for file in files]
    durations = [librosa.get_duration(filename=file) for file in files]
    durations = pd.Series(durations)
    print(f"Total: {durations.sum():.2f} seconds ({durations.sum() / 3600:.2f} hours)")
    print(durations.describe()) 
Example #20
Source File: mel.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the Mel spectrogram

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, n_mels)
                The Mel spectrogram
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        mel = np.sqrt(melspectrogram(y=y, sr=self.sr,
                                     n_fft=self.n_fft,
                                     hop_length=self.hop_length,
                                     n_mels=self.n_mels,
                                     fmax=self.fmax))

        mel = fix_length(mel, n_frames)

        if self.log:
            mel = amplitude_to_db(mel, ref=np.max)

        # Type convert
        mel = to_dtype(mel, self.dtype)

        return {'mag': mel.T[self.idx]} 
Example #21
Source File: cqt.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the CQT

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape = (n_frames, n_bins)
                The CQT magnitude

            data['phase']: np.ndarray, shape = mag.shape
                The CQT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
                fmin=self.fmin,
                n_bins=(self.n_octaves * self.over_sample * 12),
                bins_per_octave=(self.over_sample * 12))

        C = fix_length(C, n_frames)

        cqtm, phase = magphase(C)
        if self.log:
            cqtm = amplitude_to_db(cqtm, ref=np.max)

        dphase = phase_diff(np.angle(phase).T[self.idx], self.conv)

        return {'mag': to_dtype(cqtm.T[self.idx], self.dtype),
                'dphase': to_dtype(dphase, self.dtype)} 
Example #22
Source File: cqt.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the CQT

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape = (n_frames, n_bins)
                The CQT magnitude

            data['phase']: np.ndarray, shape = mag.shape
                The CQT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
                fmin=self.fmin,
                n_bins=(self.n_octaves * self.over_sample * 12),
                bins_per_octave=(self.over_sample * 12))

        C = fix_length(C, n_frames)

        cqtm, phase = magphase(C)
        if self.log:
            cqtm = amplitude_to_db(cqtm, ref=np.max)

        return {'mag': to_dtype(cqtm.T[self.idx], self.dtype),
                'phase': to_dtype(np.angle(phase).T[self.idx], self.dtype)} 
Example #23
Source File: fft.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the STFT magnitude and phase differential.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT magnitude

            data['dphase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        D = stft(y, hop_length=self.hop_length,
                 n_fft=self.n_fft)

        D = fix_length(D, n_frames)

        mag, phase = magphase(D)
        if self.log:
            mag = amplitude_to_db(mag, ref=np.max)

        phase = phase_diff(np.angle(phase.T)[self.idx], self.conv)

        return {'mag': to_dtype(mag.T[self.idx], self.dtype),
                'dphase': to_dtype(phase, self.dtype)} 
Example #24
Source File: fft.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the STFT magnitude and phase.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT magnitude

            data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
                STFT phase
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        D = stft(y, hop_length=self.hop_length,
                 n_fft=self.n_fft)

        D = fix_length(D, n_frames)

        mag, phase = magphase(D)
        if self.log:
            mag = amplitude_to_db(mag, ref=np.max)

        return {'mag': to_dtype(mag.T[self.idx], self.dtype),
                'phase': to_dtype(np.angle(phase.T)[self.idx], self.dtype)} 
Example #25
Source File: time.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the time position encoding

        Parameters
        ----------
        y : np.ndarray
            Audio buffer

        Returns
        -------
        data : dict
            data['relative'] = np.ndarray, shape=(n_frames, 2)
            data['absolute'] = np.ndarray, shape=(n_frames, 2)

                Relative and absolute time positional encodings.
        '''

        duration = get_duration(y=y, sr=self.sr)
        n_frames = self.n_frames(duration)

        relative = np.zeros((n_frames, 2), dtype=np.float32)
        relative[:, 0] = np.cos(np.pi * np.linspace(0, 1, num=n_frames))
        relative[:, 1] = np.sin(np.pi * np.linspace(0, 1, num=n_frames))

        absolute = relative * np.sqrt(duration)

        return {'relative': to_dtype(relative[self.idx], self.dtype),
                'absolute': to_dtype(absolute[self.idx], self.dtype)} 
Example #26
Source File: Input.py    From vimss with GNU General Public License v3.0
def randomPositionInAudio(audio_path, duration):
    length = librosa.get_duration(filename=audio_path)
    if duration >= length:
        return 0.0, None
    else:
        offset = np.random.uniform() * (length - duration)
        return offset, duration 
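The offset returned above is meant to feed librosa.load's offset/duration arguments; when the requested duration exceeds the file length, the (0.0, None) return makes load fall back to reading the whole file. A usage sketch (the path is a placeholder):

import librosa

offset, duration = randomPositionInAudio('example.wav', 3.0)
y, sr = librosa.load('example.wav', sr=None, offset=offset, duration=duration)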
Example #27
Source File: analyze.py    From crema with BSD 2-Clause "Simplified" License
def analyze(filename=None, y=None, sr=None):
    '''Analyze a recording for all tasks.

    Parameters
    ----------
    filename : str, optional
        Path to audio file

    y : np.ndarray, optional
    sr : number > 0, optional
        Audio buffer and sampling rate

    .. note:: At least one of `filename` or `y, sr` must be provided.

    Returns
    -------
    jam : jams.JAMS
        a JAMS object containing all estimated annotations

    Examples
    --------
    >>> from crema.analyze import analyze
    >>> import librosa
    >>> jam = analyze(filename=librosa.util.example_audio_file())
    >>> jam
    <JAMS(file_metadata=<FileMetadata(...)>,
          annotations=[1 annotation],
          sandbox=<Sandbox(...)>)>
    >>> # Get the chord estimates
    >>> chords = jam.annotations['chord', 0]
    >>> chords.to_dataframe().head(5)
           time  duration  value  confidence
    0  0.000000  0.092880  E:maj    0.336977
    1  0.092880  0.464399    E:7    0.324255
    2  0.557279  1.021678  E:min    0.448759
    3  1.578957  2.693515  E:maj    0.501462
    4  4.272472  1.486077  E:min    0.287264
    '''

    _load_models()

    jam = jams.JAMS()
    # populate file metadata

    jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr,
                                                      filename=filename)

    for model in __MODELS__:
        jam.annotations.append(model.predict(filename=filename, y=y, sr=sr))

    return jam 
Example #28
Source File: speech_cls_task.py    From delta with Apache License 2.0
def get_class_files_duration(self):
    ''' dirnames under the dataset are class names;
     all data_path entries share the same dirnames '''
    classes = None
    for root, dirnames, filenames in os.walk(self._data_path[0]):
      classes = dirnames
      break

    assert classes, 'can not access {}'.format(self._data_path[0])
    assert set(classes) == set(self._classes.keys()), '{} {}'.format(
        classes, self._classes.keys())

    def _get_class(path):
      ret = None
      for cls in self._classes:
        if cls in path:
          ret = cls
      return ret

    # to exclude some data under some dir
    excludes = []
    #pylint: disable=too-many-nested-blocks
    for data_path in self._data_path:
      logging.debug("data path: {}".format(data_path))
      for root, dirname, filenames in os.walk(data_path):
        del dirname
        for filename in filenames:
          if filename.endswith(self._file_suffix):
            class_name = _get_class(root)  # 'conflict' or 'normal' str
            assert class_name is not None
            filename = os.path.join(root, filename)

            if excludes:
              for exclude in excludes:
                if exclude in filename:
                  pass

            duration = self.get_duration(
                filename=filename, sr=self._sample_rate)
            self._class_file[class_name].append(
                (filename, duration, class_name))
          else:
            pass

    if not self._class_file:
      logging.debug("class file: {}".format(self._class_file))
      logging.warn("maybe no file with suffix {} exists".format(
          self._file_suffix))
Example #29
Source File: convert.py    From ZeroSpeech-TTS-without-T with MIT License
def encode_for_tacotron(target, trainer, seg_len, multi2idx_path, wav_path, result_path):
	wavs = sorted(glob.glob(os.path.join(wav_path, '*.wav')))
	print('[Converter] - Number of wav files to encode: ', len(wavs))

	names = []
	enc_outputs = []

	for wav_path in tqdm(wavs):
		name = wav_path.split('/')[-1].split('.')[0]
		s_id = name.split('_')[0]
		u_id = name.split('_')[1]
		if s_id != target:
			continue

		y, sr = librosa.load(wav_path)
		d = librosa.get_duration(y=y, sr=sr)
		if d > 25:
			continue  # filters out utterances that are too long (3523/3533 remain for V001 and V002 together in the English dataset)
		

		_, spec = get_spectrograms(wav_path)
		encodings = encode(spec, trainer, seg_len, save=False)
		encodings = parse_encodings(encodings)
		enc_outputs.append(encodings)
		names.append((s_id, u_id))

	# build encodings to character mapping
	idx = 0
	multi2idx = {}
	print('[Converter] - Building encoding to symbol mapping...')
	for encodings in tqdm(enc_outputs):
		for encoding in encodings:
			if str(encoding) not in multi2idx:
				multi2idx[str(encoding)] = symbols[idx]
				idx += 1

	print('[Converter] - Number of unique discrete units: ', len(multi2idx))
	with open(multi2idx_path, 'w') as file:
		file.write(json.dumps(multi2idx))
	
	result_path = result_path.replace('target', target)	
	print('[Converter] - Writing to meta file...')
	with open(result_path, 'w') as file:
		for i, encodings in enumerate(enc_outputs):
			file.write(str(names[i][0]) + '_' + str(names[i][1]) + '|')
			for encoding in encodings:
				file.write(multi2idx[str(encoding)])
			file.write('\n') 
Example #30
Source File: cqt.py    From pumpp with ISC License
def transform_audio(self, y):
        '''Compute the HCQT

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape = (n_frames, n_bins, n_harmonics)
                The CQT magnitude

            data['phase']: np.ndarray, shape = mag.shape
                The CQT phase
        '''
        cqtm, phase = [], []

        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        for h in self.harmonics:
            C = cqt(y=y, sr=self.sr, hop_length=self.hop_length,
                    fmin=self.fmin * h,
                    n_bins=(self.n_octaves * self.over_sample * 12),
                    bins_per_octave=(self.over_sample * 12))

            C = fix_length(C, n_frames)

            C, P = magphase(C)
            if self.log:
                C = amplitude_to_db(C, ref=np.max)
            cqtm.append(C)
            phase.append(P)

        cqtm = to_dtype(np.asarray(cqtm), self.dtype)
        phase = np.angle(np.asarray(phase))

        dphase = to_dtype(phase_diff(self._index(phase), self.conv),
                          self.dtype)

        return {'mag': self._index(cqtm),
                'dphase': dphase}