Python python_speech_features.fbank() Examples

The following are 10 code examples of python_speech_features.fbank(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to the original project or source file. You may also want to check out all available functions/classes of the module python_speech_features, or try the search function.
Example #1
Source File: make_fbanks.py    From pase with MIT License 5 votes vote down vote up
def wav2fbank(args):
    """Extract filterbank features from one wav file and save them to disk.

    Args:
        args: a 4-item sequence ``(wavname, out_dir, nfilt, log)`` holding
            the input audio path, the output directory, the number of
            filters handed to ``fbank`` and a flag that, when truthy,
            stores the natural log of the features instead of the raw
            values.

    The features are written with ``np.save`` to
    ``<out_dir>/<wav basename without extension>.fb``. NOTE: NumPy's
    ``save`` appends ``.npy`` when the name lacks that suffix, so the
    file on disk is expected to end in ``.fb.npy``.
    """
    wav_path, target_dir, num_filters, take_log = args
    signal, sample_rate = sf.read(wav_path)
    # fbank also returns per-frame energies; only the features are stored.
    feats, _energy = fbank(signal, sample_rate, nfilt=num_filters)
    feats = np.log(feats) if take_log else feats
    stem, _ext = os.path.splitext(os.path.basename(wav_path))
    np.save(os.path.join(target_dir, stem + '.fb'), feats)
Example #2
Source File: features.py    From phoneme_recognition with MIT License 5 votes vote down vote up
def get_features(filename, numcep, numfilt, winlen, winstep, grad):
    """Compute normalized cepstral features (plus log energy) for a file.

    Args:
        filename: path of the audio file, opened read-only with ``Sndfile``.
        numcep: number of cepstral coefficients kept per frame (the DCT
            output is truncated to its first ``numcep`` columns).
        numfilt: number of filters passed to ``fbank`` as ``nfilt``.
        winlen: analysis window length forwarded to ``fbank``.
        winstep: step between windows forwarded to ``fbank``.
        grad: ``>= 1`` appends the gradient of the feature matrix along
            the frame axis; exactly ``2`` additionally appends the
            gradient of that gradient.

    Returns:
        A tuple ``(mat, frames, samplerate)`` where ``mat`` is the
        per-frame feature matrix, ``frames`` is the frame count reported
        by the file and ``samplerate`` is its sample rate.
    """
    f = Sndfile(filename, 'r')
    try:
        frames = f.nframes
        samplerate = f.samplerate
        data = np.asarray(f.read_frames(frames))
    finally:
        # The handle was previously left open; release it even when the
        # read fails.
        f.close()

    #calc mfcc
    feat_raw, energy = sf.fbank(data, samplerate, winlen, winstep, nfilt=numfilt)
    feat = np.log(feat_raw)
    feat = sf.dct(feat, type=2, axis=1, norm='ortho')[:, :numcep]
    feat = sf.lifter(feat, L=22)
    feat = np.asarray(feat)

    #calc log energy
    log_energy = np.log(energy)
    log_energy = log_energy.reshape([log_energy.shape[0], 1])

    # Center every coefficient over the frames and scale by half of its
    # standard deviation; log energy is appended un-normalized.
    mat = (feat - np.mean(feat, axis=0)) / (0.5 * np.std(feat, axis=0))
    mat = np.concatenate((mat, log_energy), axis=1)

    #calc first order derivatives
    if grad >= 1:
        # np.gradient returns one array per axis; index 0 is the frame axis.
        gradf = np.gradient(mat)[0]
        mat = np.concatenate((mat, gradf), axis=1)

    #calc second order derivatives
    if grad == 2:
        grad2f = np.gradient(gradf)[0]
        mat = np.concatenate((mat, grad2f), axis=1)

    return mat, frames, samplerate
Example #3
Source File: audio_processing.py    From DeepSpeaker-pytorch with MIT License 5 votes vote down vote up
def mk_MFB(filename, sample_rate=c.SAMPLE_RATE,use_delta = c.USE_DELTA,use_scale = c.USE_SCALE,use_logscale = c.USE_LOGSCALE):
    """Compute filterbank features for an audio file and save them as .npy.

    The file is loaded as mono audio at ``sample_rate``, filterbank
    features are computed with a 0.025 window length and ``c.FILTER_BANK``
    filters, and the result is stored next to the input with every
    ``'.wav'`` in the path replaced by ``'.npy'``.

    Args:
        filename: path of the audio file to process.
        sample_rate: rate used both for loading and for ``fbank``.
        use_delta: when truthy, first and second order deltas are stacked
            horizontally after the filterbank features.
        use_scale: forwarded to ``normalize_frames`` as ``Scale``.
        use_logscale: when truthy, features are converted with
            ``20 * log10`` after being floored at ``1e-5``.

    Returns:
        None; the features are only written to disk.
    """
    audio, _sr = librosa.load(filename, sr=sample_rate, mono=True)

    fbanks, _energies = fbank(audio, samplerate=sample_rate, nfilt=c.FILTER_BANK, winlen=0.025)

    if use_logscale:
        # Floor before the log so zero-valued bins do not produce -inf.
        fbanks = 20 * np.log10(np.maximum(fbanks, 1e-5))

    if use_delta:
        # Deltas are derived from the un-normalized features; each matrix
        # is normalized independently afterwards.
        first_order = delta(fbanks, N=1)
        second_order = delta(first_order, N=1)
        normalized = [
            normalize_frames(block, Scale=use_scale)
            for block in (fbanks, first_order, second_order)
        ]
        frames_features = np.hstack(normalized)
    else:
        frames_features = normalize_frames(fbanks, Scale=use_scale)

    out_path = filename.replace('.wav', '.npy')
    np.save(out_path, frames_features)

    return None
Example #4
Source File: audio_processing.py    From DeepSpeaker-pytorch with MIT License 5 votes vote down vote up
def pre_process_inputs(signal=np.random.uniform(size=32000), target_sample_rate=8000,use_delta = c.USE_DELTA):
    """Build one randomly positioned window of normalized features.

    Filterbank features and their first/second order deltas are computed
    and normalized; a single slice of
    ``c.NUM_PREVIOUS_FRAME + c.NUM_NEXT_FRAME`` consecutive frames is then
    cut out at a random position.

    Args:
        signal: audio samples passed to ``fbank``. The default is a random
            array created once, when the function is defined.
        target_sample_rate: sample rate passed to ``fbank``.
        use_delta: when truthy, the deltas are stacked horizontally after
            the filterbank features; otherwise only the filterbank
            features are used.

    Returns:
        A numpy array holding exactly one frame slice (leading axis of
        length 1).
    """
    fbanks, _energies = fbank(signal, samplerate=target_sample_rate, nfilt=c.FILTER_BANK, winlen=0.025)
    first_order = delta(fbanks, N=1)
    second_order = delta(first_order, N=1)

    # Normalize each matrix independently, in the same order as before.
    fbanks, first_order, second_order = (
        normalize_frames(block)
        for block in (fbanks, first_order, second_order)
    )

    frames_features = (
        np.hstack([fbanks, first_order, second_order]) if use_delta else fbanks
    )

    # An exhaustive sliding-window extraction over every valid centre
    # frame used to live here ("Too complicated"); a single random window
    # is taken instead.
    import random
    lower = c.NUM_PREVIOUS_FRAME
    upper = len(frames_features) - c.NUM_NEXT_FRAME
    j = random.randrange(lower, upper)
    window = frames_features[j - c.NUM_PREVIOUS_FRAME:j + c.NUM_NEXT_FRAME]
    return np.array([window])
Example #5
Source File: speech.py    From neuralmonkey with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def SpeechFeaturesPreprocessor(feature_type: str = "mfcc",
                               delta_order: int = 0,
                               delta_window: int = 2,
                               **kwargs) -> Callable:
    """Create a function that turns audio into speech features.

    The returned callable computes the base features selected by
    `feature_type` (extra keyword arguments such as `winlen` and `winstep`
    are forwarded to the underlying extractor; see
    http://python-speech-features.readthedocs.io/) and then appends delta
    features, each order derived from the previous one, up to
    `delta_order`.

    With the defaults, 13 MFCCs per frame are produced; `delta_order=2`
    adds delta and delta-delta features for 39 coefficients per frame.

    Arguments:
        feature_type: mfcc, fbank, logfbank or ssc (default is mfcc)
        delta_order: maximum order of the delta features (default is 0)
        delta_window: window size for delta features (default is 2)
        **kwargs: keyword arguments for the appropriate function from
            python_speech_features

    Returns:
        A callable mapping an audio object to a numpy array of shape
        [num_frames, num_features].

    Raises:
        ValueError: if `feature_type` is not a key of FEATURE_TYPES.
    """

    if feature_type not in FEATURE_TYPES:
        raise ValueError(
            "Unknown speech feature type '{}'".format(feature_type))

    def preprocess(audio: Audio) -> np.ndarray:
        extractor = FEATURE_TYPES[feature_type]
        latest = extractor(audio.data, samplerate=audio.rate, **kwargs)
        blocks = [latest]

        # Every delta order is computed from the block appended just before.
        for _ in range(delta_order):
            latest = delta(latest, delta_window)
            blocks.append(latest)

        return np.concatenate(blocks, axis=1)

    return preprocess
Example #6
Source File: speech.py    From neuralmonkey with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _fbank(*args, **kwargs) -> np.ndarray:
    """Call ``fbank`` and return only its first result (the features)."""
    features, _second = fbank(*args, **kwargs)
    return features
Example #7
Source File: speech.py    From neuralmonkey with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def SpeechFeaturesPreprocessor(feature_type: str = "mfcc",
                               delta_order: int = 0,
                               delta_window: int = 2,
                               **kwargs) -> Callable:
    """Create a function that turns audio into speech features.

    The returned callable computes the base features selected by
    `feature_type` (extra keyword arguments such as `winlen` and `winstep`
    are forwarded to the underlying extractor; see
    http://python-speech-features.readthedocs.io/) and then appends delta
    features, each order derived from the previous one, up to
    `delta_order`.

    With the defaults, 13 MFCCs per frame are produced; `delta_order=2`
    adds delta and delta-delta features for 39 coefficients per frame.

    Arguments:
        feature_type: mfcc, fbank, logfbank or ssc (default is mfcc)
        delta_order: maximum order of the delta features (default is 0)
        delta_window: window size for delta features (default is 2)
        **kwargs: keyword arguments for the appropriate function from
            python_speech_features

    Returns:
        A callable mapping an audio object to a numpy array of shape
        [num_frames, num_features].

    Raises:
        ValueError: if `feature_type` is not a key of FEATURE_TYPES.
    """

    if feature_type not in FEATURE_TYPES:
        raise ValueError(
            "Unknown speech feature type '{}'".format(feature_type))

    def preprocess(audio: Audio) -> np.ndarray:
        extractor = FEATURE_TYPES[feature_type]
        latest = extractor(audio.data, samplerate=audio.rate, **kwargs)
        blocks = [latest]

        # Every delta order is computed from the block appended just before.
        for _ in range(delta_order):
            latest = delta(latest, delta_window)
            blocks.append(latest)

        return np.concatenate(blocks, axis=1)

    return preprocess
Example #8
Source File: speech.py    From neuralmonkey with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _fbank(*args, **kwargs) -> np.ndarray:
    """Call ``fbank`` and return only its first result (the features)."""
    features, _second = fbank(*args, **kwargs)
    return features
Example #9
Source File: speech.py    From neuralmonkey with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def SpeechFeaturesPreprocessor(feature_type: str = "mfcc",
                               delta_order: int = 0,
                               delta_window: int = 2,
                               **kwargs) -> Callable:
    """Create a function that turns audio into speech features.

    The returned callable computes the base features selected by
    `feature_type` (extra keyword arguments such as `winlen` and `winstep`
    are forwarded to the underlying extractor; see
    http://python-speech-features.readthedocs.io/) and then appends delta
    features, each order derived from the previous one, up to
    `delta_order`.

    With the defaults, 13 MFCCs per frame are produced; `delta_order=2`
    adds delta and delta-delta features for 39 coefficients per frame.

    Arguments:
        feature_type: mfcc, fbank, logfbank or ssc (default is mfcc)
        delta_order: maximum order of the delta features (default is 0)
        delta_window: window size for delta features (default is 2)
        **kwargs: keyword arguments for the appropriate function from
            python_speech_features

    Returns:
        A callable mapping an audio object to a numpy array of shape
        [num_frames, num_features].

    Raises:
        ValueError: if `feature_type` is not a key of FEATURE_TYPES.
    """

    if feature_type not in FEATURE_TYPES:
        raise ValueError(
            "Unknown speech feature type '{}'".format(feature_type))

    def preprocess(audio: Audio) -> np.ndarray:
        extractor = FEATURE_TYPES[feature_type]
        latest = extractor(audio.data, samplerate=audio.rate, **kwargs)
        blocks = [latest]

        # Every delta order is computed from the block appended just before.
        for _ in range(delta_order):
            latest = delta(latest, delta_window)
            blocks.append(latest)

        return np.concatenate(blocks, axis=1)

    return preprocess
Example #10
Source File: audio.py    From bootcamp with Apache License 2.0 5 votes vote down vote up
def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
    """Return normalized filterbank features of ``signal`` as float32.

    ``fbank`` is run with ``NUM_FBANKS`` filters and its features are
    passed through ``normalize_frames``; the energies are discarded.

    NOTE(review): stacking of first/second order deltas into a trailing
    axis of size 3 is disabled here, so the result presumably has shape
    (num_frames, n_filters) rather than (num_frames, n_filters, 3) --
    confirm against callers.
    """
    fbanks, _energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
    normalized = normalize_frames(fbanks)
    # Float32 precision is enough here.
    return np.array(normalized, dtype=np.float32)