Python python_speech_features.fbank() Examples
The following are 10 code examples of python_speech_features.fbank().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module python_speech_features, or try the search function.
Example #1
Source File: make_fbanks.py From pase with MIT License | 5 votes |
def wav2fbank(args):
    """Compute filterbank features for one wav file and store them on disk.

    Arguments:
        args: a 4-item sequence ``(wavname, out_dir, nfilt, log)``:
            the input audio path, the output directory, the number of
            filters passed to ``fbank`` and a flag that, when truthy,
            applies ``np.log`` to the features before saving.

    The features are written with ``np.save`` to
    ``<out_dir>/<basename of wavname without extension>.fb``; note that
    ``np.save`` appends ``.npy`` to a file name that lacks that suffix.
    """
    wav_path, target_dir, num_filters, take_log = args

    samples, sample_rate = sf.read(wav_path)
    # The per-frame energy returned by fbank is not used here.
    features, _energy = fbank(samples, sample_rate, nfilt=num_filters)
    if take_log:
        features = np.log(features)

    stem, _extension = os.path.splitext(os.path.basename(wav_path))
    destination = os.path.join(target_dir, stem + '.fb')
    np.save(destination, features)
Example #2
Source File: features.py From phoneme_recognition with MIT License | 5 votes |
def get_features(filename, numcep, numfilt, winlen, winstep, grad):
    """Read an audio file and build a normalized cepstral feature matrix.

    Arguments:
        filename: path of the audio file opened with ``Sndfile``.
        numcep: number of DCT coefficients kept per frame.
        numfilt: number of filters passed to ``sf.fbank``.
        winlen: window length passed positionally to ``sf.fbank``.
        winstep: window step passed positionally to ``sf.fbank``.
        grad: derivative order to append; ``>= 1`` adds the first-order
            gradient, ``2`` additionally adds the second-order gradient.

    Returns:
        A tuple ``(mat, frames, samplerate)`` where ``mat`` is the feature
        matrix, ``frames`` is the file's ``nframes`` and ``samplerate`` is
        the file's sample rate.
    """
    sound_file = Sndfile(filename, 'r')
    frames = sound_file.nframes
    samplerate = sound_file.samplerate
    samples = np.asarray(sound_file.read_frames(frames))

    # Cepstral coefficients: log filterbank -> DCT -> keep numcep -> lifter.
    filterbank, energy = sf.fbank(samples, samplerate, winlen, winstep,
                                  nfilt=numfilt)
    cepstra = sf.dct(np.log(filterbank), type=2, axis=1,
                     norm='ortho')[:, :numcep]
    cepstra = np.asarray(sf.lifter(cepstra, L=22))

    # Log energy as a single column so it can be appended to the features.
    log_energy = np.log(energy)
    energy_column = log_energy.reshape((log_energy.shape[0], 1))

    # Per-coefficient normalization: zero mean, scaled by half the standard
    # deviation. The log-energy column is appended un-normalized.
    centered = cepstra - np.mean(cepstra, axis=0)
    half_std = 0.5 * np.std(cepstra, axis=0)
    mat = np.concatenate((centered / half_std, energy_column), axis=1)

    if grad >= 1:
        # First-order derivative along the first axis of the feature matrix.
        first_order = np.gradient(mat)[0]
        mat = np.concatenate((mat, first_order), axis=1)
        if grad == 2:
            # Second-order derivative, taken from the first-order one.
            second_order = np.gradient(first_order)[0]
            mat = np.concatenate((mat, second_order), axis=1)

    return mat, frames, samplerate
Example #3
Source File: audio_processing.py From DeepSpeaker-pytorch with MIT License | 5 votes |
def mk_MFB(filename, sample_rate=c.SAMPLE_RATE, use_delta=c.USE_DELTA,
           use_scale=c.USE_SCALE, use_logscale=c.USE_LOGSCALE):
    """Extract filterbank features from an audio file and save them.

    Arguments:
        filename: path of the audio file loaded with ``librosa.load``.
        sample_rate: sample rate requested from ``librosa.load`` and passed
            to ``fbank``.
        use_delta: when truthy, first- and second-order deltas are stacked
            next to the filterbank features.
        use_scale: forwarded to ``normalize_frames`` as ``Scale``.
        use_logscale: when truthy, features are converted with
            ``20 * log10`` after flooring them at ``1e-5``.

    The result is written with ``np.save`` to ``filename`` with ``'.wav'``
    replaced by ``'.npy'``. Nothing is returned.
    """
    audio, _sr = librosa.load(filename, sr=sample_rate, mono=True)
    filter_banks, _energies = fbank(audio, samplerate=sample_rate,
                                    nfilt=c.FILTER_BANK, winlen=0.025)

    if use_logscale:
        # Floor the values first so the logarithm never sees zero.
        filter_banks = 20 * np.log10(np.maximum(filter_banks, 1e-5))

    if use_delta:
        # Deltas are derived from the un-normalized features; all three
        # matrices are normalized afterwards, in this order.
        first_delta = delta(filter_banks, N=1)
        second_delta = delta(first_delta, N=1)
        normalized = [normalize_frames(block, Scale=use_scale)
                      for block in (filter_banks, first_delta, second_delta)]
        frames_features = np.hstack(normalized)
    else:
        frames_features = normalize_frames(filter_banks, Scale=use_scale)

    np.save(filename.replace('.wav', '.npy'), frames_features)
Example #4
Source File: audio_processing.py From DeepSpeaker-pytorch with MIT License | 5 votes |
def pre_process_inputs(signal=np.random.uniform(size=32000),
                       target_sample_rate=8000, use_delta=c.USE_DELTA):
    """Turn a raw signal into one randomly positioned window of features.

    Arguments:
        signal: the input signal handed to ``fbank``. The default is a
            random array created once, when the function is defined.
        target_sample_rate: sample rate passed to ``fbank``.
        use_delta: when truthy, normalized first- and second-order deltas
            are stacked next to the normalized filterbank features.

    Returns:
        ``np.array`` wrapping a one-element list whose item is the slice of
        frames from ``j - c.NUM_PREVIOUS_FRAME`` up to (not including)
        ``j + c.NUM_NEXT_FRAME``, with ``j`` drawn at random.
    """
    import random

    banks, _energies = fbank(signal, samplerate=target_sample_rate,
                             nfilt=c.FILTER_BANK, winlen=0.025)

    # Deltas are computed from the un-normalized features and are always
    # computed and normalized, even when use_delta is falsy.
    first_delta = delta(banks, N=1)
    second_delta = delta(first_delta, N=1)
    banks = normalize_frames(banks)
    first_delta = normalize_frames(first_delta)
    second_delta = normalize_frames(second_delta)

    frames_features = (np.hstack([banks, first_delta, second_delta])
                       if use_delta else banks)
    total_frames = len(frames_features)

    # A previous variant collected every window for j in
    # range(c.NUM_PREVIOUS_FRAME, num_frames - c.NUM_NEXT_FRAME); it was
    # disabled as "too complicated", so only one random window is taken.
    center = random.randrange(c.NUM_PREVIOUS_FRAME,
                              total_frames - c.NUM_NEXT_FRAME)
    window = frames_features[center - c.NUM_PREVIOUS_FRAME:
                             center + c.NUM_NEXT_FRAME]
    return np.array([window])
Example #5
Source File: speech.py From neuralmonkey with BSD 3-Clause "New" or "Revised" License | 5 votes |
def SpeechFeaturesPreprocessor(feature_type: str = "mfcc",
                               delta_order: int = 0,
                               delta_window: int = 2,
                               **kwargs) -> Callable:
    """Build a function that computes speech features for an audio object.

    The returned function first computes the requested type of features
    (e.g. MFCC); keyword arguments such as `winlen` and `winstep` are
    forwarded to the feature function, see
    http://python-speech-features.readthedocs.io/ for the options of each
    feature type. Delta features up to `delta_order` are then appended,
    each order being derived from the previous one.

    By default, 13 MFCCs per frame are computed. To add delta and
    delta-delta features (resulting in 39 coefficients per frame), set
    `delta_order=2`.

    Arguments:
        feature_type: mfcc, fbank, logfbank or ssc (default is mfcc)
        delta_order: maximum order of the delta features (default is 0)
        delta_window: window size for delta features (default is 2)
        **kwargs: keyword arguments for the appropriate function from
            python_speech_features

    Returns:
        A callable that takes an audio object (its `data` and `rate`
        attributes are read) and returns a numpy array of shape
        [num_frames, num_features].

    Raises:
        ValueError: if `feature_type` is not a key of `FEATURE_TYPES`.
    """
    if feature_type not in FEATURE_TYPES:
        raise ValueError(
            "Unknown speech feature type '{}'".format(feature_type))

    def preprocess(audio: Audio) -> np.ndarray:
        extractor = FEATURE_TYPES[feature_type]
        latest = extractor(audio.data, samplerate=audio.rate, **kwargs)
        stacked = [latest]

        # Each delta order is computed from the block appended just before.
        for _ in range(delta_order):
            latest = delta(latest, delta_window)
            stacked.append(latest)

        return np.concatenate(stacked, axis=1)

    return preprocess
Example #6
Source File: speech.py From neuralmonkey with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _fbank(*args, **kwargs) -> np.ndarray:
    """Call ``fbank`` with the given arguments and return only the features.

    ``fbank`` yields a pair; the second item is discarded.
    """
    features, _discarded = fbank(*args, **kwargs)
    return features
Example #7
Source File: speech.py From neuralmonkey with BSD 3-Clause "New" or "Revised" License | 5 votes |
def SpeechFeaturesPreprocessor(feature_type: str = "mfcc",
                               delta_order: int = 0,
                               delta_window: int = 2,
                               **kwargs) -> Callable:
    """Build a function that computes speech features for an audio object.

    The returned function first computes the requested type of features
    (e.g. MFCC); keyword arguments such as `winlen` and `winstep` are
    forwarded to the feature function, see
    http://python-speech-features.readthedocs.io/ for the options of each
    feature type. Delta features up to `delta_order` are then appended,
    each order being derived from the previous one.

    By default, 13 MFCCs per frame are computed. To add delta and
    delta-delta features (resulting in 39 coefficients per frame), set
    `delta_order=2`.

    Arguments:
        feature_type: mfcc, fbank, logfbank or ssc (default is mfcc)
        delta_order: maximum order of the delta features (default is 0)
        delta_window: window size for delta features (default is 2)
        **kwargs: keyword arguments for the appropriate function from
            python_speech_features

    Returns:
        A callable that takes an audio object (its `data` and `rate`
        attributes are read) and returns a numpy array of shape
        [num_frames, num_features].

    Raises:
        ValueError: if `feature_type` is not a key of `FEATURE_TYPES`.
    """
    if feature_type not in FEATURE_TYPES:
        raise ValueError(
            "Unknown speech feature type '{}'".format(feature_type))

    def preprocess(audio: Audio) -> np.ndarray:
        extractor = FEATURE_TYPES[feature_type]
        latest = extractor(audio.data, samplerate=audio.rate, **kwargs)
        stacked = [latest]

        # Each delta order is computed from the block appended just before.
        for _ in range(delta_order):
            latest = delta(latest, delta_window)
            stacked.append(latest)

        return np.concatenate(stacked, axis=1)

    return preprocess
Example #8
Source File: speech.py From neuralmonkey with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _fbank(*args, **kwargs) -> np.ndarray:
    """Call ``fbank`` with the given arguments and return only the features.

    ``fbank`` yields a pair; the second item is discarded.
    """
    features, _discarded = fbank(*args, **kwargs)
    return features
Example #9
Source File: speech.py From neuralmonkey with BSD 3-Clause "New" or "Revised" License | 5 votes |
def SpeechFeaturesPreprocessor(feature_type: str = "mfcc",
                               delta_order: int = 0,
                               delta_window: int = 2,
                               **kwargs) -> Callable:
    """Build a function that computes speech features for an audio object.

    The returned function first computes the requested type of features
    (e.g. MFCC); keyword arguments such as `winlen` and `winstep` are
    forwarded to the feature function, see
    http://python-speech-features.readthedocs.io/ for the options of each
    feature type. Delta features up to `delta_order` are then appended,
    each order being derived from the previous one.

    By default, 13 MFCCs per frame are computed. To add delta and
    delta-delta features (resulting in 39 coefficients per frame), set
    `delta_order=2`.

    Arguments:
        feature_type: mfcc, fbank, logfbank or ssc (default is mfcc)
        delta_order: maximum order of the delta features (default is 0)
        delta_window: window size for delta features (default is 2)
        **kwargs: keyword arguments for the appropriate function from
            python_speech_features

    Returns:
        A callable that takes an audio object (its `data` and `rate`
        attributes are read) and returns a numpy array of shape
        [num_frames, num_features].

    Raises:
        ValueError: if `feature_type` is not a key of `FEATURE_TYPES`.
    """
    if feature_type not in FEATURE_TYPES:
        raise ValueError(
            "Unknown speech feature type '{}'".format(feature_type))

    def preprocess(audio: Audio) -> np.ndarray:
        extractor = FEATURE_TYPES[feature_type]
        latest = extractor(audio.data, samplerate=audio.rate, **kwargs)
        stacked = [latest]

        # Each delta order is computed from the block appended just before.
        for _ in range(delta_order):
            latest = delta(latest, delta_window)
            stacked.append(latest)

        return np.concatenate(stacked, axis=1)

    return preprocess
Example #10
Source File: audio.py From bootcamp with Apache License 2.0 | 5 votes |
def mfcc_fbank(signal: np.array, sample_rate: int):
    """Compute normalized filterbank features for a 1D signal array.

    Arguments:
        signal: 1D signal array passed to ``fbank``.
        sample_rate: sample rate passed to ``fbank``.

    Returns:
        The output of ``normalize_frames`` on the filterbank features,
        converted to a float32 numpy array (float32 precision is enough
        here).

    NOTE(review): an earlier comment advertised the shape
    (num_frames, n_filters, 3). That matched a variant which stacked the
    features with their first- and second-order deltas; the stacking is
    disabled, so the result is presumably (num_frames, n_filters) -
    confirm against normalize_frames.
    """
    banks, _energies = fbank(signal, samplerate=sample_rate,
                             nfilt=NUM_FBANKS)
    normalized = normalize_frames(banks)
    return np.array(normalized, dtype=np.float32)