python source code of pYAAPT

# -*- coding: utf-8 -*-
"""
This a ported version for Python from the YAAPT algorithm. The original MATLAB
program was written by Hongbing Hu and Stephen A.Zahorian.

The YAAPT program, designed for fundamental frequency tracking,
is extremely robust for both high quality and telephone speech.

The YAAPT program was created by the Speech Communication Laboratory of
the state university of New York at Binghamton. The original program is
available at http://www.ws.binghamton.edu/zahorian as free software. Further
information about the program could be found at Stephen A. Zahorian, and
Hongbing Hu, "A spectral/temporal method for robust fundamental frequency
tracking," J. Acoust. Soc. Am. 123(6), June 2008.

It must be noticed that, although this ported version is almost equal to the
original, some few changes were made in order to make the program more "pythonic"
and improve its performance. Nevertheless, the results obtained with both
algorithms were similar.

USAGE:
    pitch = yaapt(signal, <options>)

INPUTS:
    signal: signal object created by amfm_decompy.basic_tools. For more
    information about its properties, please consult the documentation file.

    <options>: must be formated as follows:
               **{'option_name1' : value1, 'option_name2' : value2, ...}
               The default configuration values for all of them are the same as
               in the original version. The main yaapt function in this file
               provides a short description about each option.
               For more information, please refer to the original bibliography.

OUTPUTS:
    pitch: pitch object. For more information about its properties, please
           consult the documentation file.

Version 1.0.9.1
11/Feb/2020 Bernardo J.B. Schmitt - bernardo.jb.schmitt@gmail.com
"""

import numpy as np
import numpy.lib.stride_tricks as stride_tricks
from scipy.signal import firwin, medfilt, lfilter
from scipy.signal.windows import hann, kaiser
import scipy.interpolate as scipy_interp
import amfm_decompy.basic_tools as basic


"""
--------------------------------------------
                Classes.
--------------------------------------------
"""
"""
Auxiliary class to handle the class properties.
"""
class ClassProperty(object):

    def __init__(self, initval=None):
        self.val = initval

    def __get__(self, obj, objtype):
        return self.val

    def __set__(self, obj, val):
        self.val = val


"""
Creates a pitch object.
"""
class PitchObj(object):

    PITCH_HALF = ClassProperty(0)
    PITCH_HALF_SENS = ClassProperty(2.9)
    PITCH_DOUBLE = ClassProperty(0)
    PITCH_DOUBLE_SENS = ClassProperty(2.9)
    SMOOTH_FACTOR = ClassProperty(5)
    SMOOTH = ClassProperty(5)
    PTCH_TYP = ClassProperty(100.0)

    def __init__(self, frame_size, frame_jump, nfft=8192):

        self.nfft = nfft
        self.frame_size = frame_size
        self.frame_jump = frame_jump
        self.noverlap = self.frame_size-self.frame_jump

    def set_energy(self, energy, threshold):
        self.mean_energy = np.mean(energy)
        self.energy = energy/self.mean_energy
        self.vuv = (self.energy > threshold)

    def set_frames_pos(self, frames_pos):
        self.frames_pos = frames_pos
        self.nframes = len(self.frames_pos)

    def set_values(self, samp_values, file_size, interp_tech='pchip'):
        self.samp_values = samp_values
        self.fix()
        self.values = self.upsample(self.samp_values, file_size, 0, 0,
                                    interp_tech)
        self.edges = self.edges_finder(self.values)
        self.interpolate()
        self.values_interp = self.upsample(self.samp_interp, file_size,
                                           self.samp_interp[0],
                                           self.samp_interp[-1], interp_tech)

    """
    For the voiced/unvoiced version of the pitch data, finds the n samples where
    the transitions between these two states occur.
    """
    def edges_finder(self, values):
        vec1 = (np.abs(values[1:]+values[:-1]) > 0)
        vec2 = (np.abs(values[1:]*values[:-1]) == 0)
        edges = np.logical_and(vec1, vec2)
        # The previous logical operation detects where voiced/unvoiced transitions
        # occur. Thus, a 'True' in the edges[n] sample indicates that the sample
        # value[n+1] has a different state than value[n](i.e. if values[n] is
        # voiced, then values[n+1] is unvoiced - and vice-versa). Consequently,
        # the last sample from edges array will always be 'False' and is not
        # calculated (because "there is no n+1 sample" for it. That's why
        # len(edges) = len(values)-1). However, just for sake of comprehension
        # (and also to avoid python warnings about array length mismatchs), I
        # add a 'False' to edges the array. But in pratice, this 'False' is
        # useless.
        edges = np.append(edges,[False])
        index = np.arange(len(values))
        index = index[edges > 0]
        return index.tolist()

    """
    This method corresponds to the first half of the ptch_fix.m file. It tries
    to fix half pitch and double pitch errors.
    """
    def fix(self):
        if self.PITCH_HALF > 0:
            nz_pitch = self.samp_values[self.samp_values > 0]
            idx = self.samp_values < (np.mean(nz_pitch)-self.PITCH_HALF_SENS *
                                      np.std(nz_pitch))
            if self.PITCH_HALF == 1:
                self.samp_values[idx] = 0
            elif self.PITCH_HALF == 2:
                self.samp_values[idx] = 2*self.samp_values[idx]

        if self.PITCH_DOUBLE > 0:
            nz_pitch = self.samp_values[self.samp_values > 0]
            idx = self.samp_values > (np.mean(nz_pitch)+self.PITCH_DOUBLE_SENS *
                                      np.std(nz_pitch))
            if self.PITCH_DOUBLE == 1:
                self.samp_values[idx] = 0
            elif self.PITCH_DOUBLE == 2:
                self.samp_values[idx] = 0.5*self.samp_values[idx]

    """
    Corresponds to the second half of the ptch_fix.m file. Creates the
    interpolated pitch data.
    """
    def interpolate(self):
        pitch = np.zeros((self.nframes))
        pitch[:] = self.samp_values
        pitch2 = medfilt(self.samp_values, self.SMOOTH_FACTOR)

        # This part in the original code is kind of confused and caused
        # some problems with the extrapolated points before the first
        # voiced frame and after the last voiced frame. So, I made some
        # small modifications in order to make it work better.
        edges = self.edges_finder(pitch)
        first_sample = pitch[0]
        last_sample = pitch[-1]

        if len(np.nonzero(pitch2)[0]) < 2:
            pitch[pitch == 0] = self.PTCH_TYP
        else:
            nz_pitch = pitch2[pitch2 > 0]
            pitch2 = scipy_interp.pchip(np.nonzero(pitch2)[0],
                                        nz_pitch)(range(self.nframes))
            pitch[pitch == 0] = pitch2[pitch == 0]
        if self.SMOOTH > 0:
            pitch = medfilt(pitch, self.SMOOTH_FACTOR)
        try:
            if first_sample == 0:
                pitch[:edges[0]-1] = pitch[edges[0]]
            if last_sample == 0:
                pitch[edges[-1]+1:] = pitch[edges[-1]]
        except:
            pass
        self.samp_interp = pitch

    """
    Upsample the pitch data so that it length becomes the same as the speech
    value.
    """
    def upsample(self, samp_values, file_size, first_samp=0, last_samp=0,
                 interp_tech='pchip'):
        if interp_tech is 'step':
            beg_pad = int((self.noverlap)/2)
            up_version = np.zeros((file_size))
            up_version[:beg_pad] = first_samp
            up_version[beg_pad:beg_pad+self.frame_jump*self.nframes] = \
                                    np.repeat(samp_values, self.frame_jump)
            up_version[beg_pad+self.frame_jump*self.nframes:] = last_samp

        elif interp_tech is 'pchip' or 'spline':
            if np.amin(samp_values) > 0:
                if interp_tech is 'pchip':
                    up_version = scipy_interp.pchip(self.frames_pos,
                                                    samp_values)(range(file_size))

                elif interp_tech is 'spline':
                    tck, u_original = scipy_interp.splprep(
                                                [self.frames_pos, samp_values],
                                                u=self.frames_pos)
                    up_version = scipy_interp.splev(range(file_size), tck)[1]
            else:
                beg_pad = int((self.noverlap)/2)
                up_version = np.zeros((file_size))
                up_version[:beg_pad] = first_samp
                voiced_frames = np.nonzero(samp_values)[0]
                edges = np.nonzero((voiced_frames[1:]-voiced_frames[:-1]) > 1)[0]
                edges = np.insert(edges, len(edges), len(voiced_frames)-1)
                voiced_frames = np.split(voiced_frames, edges+1)[:-1]

                for frame in voiced_frames:
                    up_interval = self.frames_pos[frame]
                    tot_interval = np.arange(int(up_interval[0]-(self.frame_jump/2)),
                                          int(up_interval[-1]+(self.frame_jump/2)))

                    if interp_tech is 'pchip' and len(frame) > 2:
                        up_version[tot_interval] = scipy_interp.pchip(
                                                    up_interval,
                                                    samp_values[frame])(tot_interval)

                    elif interp_tech is 'spline' and len(frame) > 3:
                        tck, u_original = scipy_interp.splprep(
                                            [up_interval, samp_values[frame]],
                                             u=up_interval)
                        up_version[tot_interval] = scipy_interp.splev(tot_interval, tck)[1]

                    # MD: In case len(frame)==2, above methods fail.
                    #Use linear interpolation instead.
                    elif len(frame) > 1:
                        up_version[tot_interval] = scipy_interp.interp1d(
                                                    up_interval,
                                                    samp_values[frame],
                                        fill_value='extrapolate')(tot_interval)

                    elif len(frame) == 1:
                        up_version[tot_interval] = samp_values[frame]


                up_version[beg_pad+self.frame_jump*self.nframes:] = last_samp

        return up_version

"""
Creates a bandpass filter object.
"""
class BandpassFilter(object):

    def __init__(self, fs, parameters):

        fs_min = 1000.0
        if (fs > fs_min):
            dec_factor = parameters['dec_factor']
        else:
            dec_factor = 1

        filter_order = parameters['bp_forder']
        f_hp = parameters['bp_low']
        f_lp = parameters['bp_high']

        f1 = f_hp/(fs/2)
        f2 = f_lp/(fs/2)

        self.b = firwin(filter_order+1, [f1, f2], pass_zero=False)
        self.a = 1
        self.dec_factor = dec_factor


"""
--------------------------------------------
                Main function.
--------------------------------------------
"""
def yaapt(signal, **kwargs):

    # Rename the YAAPT v4.0 parameter "frame_lengtht" to "tda_frame_length"
    # (if provided).
    if 'frame_lengtht' in kwargs:
        if 'tda_frame_length' in kwargs:
            warning_str = 'WARNING: Both "tda_frame_length" and "frame_lengtht" '
            warning_str += 'refer to the same parameter. Therefore, the value '
            warning_str += 'of "frame_lengtht" is going to be discarded.'
            print(warning_str)
        else:
            kwargs['tda_frame_length'] = kwargs.pop('frame_lengtht')

    #---------------------------------------------------------------
    # Set the default values for the parameters.
    #---------------------------------------------------------------
    parameters = {}
    parameters['frame_length'] = kwargs.get('frame_length', 35.0)   #Length of each analysis frame (ms)
    # WARNING: In the original MATLAB YAAPT 4.0 code the next parameter is called
    # "frame_lengtht" which is quite similar to the previous one "frame_length".
    # Therefore, I've decided to rename it to "tda_frame_length" in order to
    # avoid confusion between them. Nevertheless, both inputs ("frame_lengtht"
    # and "tda_frame_length") are accepted when the function is called.
    parameters['tda_frame_length'] = \
                              kwargs.get('tda_frame_length', 35.0)  #Frame length employed in the time domain analysis (ms)
    parameters['frame_space'] = kwargs.get('frame_space', 10.0)     #Spacing between analysis frames (ms)
    parameters['f0_min'] = kwargs.get('f0_min', 60.0)               #Minimum F0 searched (Hz)
    parameters['f0_max'] = kwargs.get('f0_max', 400.0)              #Maximum F0 searched (Hz)
    parameters['fft_length'] = kwargs.get('fft_length', 8192)       #FFT length
    parameters['bp_forder'] = kwargs.get('bp_forder', 150)          #Order of band-pass filter
    parameters['bp_low'] = kwargs.get('bp_low', 50.0)               #Low frequency of filter passband (Hz)
    parameters['bp_high'] = kwargs.get('bp_high', 1500.0)           #High frequency of filter passband (Hz)
    parameters['nlfer_thresh1'] = kwargs.get('nlfer_thresh1', 0.75) #NLFER boundary for voiced/unvoiced decisions
    parameters['nlfer_thresh2'] = kwargs.get('nlfer_thresh2', 0.1)  #Threshold for NLFER definitely unvoiced
    parameters['shc_numharms'] = kwargs.get('shc_numharms', 3)      #Number of harmonics in SHC calculation
    parameters['shc_window'] = kwargs.get('shc_window', 40.0)       #SHC window length (Hz)
    parameters['shc_maxpeaks'] = kwargs.get('shc_maxpeaks', 4)      #Maximum number of SHC peaks to be found
    parameters['shc_pwidth'] = kwargs.get('shc_pwidth', 50.0)       #Window width in SHC peak picking (Hz)
    parameters['shc_thresh1'] = kwargs.get('shc_thresh1', 5.0)      #Threshold 1 for SHC peak picking
    parameters['shc_thresh2'] = kwargs.get('shc_thresh2', 1.25)     #Threshold 2 for SHC peak picking
    parameters['f0_double'] = kwargs.get('f0_double', 150.0)        #F0 doubling decision threshold (Hz)
    parameters['f0_half'] = kwargs.get('f0_half', 150.0)            #F0 halving decision threshold (Hz)
    parameters['dp5_k1'] = kwargs.get('dp5_k1', 11.0)               #Weight used in dynamic program
    parameters['dec_factor'] = kwargs.get('dec_factor', 1)          #Factor for signal resampling
    parameters['nccf_thresh1'] = kwargs.get('nccf_thresh1', 0.3)    #Threshold for considering a peak in NCCF
    parameters['nccf_thresh2'] = kwargs.get('nccf_thresh2', 0.9)    #Threshold for terminating serach in NCCF
    parameters['nccf_maxcands'] = kwargs.get('nccf_maxcands', 3)    #Maximum number of candidates found
    parameters['nccf_pwidth'] = kwargs.get('nccf_pwidth', 5)        #Window width in NCCF peak picking
    parameters['merit_boost'] = kwargs.get('merit_boost', 0.20)     #Boost merit
    parameters['merit_pivot'] = kwargs.get('merit_pivot', 0.99)     #Merit assigned to unvoiced candidates in
                                                                    #defintely unvoiced frames
    parameters['merit_extra'] = kwargs.get('merit_extra', 0.4)      #Merit assigned to extra candidates
                                                                    #in reducing F0 doubling/halving errors
    parameters['median_value'] = kwargs.get('median_value', 7)      #Order of medial filter
    parameters['dp_w1'] = kwargs.get('dp_w1', 0.15)                 #DP weight factor for V-V transitions
    parameters['dp_w2'] = kwargs.get('dp_w2', 0.5)                  #DP weight factor for V-UV or UV-V transitions
    parameters['dp_w3'] = kwargs.get('dp_w3', 0.1)                  #DP weight factor of UV-UV transitions
    parameters['dp_w4'] = kwargs.get('dp_w4', 0.9)                  #Weight factor for local costs

    #---------------------------------------------------------------
    # Create the signal objects and filter them.
    #---------------------------------------------------------------
    fir_filter = BandpassFilter(signal.fs, parameters)
    nonlinear_sign = basic.SignalObj(signal.data**2, signal.fs)

    signal.filtered_version(fir_filter)
    nonlinear_sign.filtered_version(fir_filter)

    #---------------------------------------------------------------
    # Create the pitch object.
    #---------------------------------------------------------------
    nfft = parameters['fft_length']
    frame_size = int(np.fix(parameters['frame_length']*signal.fs/1000))
    frame_jump = int(np.fix(parameters['frame_space']*signal.fs/1000))
    pitch = PitchObj(frame_size, frame_jump, nfft)

    assert pitch.frame_size > 15, 'Frame length value {} is too short.'.format(pitch.frame_size)
    assert pitch.frame_size < 2048, 'Frame length value {} exceeds the limit.'.format(pitch.frame_size)


    #---------------------------------------------------------------
    # Calculate NLFER and determine voiced/unvoiced frames.
    #---------------------------------------------------------------
    nlfer(signal, pitch, parameters)

    #---------------------------------------------------------------
    # Calculate an approximate pitch track from the spectrum.
    #---------------------------------------------------------------
    spec_pitch, pitch_std = spec_track(nonlinear_sign, pitch, parameters)

    #---------------------------------------------------------------
    # Temporal pitch tracking based on NCCF.
    #---------------------------------------------------------------
    time_pitch1, time_merit1 = time_track(signal, spec_pitch, pitch_std, pitch,
                                          parameters)

    time_pitch2, time_merit2 = time_track(nonlinear_sign, spec_pitch, pitch_std,
                                          pitch, parameters)

    # Added in YAAPT 4.0
    if time_pitch1.shape[1] < len(spec_pitch):
        len_time = time_pitch1.shape[1]
        len_spec = len(spec_pitch)
        time_pitch1 = np.concatenate((time_pitch1, np.zeros((3,len_spec-len_time),
                                      dtype=time_pitch1.dtype)),axis=1)
        time_pitch2 = np.concatenate((time_pitch2, np.zeros((3,len_spec-len_time),
                                      dtype=time_pitch2.dtype)),axis=1)
        time_merit1 = np.concatenate((time_merit1, np.zeros((3,len_spec-len_time),
                                      dtype=time_merit1.dtype)),axis=1)
        time_merit2 = np.concatenate((time_merit2, np.zeros((3,len_spec-len_time),
                                      dtype=time_merit2.dtype)),axis=1)

    #---------------------------------------------------------------
    # Refine pitch candidates.
    #---------------------------------------------------------------
    ref_pitch, ref_merit = refine(time_pitch1, time_merit1, time_pitch2,
                                  time_merit2, spec_pitch, pitch, parameters)

    #---------------------------------------------------------------
    # Use dyanamic programming to determine the final pitch.
    #---------------------------------------------------------------
    final_pitch = dynamic(ref_pitch, ref_merit, pitch, parameters)

    pitch.set_values(final_pitch, signal.size)

    return pitch


"""
--------------------------------------------
                Side functions.
--------------------------------------------
"""

"""
Normalized Low Frequency Energy Ratio function. Corresponds to the nlfer.m file,
but instead of returning the results to them function, encapsulates them in the
pitch object.
"""
def nlfer(signal, pitch, parameters):

    #---------------------------------------------------------------
    # Set parameters.
    #---------------------------------------------------------------
    N_f0_min = np.around((parameters['f0_min']*2/float(signal.new_fs))*pitch.nfft)
    N_f0_max = np.around((parameters['f0_max']/float(signal.new_fs))*pitch.nfft)

    window = hann(pitch.frame_size+2)[1:-1]
    data = np.zeros((signal.size))  #Needs other array, otherwise stride and
    data[:] = signal.filtered     #windowing will modify signal.filtered

    #---------------------------------------------------------------
    # Main routine.
    #---------------------------------------------------------------
    samples = np.arange(int(np.fix(float(pitch.frame_size)/2)),
                        signal.size-int(np.fix(float(pitch.frame_size)/2)),
                        pitch.frame_jump)

    data_matrix = np.empty((len(samples), pitch.frame_size))
    data_matrix[:, :] = stride_matrix(data, len(samples),
                                    pitch.frame_size, pitch.frame_jump)
    data_matrix *= window

    specData = np.fft.rfft(data_matrix, pitch.nfft)

    frame_energy = np.abs(specData[:, int(N_f0_min-1):int(N_f0_max)]).sum(axis=1)
    pitch.set_energy(frame_energy, parameters['nlfer_thresh1'])
    pitch.set_frames_pos(samples)

"""
Spectral pitch tracking. Computes estimates of pitch using nonlinearly processed
speech (typically square or absolute value) and frequency domain processing.
Search for frequencies which have energy at multiplies of that frequency.
Corresponds to the spec_trk.m file.
"""
def spec_track(signal, pitch, parameters):

    #---------------------------------------------------------------
    # Set parameters.
    #---------------------------------------------------------------
    nframe_size = pitch.frame_size*2
    maxpeaks = parameters['shc_maxpeaks']
    delta = signal.new_fs/pitch.nfft

    window_length = int(np.fix(parameters['shc_window']/delta))
    half_window_length = int(np.fix(float(window_length)/2))
    if not(window_length % 2):
        window_length += 1

    max_SHC = int(np.fix((parameters['f0_max']+parameters['shc_pwidth']*2)/delta))
    min_SHC = int(np.ceil(parameters['f0_min']/delta))
    num_harmonics = parameters['shc_numharms']

    #---------------------------------------------------------------
    # Main routine.
    #---------------------------------------------------------------
    cand_pitch = np.zeros((maxpeaks, pitch.nframes))
    cand_merit = np.ones((maxpeaks, pitch.nframes))

    data = np.append(signal.filtered,
                  np.zeros((1, nframe_size +
                         ((pitch.nframes-1)*pitch.frame_jump-signal.size))))

    #Compute SHC for voiced frame
    window = kaiser(nframe_size, 0.5)
    SHC = np.zeros((max_SHC))
    row_mat_list = np.array([np.empty((max_SHC-min_SHC+1, window_length))
                            for x in range(num_harmonics+1)])

    magnitude = np.zeros(int((half_window_length+(pitch.nfft/2)+1)))

    for frame in np.where(pitch.vuv)[0].tolist():
        fir_step = frame*pitch.frame_jump

        data_slice = data[fir_step:fir_step+nframe_size]*window
        data_slice -= np.mean(data_slice)

        magnitude[half_window_length:] = np.abs(np.fft.rfft(data_slice,
                                                pitch.nfft))

        for idx,row_mat in enumerate(row_mat_list):
            row_mat[:, :] = stride_matrix(magnitude[min_SHC*(idx+1):],
                                          max_SHC-min_SHC+1,
                                          window_length, idx+1)
        SHC[min_SHC-1:max_SHC] = np.sum(np.prod(row_mat_list,axis=0),axis=1)

        cand_pitch[:, frame], cand_merit[:, frame] = \
            peaks(SHC, delta, maxpeaks, parameters)

    #Extract the pitch candidates of voiced frames for the future pitch selection.
    spec_pitch = cand_pitch[0, :]
    voiced_cand_pitch = cand_pitch[:, cand_pitch[0, :] > 0]
    voiced_cand_merit = cand_merit[:, cand_pitch[0, :] > 0]
    num_voiced_cand = len(voiced_cand_pitch[0, :])
    avg_voiced = np.mean(voiced_cand_pitch[0, :])
    std_voiced = np.std(voiced_cand_pitch[0, :])

    #Interpolation of the weigthed candidates.
    delta1 = abs((voiced_cand_pitch - 0.8*avg_voiced))*(3-voiced_cand_merit)
    index = delta1.argmin(0)

    voiced_peak_minmrt = voiced_cand_pitch[index, range(num_voiced_cand)]
    voiced_merit_minmrt = voiced_cand_merit[index, range(num_voiced_cand)]

    voiced_peak_minmrt = medfilt(voiced_peak_minmrt,
                                 max(1, parameters['median_value']-2))

    #Replace the lowest merit candidates by the median smoothed ones
    #computed from highest merit peaks above.
    voiced_cand_pitch[index, range(num_voiced_cand)] = voiced_peak_minmrt
    voiced_cand_merit[index, range(num_voiced_cand)] = voiced_merit_minmrt

    #Use dynamic programming to find best overal path among pitch candidates.
    #Dynamic weight for transition costs balance between local and
    #transition costs.
    weight_trans = parameters['dp5_k1']*std_voiced/avg_voiced

    if num_voiced_cand > 2:
        voiced_pitch = dynamic5(voiced_cand_pitch, voiced_cand_merit,
                                weight_trans, parameters['f0_min'])
        voiced_pitch = medfilt(voiced_pitch, max(1, parameters['median_value']-2))

    else:
        if num_voiced_cand > 0:
            voiced_pitch = (np.ones((num_voiced_cand)))*150.0
        else:
            voiced_pitch = np.array([150.0])
            cand_pitch[0, 0] = 0

    pitch_avg = np.mean(voiced_pitch)
    pitch_std = np.std(voiced_pitch)
    spec_pitch[cand_pitch[0, :] > 0] = voiced_pitch[:]

    if (spec_pitch[0] < pitch_avg/2):
        spec_pitch[0] = pitch_avg

    if (spec_pitch[-1] < pitch_avg/2):
        spec_pitch[-1] = pitch_avg

    spec_voiced = np.array(np.nonzero(spec_pitch)[0])
    spec_pitch = scipy_interp.pchip(spec_voiced,
                                    spec_pitch[spec_voiced])(range(pitch.nframes))

    spec_pitch = lfilter(np.ones((3))/3, 1.0, spec_pitch)

    spec_pitch[0] = spec_pitch[2]
    spec_pitch[1] = spec_pitch[3]

    return spec_pitch, pitch_std

"""
Temporal pitch tracking.
Corresponds to the tm_trk.m file.
"""
def time_track(signal, spec_pitch, pitch_std, pitch, parameters):

    #---------------------------------------------------------------
    # Set parameters.
    #---------------------------------------------------------------
    tda_frame_length = int(parameters['tda_frame_length']*signal.fs/1000)
    tda_noverlap = tda_frame_length-pitch.frame_jump
    tda_nframes = int((len(signal.data)-tda_noverlap)/pitch.frame_jump)

    len_spectral = len(spec_pitch)
    if tda_nframes < len_spectral:
        spec_pitch = spec_pitch[:tda_nframes]
    elif tda_nframes > len_spectral:
        tda_nframes = len_spectral

    merit_boost = parameters['merit_boost']
    maxcands = parameters['nccf_maxcands']
    freq_thresh = 5.0*pitch_std

    spec_range = np.maximum(spec_pitch-2.0*pitch_std, parameters['f0_min'])
    spec_range = np.vstack((spec_range,
                         np.minimum(spec_pitch+2.0*pitch_std, parameters['f0_max'])))

    time_pitch = np.zeros((maxcands, tda_nframes))
    time_merit = np.zeros((maxcands, tda_nframes))

    #---------------------------------------------------------------
    # Main routine.
    #---------------------------------------------------------------
    data = np.zeros((signal.size))  #Needs other array, otherwise stride and
    data[:] = signal.filtered       #windowing will modify signal.filtered
    signal_frames = stride_matrix(data, tda_nframes,tda_frame_length,
                                  pitch.frame_jump)
    for frame in range(tda_nframes):
        lag_min0 = (int(np.fix(signal.new_fs/spec_range[1, frame])) -
                                    int(np.fix(parameters['nccf_pwidth']/2.0)))
        lag_max0 = (int(np.fix(signal.new_fs/spec_range[0, frame])) +
                                    int(np.fix(parameters['nccf_pwidth']/2.0)))

        phi = crs_corr(signal_frames[frame, :], lag_min0, lag_max0)
        time_pitch[:, frame], time_merit[:, frame] = \
            cmp_rate(phi, signal.new_fs, maxcands, lag_min0, lag_max0, parameters)

    diff = np.abs(time_pitch - spec_pitch)
    match1 = (diff < freq_thresh)
    match = ((1 - diff/freq_thresh) * match1)
    time_merit = (((1+merit_boost)*time_merit) * match)

    return time_pitch, time_merit

"""
Refines pitch candidates obtained from NCCF using spectral pitch track and
NLFER energy information.
Corresponds to the refine.m file.
"""
def refine(time_pitch1, time_merit1, time_pitch2, time_merit2, spec_pitch,
           pitch, parameters):

    #---------------------------------------------------------------
    # Set parameters.
    #---------------------------------------------------------------
    nlfer_thresh2 = parameters['nlfer_thresh2']
    merit_pivot = parameters['merit_pivot']

    #---------------------------------------------------------------
    # Main routine.
    #---------------------------------------------------------------
    time_pitch = np.append(time_pitch1, time_pitch2, 0)
    time_merit = np.append(time_merit1, time_merit2, 0)
    maxcands = time_pitch.shape[0]

    idx = np.argsort(-time_merit, axis=0)
    time_merit.sort(axis=0)
    time_merit[:, :] = time_merit[::-1,:]

    time_pitch = time_pitch[idx, range(pitch.nframes)]

    best_pitch = medfilt(time_pitch[0, :], parameters['median_value'])*pitch.vuv

    idx1 = pitch.energy <= nlfer_thresh2
    idx2 = (pitch.energy > nlfer_thresh2) & (time_pitch[0, :] > 0)
    idx3 = (pitch.energy > nlfer_thresh2) & (time_pitch[0, :] <= 0)
    merit_mat = (time_pitch[1:maxcands-1, :] == 0) & idx2
    merit_mat = np.insert(merit_mat, [0, maxcands-2],
                          np.zeros((1, pitch.nframes), dtype=bool), 0)

    time_pitch[:, idx1] = 0
    time_merit[:, idx1] = merit_pivot

    time_pitch[maxcands-1, idx2] = 0.0
    time_merit[maxcands-1, idx2] = 1.0-time_merit[0, idx2]
    time_merit[merit_mat] = 0.0

    time_pitch[0, idx3] = spec_pitch[idx3]
    time_merit[0, idx3] = np.minimum(1, pitch.energy[idx3]/2.0)
    time_pitch[1:maxcands, idx3] = 0.0
    time_merit[1:maxcands, idx3] = 1.0-time_merit[0, idx3]

    time_pitch[maxcands-2, :] = best_pitch
    non_zero_frames = best_pitch > 0.0
    time_merit[maxcands-2, non_zero_frames] = time_merit[0, non_zero_frames]
    time_merit[maxcands-2, ~(non_zero_frames)] = 1.0-np.minimum(1,
                                       pitch.energy[~(non_zero_frames)]/2.0)

    time_pitch[maxcands-3, :] = spec_pitch
    time_merit[maxcands-3, :] = pitch.energy/5.0

    return time_pitch, time_merit


"""
Dynamic programming used to compute local and transition cost matrices,
enabling the lowest cost tracking of pitch candidates.
It uses NFLER from the spectrogram and the highly robust spectral F0 track,
plus the merits, for computation of the cost matrices.
Corresponds to the dynamic.m file.
"""
def dynamic(ref_pitch, ref_merit, pitch, parameters):

    #---------------------------------------------------------------
    # Set parameters.
    #---------------------------------------------------------------
    num_cands = ref_pitch.shape[0]
    best_pitch = ref_pitch[num_cands-2, :]
    mean_pitch = np.mean(best_pitch[best_pitch > 0])

    dp_w1 = parameters['dp_w1']
    dp_w2 = parameters['dp_w2']
    dp_w3 = parameters['dp_w3']
    dp_w4 = parameters['dp_w4']

    #---------------------------------------------------------------
    # Main routine.
    #---------------------------------------------------------------
    local_cost = 1 - ref_merit
    trans_cmatrix = np.ones((num_cands, num_cands, pitch.nframes))

    ref_mat1 = np.zeros((num_cands, num_cands, pitch.nframes))
    ref_mat2 = np.zeros((num_cands, num_cands, pitch.nframes))
    idx_mat1 = np.zeros((num_cands, num_cands, pitch.nframes), dtype=bool)
    idx_mat2 = np.zeros((num_cands, num_cands, pitch.nframes), dtype=bool)
    idx_mat3 = np.zeros((num_cands, num_cands, pitch.nframes), dtype=bool)

    ref_mat1[:, :, 1:] = np.tile(ref_pitch[:, 1:].reshape(1, num_cands,
                        pitch.nframes-1), (num_cands, 1, 1))
    ref_mat2[:, :, 1:] = np.tile(ref_pitch[:, :-1].reshape(num_cands, 1,
                        pitch.nframes-1), (1, num_cands, 1))

    idx_mat1[:, :, 1:] = (ref_mat1[:, :, 1:] > 0) & (ref_mat2[:, :, 1:] > 0)
    idx_mat2[:, :, 1:] = (((ref_mat1[:, :, 1:] == 0) & (ref_mat2[:, :, 1:] > 0)) |
                       ((ref_mat1[:, :, 1:] > 0) & (ref_mat2[:, :, 1:] == 0)))
    idx_mat3[:, :, 1:] = (ref_mat1[:, :, 1:] == 0) & (ref_mat2[:, :, 1:] == 0)

    mat1_values = np.abs(ref_mat1-ref_mat2)/mean_pitch
    benefit2 = np.insert(np.minimum(1, abs(pitch.energy[:-1]-pitch.energy[1:])),
                         0, 0)
    benefit2 = np.tile(benefit2, (num_cands, num_cands, 1))

    trans_cmatrix[idx_mat1] = dp_w1*mat1_values[idx_mat1]
    trans_cmatrix[idx_mat2] = dp_w2*(1-benefit2[idx_mat2])
    trans_cmatrix[idx_mat3] = dp_w3

    trans_cmatrix = trans_cmatrix/dp_w4
    path = path1(local_cost, trans_cmatrix, num_cands, pitch.nframes)
    final_pitch = ref_pitch[path, range(pitch.nframes)]

    return final_pitch

"""
--------------------------------------------
                Auxiliary functions.
--------------------------------------------
"""

"""
Computes peaks in a frequency domain function associated with the peaks found
in each frame based on the correlation sequence.
Corresponds to the peaks.m file.
"""
def peaks(data, delta, maxpeaks, parameters):

    #---------------------------------------------------------------
    # Set parameters.
    #---------------------------------------------------------------
    PEAK_THRESH1 = parameters['shc_thresh1']
    PEAK_THRESH2 = parameters['shc_thresh2']

    epsilon = .00000000000001

    width = int(np.fix(parameters['shc_pwidth']/delta))
    if not(float(width) % 2):
        width = width + 1

    center = int(np.ceil(width/2))

    min_lag = int(np.fix(parameters['f0_min']/delta - center))
    max_lag = int(np.fix(parameters['f0_max']/delta + center))

    if (min_lag < 1):
        min_lag = 1
        print('Min_lag is too low and adjusted ({}).'.format(min_lag))

    if max_lag > (len(data) - width):
        max_lag = len(data) - width
        print('Max_lag is too high and adjusted ({}).'.format(max_lag))

    pitch = np.zeros((maxpeaks))
    merit = np.zeros((maxpeaks))

    #---------------------------------------------------------------
    # Main routine.
    #---------------------------------------------------------------
    max_data = max(data[min_lag:max_lag+1])

    if (max_data > epsilon):
        data = data/max_data

    avg_data = np.mean(data[min_lag:max_lag+1])

    if (avg_data > 1/PEAK_THRESH1):
        pitch = np.zeros((maxpeaks))
        merit = np.ones((maxpeaks))
        return pitch, merit

    #---------------------------------------------------------------
    #Step1 (this step was implemented differently than in original version)
    #---------------------------------------------------------------
    numpeaks = 0
    vec_back = (data[min_lag+center+1:max_lag-center+1] >
                                            data[min_lag+center:max_lag-center])
    vec_forw = (data[min_lag+center+1:max_lag-center+1] >
                                        data[min_lag+center+2:max_lag-center+2])
    above_thresh = (data[min_lag+center+1:max_lag-center+1] >
                                        PEAK_THRESH2*avg_data)
    peaks = np.logical_and(np.logical_and(vec_back, vec_forw), above_thresh)

    for n in (peaks.ravel().nonzero()[0]+min_lag+center+1).tolist():
        if np.argmax(data[n-center:n+center+1]) == center:
            if numpeaks >= maxpeaks:
                pitch = np.append(pitch, np.zeros((1)))
                merit = np.append(merit, np.zeros((1)))

            pitch[numpeaks] = float(n)*delta
            merit[numpeaks] = data[n]
            numpeaks += 1

    #---------------------------------------------------------------
    #Step2
    #---------------------------------------------------------------
    if (max(merit)/avg_data < PEAK_THRESH1):
        pitch = np.zeros((maxpeaks))
        merit = np.ones((maxpeaks))
        return pitch, merit

    #---------------------------------------------------------------
    #Step3
    #---------------------------------------------------------------
    idx = (-merit).ravel().argsort().tolist()
    merit = merit[idx]
    pitch = pitch[idx]

    numpeaks = min(numpeaks, maxpeaks)
    pitch = np.append(pitch[:numpeaks], np.zeros((maxpeaks-numpeaks)))
    merit = np.append(merit[:numpeaks], np.zeros((maxpeaks-numpeaks)))

    #---------------------------------------------------------------
    #Step4
    #---------------------------------------------------------------

    if (numpeaks > 0):
        # The first two "if pitch[0]" statements seem to had been deprecated in
        # the original YAAPT Matlab code, so they may be removed here as well.
        if (pitch[0] > parameters['f0_double']):
            numpeaks = min(numpeaks+1, maxpeaks)
            pitch[numpeaks-1] = pitch[0]/2.0
            merit[numpeaks-1] = parameters['merit_extra']

        if (pitch[0] < parameters['f0_half']):
            numpeaks = min(numpeaks+1, maxpeaks)
            pitch[numpeaks-1] = pitch[0]*2.0
            merit[numpeaks-1] = parameters['merit_extra']

        if (numpeaks < maxpeaks):
            pitch[numpeaks:maxpeaks] = pitch[0]
            merit[numpeaks:maxpeaks] = merit[0]

    else:
        pitch = np.zeros((maxpeaks))
        merit = np.ones((maxpeaks))

    return np.transpose(pitch), np.transpose(merit)

"""
Dynamic programming used to compute local and transition cost matrices,
enabling the lowest cost tracking of pitch candidates.
It uses NFLER from the spectrogram and the highly robust spectral F0 track,
plus the merits, for computation of the cost matrices.
Corresponds to the dynamic5.m file.
"""
def dynamic5(pitch_array, merit_array, k1, f0_min):

    num_cand = pitch_array.shape[0]
    num_frames = pitch_array.shape[1]

    local = 1-merit_array
    trans = np.zeros((num_cand, num_cand, num_frames))

    trans[:, :, 1:] = abs(pitch_array[:, 1:].reshape(1, num_cand, num_frames-1) -
                    pitch_array[:, :-1].reshape(num_cand, 1, num_frames-1))/f0_min
    trans[:, :, 1:] = 0.05*trans[:, :, 1:] + trans[:, :, 1:]**2

    trans = k1*trans
    path = path1(local, trans, num_cand, num_frames)

    final_pitch = pitch_array[path, range(num_frames)]

    return final_pitch

"""
Finds the optimal path with the lowest cost if two matrice(Local cost matrix
and Transition cost) are given.
Corresponds to the path1.m file.
"""
def path1(local, trans, n_lin, n_col):

# Apparently the following lines are somehow kind of useless.
# Therefore, I removed them in the version 1.0.3.

#    if n_lin >= 100:
#        print 'Stop in Dynamic due to M>100'
#        raise KeyboardInterrupt
#
#    if n_col >= 1000:
#        print 'Stop in Dynamic due to N>1000'
#        raise KeyboardInterrupt

    PRED = np.zeros((n_lin, n_col), dtype=int)
    P = np.ones((n_col), dtype=int)
    p_small = np.zeros((n_col), dtype=int)

    PCOST = np.zeros((n_lin))
    CCOST = np.zeros((n_lin))
    PCOST = local[:, 0]

    for I in range(1, n_col):

        aux_matrix = PCOST+np.transpose(trans[:, :, I])
        K = n_lin-np.argmin(aux_matrix[:, ::-1], axis=1)-1
        PRED[:, I] = K
        CCOST = PCOST[K]+trans[K, range(n_lin), I]

        assert CCOST.any() < 1.0E+30, 'CCOST>1.0E+30, Stop in Dynamic'
        CCOST = CCOST+local[:, I]

        PCOST[:] = CCOST
        J = n_lin - np.argmin(CCOST[::-1])-1
        p_small[I] = J

    P[-1] = p_small[-1]

    for I in range(n_col-2, -1, -1):
        P[I] = PRED[P[I+1], I+1]

    return P

"""
Computes the NCCF (Normalized cross correlation Function) sequence based on
the RAPT algorithm discussed by DAVID TALKIN.
Corresponds to the crs_corr.m file.
"""
def crs_corr(data, lag_min, lag_max):

    eps1 = 0.0
    data_len = len(data)
    N = data_len-lag_max

    error_str = 'ERROR: Negative index in the cross correlation calculation of '
    error_str += 'the pYAAPT time domain analysis. Please try to increase the '
    error_str += 'value of the "tda_frame_length" parameter.'
    assert N>0, error_str

    phi = np.zeros((data_len))
    data -= np.mean(data)
    x_j = data[0:N]
    x_jr = data[lag_min:lag_max+N]
    p = np.dot(x_j, x_j)

    x_jr_matrix = stride_matrix(x_jr, lag_max-lag_min, N, 1)

    formula_nume = np.dot(x_jr_matrix, x_j)
    formula_denom = np.sum(x_jr_matrix*x_jr_matrix, axis=1)*p + eps1

    phi[lag_min:lag_max] = formula_nume/np.sqrt(formula_denom)

    return phi

"""
Computes pitch estimates and the corresponding merit values associated with the
peaks found in each frame based on the correlation sequence.
Corresponds to the cmp_rate.m file.
"""
def cmp_rate(phi, fs, maxcands, lag_min, lag_max, parameters):

    #---------------------------------------------------------------
    # Set parameters.
    #---------------------------------------------------------------
    width = parameters['nccf_pwidth']
    center = int(np.fix(width/2.0))
    merit_thresh1 = parameters['nccf_thresh1']
    merit_thresh2 = parameters['nccf_thresh2']

    numpeaks = 0
    pitch = np.zeros((maxcands))
    merit = np.zeros((maxcands))

    #---------------------------------------------------------------
    # Main routine.
    #(this step was implemented differently than in original version)
    #---------------------------------------------------------------
    vec_back = (phi[lag_min+center:lag_max-center+1] >
                                            phi[lag_min+center-1:lag_max-center])
    vec_forw = (phi[lag_min+center:lag_max-center+1] >
                                        phi[lag_min+center+1:lag_max-center+2])
    above_thresh = phi[lag_min+center:lag_max-center+1] > merit_thresh1
    peaks = np.logical_and(np.logical_and(vec_back, vec_forw), above_thresh)

    peaks = (peaks.ravel().nonzero()[0]+lag_min+center).tolist()

    if np.amax(phi) > merit_thresh2 and len(peaks) > 0:
        max_point = peaks[np.argmax(phi[peaks])]
        pitch[numpeaks] = fs/float(max_point+1)
        merit[numpeaks] = np.amax(phi[peaks])
        numpeaks += 1
    else:
        for n in peaks:
            if np.argmax(phi[n-center:n+center+1]) == center:
                try:
                    pitch[numpeaks] = fs/float(n+1)
                    merit[numpeaks] = phi[n]
                except:
                    pitch = np.hstack((pitch, fs/float(n+1)))
                    merit = np.hstack((merit, phi[n]))
                numpeaks += 1

    #---------------------------------------------------------------
    # Sort the results.
    #---------------------------------------------------------------
    idx = (-merit).ravel().argsort().tolist()
    merit = merit[idx[:maxcands]]
    pitch = pitch[idx[:maxcands]]

    if (np.amax(merit) > 1.0):
        merit = merit/np.amax(merit)

    return pitch, merit

"""
--------------------------------------------
                Extra functions.
--------------------------------------------
"""

def stride_matrix(vector, n_lin, n_col, hop):

    data_matrix = stride_tricks.as_strided(vector, shape=(n_lin, n_col),
                        strides=(vector.strides[0]*hop, vector.strides[0]))

    return data_matrix