python source code of pyQHM

# -*- coding: utf-8 -*-
"""
This package provides the tools necessary to decompose the voiced part of a
speech signal into its modulated components, aka AM-FM decomposition. This
designation is used due the fact that, in this method, the signal is modeled as
a sum of amplitude- and frequency-modulated components. The goal is to overcome
the drawbacks from Fourier-alike techniques, e.g. SFFT, wavelets, etc, which
are limited in the time-frequency analysis by the so-called Heisenberg-Gabor
inequality.

The algorithms here implemented were the QHM (Quasi-Harmonic Model), and its
upgrades, aQHM (adaptive Quasi-Harmonic Model) and eaQHM (extended adaptive
Quasi-Harmonic Model). Their formulation can be found at references [1-3].

USAGE:
    Please refer to the documentation for examples.

References:
    [1] Y. Pantazis, “Decomposition of AM-FM signals with applications in
        speech processing”, PhD Thesis, University of Creta, 2010.

    [2] Y. Pantazis, O. Rosec and Y. Stylianou, “Adaptive AM-FM signal
        decomposition with application to speech analysis”, IEEE Transactions
        on Audio, Speech and Language Processing, vol. 19, n 2, 2011.

    [3] G. P. Kafentzis, Y. Pantazis, O. Rosec and Y. Stylianou, “An extension
        of the adaptive quasi-harmonic model”, em IEEE International Conference
        on Acoustics, Speech and Signal Processing (ICASSP), 2012.

Version 1.0.9.1
11/Mar/2020 Bernardo J.B. Schmitt - bernardo.jb.schmitt@gmail.com
"""

import numpy as np
import scipy

"""
--------------------------------------------
                Classes.
--------------------------------------------
"""

"""
Creates a single component object.
"""

class ComponentObj(object):

    def __init__(self, H, harm):
        self.mag = H[harm, 0, :]
        self.phase = H[harm, 1, :]
        self.freq = H[harm, 2, :]

    """
    Synthsize the modulated component by using the extracted magnitude and
    phase.
    """

    def synthesize(self):
        self.signal = 2*self.mag*np.cos(self.phase)


"""
Creates the output signal object (which, in its turn, is formed by n_harm
modulated components).
"""

class ModulatedSign(object):

    def __init__(self, n_harm, file_size, fs, phase_tech='phase'):
        self.n_harm = n_harm
        self.size = file_size
        self.fs = fs
        self.H = np.zeros((self.n_harm, 3, self.size))
        self.harmonics = [ComponentObj(self.H, i) for i in range(self.n_harm)]
        self.error = np.zeros(self.size)
        self.phase_tech = phase_tech

    """
    Updates the 3-dimension array H, which stores the magnitude, phase and
    frequency values from all components. Its first dimension refers to the
    n_harm components, the second to the three composing parameters (where 0
    stands for the magnitude, 1 for the phase and 2 for the frequency) and the
    third dimension to the temporal axis.
    """

    def update_values(self, a, freq, frame):
        self.H[:, 0, frame] = np.abs(a)
        self.H[:, 1, frame] = np.angle(a)
        self.H[:, 2, frame] = freq

    """
    Interpolate the parameters values when the extraction is not performed
    sample-by-sample. While the interpolation from magnitude and frequency
    is pretty straightforward, the phase one is not. Therefore, references
    [1,2] present a solution for this problem.
    """

    def interpolate_samp(self, samp_frames, pitch_track):

        # Interpolation from magnitude and frequency.

        for idx, func in [(0, 'linear'), (2, 'cubic')]:
            f = scipy.interpolate.interp1d(samp_frames,
                                       self.H[:, idx, samp_frames], kind=func)
            self.H[:, idx, np.nonzero(pitch_track)[0]] = f(
                                                    np.nonzero(pitch_track)[0])

        # Interpolation from phase.

        step = samp_frames[1]-samp_frames[0]
        sin_f = np.cumsum(np.sin(np.pi*np.arange(1, step)/step)).reshape(
                                                                    1, step-1)
        for idx, frame in np.ndenumerate(samp_frames[1:]):
            if frame-samp_frames[idx] <= step:
                cum_phase = np.cumsum(self.H[:, 2, samp_frames[idx]+1:frame+1],
                                    axis=1)*2*np.pi
                bad_phase = cum_phase[:, -1]+self.H[:, 1, samp_frames[idx]]
                M = np.around(np.abs(self.H[:, 1, frame]-bad_phase)/(2*np.pi))
                if frame-samp_frames[idx] < step:
                    end_step = frame-samp_frames[idx]
                    func = np.cumsum(np.sin(np.pi*np.arange(1, end_step) /
                                            end_step)).reshape(1, end_step-1)
                else:
                    func = sin_f

                r_vec = (np.pi*(self.H[:, 1, frame]+2*np.pi*M-bad_phase) /
                        (2*(frame-samp_frames[idx]))).reshape(self.n_harm, 1)

                new_phase = cum_phase[:, :-1]+r_vec*func + \
                        self.H[:, 1, samp_frames[idx]].reshape(self.n_harm, 1)
                self.H[:, 1, samp_frames[idx]+1:frame] = ((new_phase + np.pi) %
                                                            (2*np.pi)-np.pi)

    """
    Synthesize the final signal by initially creating each modulated component
    and then summing all of them.
    """

    def synthesize(self, N=None):
        if N is None:
            N = self.n_harm
        [self.harmonics[i].synthesize()
                            for i in range(N)]
        self.signal = sum([self.harmonics[i].signal
                            for i in range(self.n_harm)])

    """
    Calculates the SRER (Signal-to-Reconstruction Error Ratio) for the
    synthesized signal.
    """

    def srer(self, orig_signal, pitch_track):
        self.SRER = 20*np.log10(np.std(orig_signal[np.nonzero(pitch_track)[0]]) /
                    np.std(orig_signal[np.nonzero(pitch_track)[0]] -
                    self.signal[np.nonzero(pitch_track)[0]]))

    """
    Extrapolates the phase at the border of the voiced frames by integrating
    the edge frequency value. This procedure is necessary for posterior aQHM
    calculations. Additionally, the method allows the replacement of the
    extracted phase by the cumulative frequency. The objective is to provide
    smoother bases for further aQHM and eaQHM calculations. Normally this is
    not necessary, since that the interpolation process already smooths the
    phase vector. But in a sample-by-sample extraction case, this substitution
    is very helpful to avoid the degradation of aQHM and eaQHM performance
    due the phase wild behaviour.
    """

    def phase_edges(self, edges, window):

        # Selects whether the phase itself or the cummulative frequency will be
        # used.
        if self.phase_tech is 'phase':
            self.extrap_phase = np.unwrap(self.H[:, 1, :])

        elif self.phase_tech is 'freq':
            delta_phase = self.H[:, 1, edges[0]+1] - \
                                            self.H[:, 2, edges[0]+1]*2*np.pi
            self.extrap_phase = np.cumsum(self.H[:, 2, :], axis=1)*2*np.pi + \
                                            delta_phase.reshape(self.n_harm, 1)

        # Extrapolate the phase edges.
        n_beg = -window.half_len_vec[::-1][:-1].reshape(1, window.N)
        n_end = window.half_len_vec[1:].reshape(1, window.N)

        for beg, end in zip(edges[::2], edges[1::2]):

            old_phase = self.extrap_phase[:, beg+1].reshape(self.n_harm, 1)
            freq = self.H[:, 2, beg+1].reshape(self.n_harm, 1)
            self.extrap_phase[:, beg-window.N+1:beg+1] = \
                                                2*np.pi*freq*n_beg+old_phase

            old_phase = self.extrap_phase[:, end].reshape(self.n_harm, 1)
            freq = self.H[:, 2, end].reshape(self.n_harm, 1)
            self.extrap_phase[:, end+1:end+window.N+1] = \
                                                2*np.pi*freq*n_end+old_phase


"""
Creates the sample window object.
"""

class SampleWindow(object):

    def __init__(self, window_duration, fs):
        self.dur = window_duration         # in seconds
        self.length = int(self.dur*fs+1)
        if not self.length %2:
            self.length -= 1
        self.data = np.hamming(self.length)
        self.data2 = self.data**2
        self.N = int(self.dur*fs/2)
        self.half_len_vec = np.arange(self.N+1)
        self.len_vec = np.arange(-self.N, self.N+1)

        self.a0 = 0.54**2 + (0.46**2)/2
        self.a1 = 0.54*0.46
        self.a2 = (0.46**2)/4

        self.R0_diag = R_eq(0, g0, self)
        self.R2_diag = sum(self.data2*(self.len_vec**2))


"""
--------------------------------------------
               Main Functions.
--------------------------------------------
"""

"""
Main QHM function.
"""

def qhm(signal, pitch, window, samp_jump=None, N_iter=1, phase_tech='phase'):

    return HM_run(qhm_iteration, signal, pitch, window, samp_jump, N_iter,
                  phase_tech)

"""
Main aQHM function.
"""

def aqhm(signal, previous_HM, pitch, window, samp_jump=None, N_iter=1,
         N_runs=float('Inf'), phase_tech='phase', eaQHM_flag=False):

    count = 1
    outflag = False

    while outflag is False:
        func_options = [previous_HM, eaQHM_flag, 0]
        HM = HM_run(aqhm_iteration, signal, pitch, window, samp_jump, N_iter,
                    phase_tech, func_options)
        if count == 1:
            previous_HM = HM
        elif (count > 1 and HM.SRER > previous_HM.SRER):
            previous_HM = HM
        else:
            outflag = True

        count += 1

        if count > N_runs:
            outflag = True

    return previous_HM

"""
Main eaQHM function (which in fact varies very few from the aQHM).
"""

def eaqhm(signal, previous_HM, pitch, window, samp_jump=None, N_iter=1,
          N_runs=float('Inf'), phase_tech='phase'):

    return aqhm(signal, previous_HM, pitch, window, samp_jump, N_iter, N_runs,
                phase_tech, eaQHM_flag=True)

"""
Parser for the three algorithms.
"""

def HM_run(func, signal, pitch, window, samp_jump=None, N_iter=1,
           phase_tech='phase', func_options=None):

    # Creates the output signal object and the dummy frequency vector.
    HM = ModulatedSign(signal.n_harm, signal.size, signal.fs, phase_tech)
    freq = np.zeros(signal.n_harm)

    # Selects whether the extration will be performed with temporal jumps or
    # not.
    if samp_jump is None:
        voiced_frames = np.nonzero(pitch.values)[0]
    else:
        jump = int(np.fix(max(samp_jump*signal.fs, 1.0)))
        voiced_frames = np.array([], dtype=int)
        for beg, end in zip(pitch.edges[::2], pitch.edges[1::2]):
            voiced_frames = np.append(voiced_frames, np.arange(
                                                        beg+1, end-1, jump))
            voiced_frames = np.append(voiced_frames, end)

    # Run the algorithm in the selected voiced frames.
    for frame in voiced_frames:
        # Uses the pitch value and the harmonic definition f_k = k*f0 to create
        # a frequency reference vector, which is employed to keep each component
        # within a frquency band and thus, avoiding least-squares instability.
        f0_ref = pitch.values[frame]*np.arange(1, signal.n_harm+1)/signal.fs

        # Set some algorithm options.
        if func is qhm_iteration:
            if frame-1 in pitch.edges[::2]:
                freq[:] = f0_ref
            func_options = freq
        elif func is aqhm_iteration:
            func_options[2] = frame

        # Core algorithm function.
        coef, freq, HM.error[frame] = func(
                                signal.data[frame-window.N:frame+window.N+1],
                                f0_ref, window, signal.fs, 20.0, func_options,
                                N_iter)

        # Updates frame parameter values in the 3-dimension storage array H.
        HM.update_values(coef[:signal.n_harm], freq, frame)

    # If the extraction was performed with temporal jumps, interpolate the
    # results.
    if samp_jump is not None:
        HM.interpolate_samp(voiced_frames, pitch.values)
    HM.synthesize()
    HM.srer(signal.data, pitch.values)
    HM.phase_edges(pitch.edges, window)

    return HM

"""
Core QHM function.
"""

def qhm_iteration(data, f0_ref, window, fs, max_step, freq, N_iter=1):

    # Initialize and allocate variables.
    K = len(freq)
    coef = np.zeros((2*K))

    E = np.ones((window.length, 2*K), dtype=complex)
    E = exp_matrix(E, freq, window, K)
    E_windowed = np.ones((window.length, 2*K), dtype=complex)

    windowed_data = (window.data*data).reshape(window.length, 1)

    # Run the QHM algorithm N_iter times.
    for k in range(N_iter):
        # Calculate the a and b coeficients via least-squares.
        coef = least_squares(E, E_windowed, windowed_data, window, K)

        # Set a magnitude reference, which is used to detect and supress
        # erroneous magnitude spikes.
        mag_ref = np.abs(coef[0])

        # Updates the frequency values.
        freq, ro = freq_correction(coef[:K], coef[K:], freq, f0_ref, mag_ref, K,
                                   max_step, fs)

        # Updates the complex exponentials matrix.
        E = exp_matrix(E, freq, window, K)

    # Compute the final coefficients values.
    coef = least_squares(E, E_windowed, windowed_data, window, K)

    # This part is a workaround not present in the original references [1-3].
    # It was created to detect and supress erroneous magnitude spikes, which
    # degradate the final synthsized signal and consequently, its SRER.
    # Alternatively, the magnitude signals could be smoothed after extraction.
    # For more details, check the README file.
    cond = (np.abs(coef[:K]) < 5.5*np.abs(coef[0]))
    if not cond.all():
        freq[~cond] = f0_ref[~cond]

        # Updates the complex exponentials matrix with the modified frequencies.
        E = exp_matrix(E, freq, window, K)

        # Recalculate the final coefficients.
        coef = least_squares(E, E_windowed, windowed_data, window, K)

    # Calculate the mean squared error between the original frame and the
    # synthesized one.
    err = error_calc(windowed_data, E, coef, window)
    return coef, freq, err

"""
Core aQHM and eaQHM function.
"""

def aqhm_iteration(data, f0_ref, window, fs, max_step, func_options,
                   N_iter=1):

    # Initialize and allocate variables.
    previous_HM = func_options[0]
    eaQHM_flag = func_options[1]
    frame = func_options[2]

    freq = previous_HM.H[:, 2, frame]
    windowed_data = (window.data*data).reshape(window.length, 1)

    # Set a magnitude reference, which is used to detect and supress
    # erroneous magnitude spikes.
    mag_ref = np.abs(previous_HM.H[0, 0, frame])

    # Ajust the phase frame.
    extrap_phase_center = previous_HM.extrap_phase[:, frame].reshape(
                                                        previous_HM.n_harm, 1)
    phase_frame = previous_HM.extrap_phase[:, frame-window.N:frame+window.N+1] - \
                    extrap_phase_center

    # Initialize the coefficients.
    coef = np.vstack((previous_HM.H[:, 0, frame].reshape(previous_HM.n_harm, 1) *
        np.exp(1j*extrap_phase_center), np.zeros((previous_HM.n_harm, 1))))[:, 0]

    # Initialize the matrices.
    E = np.ones((window.length, 2*previous_HM.n_harm), dtype=complex)
    E_ro = np.ones((window.length, 2*previous_HM.n_harm), dtype=complex)
    E_windowed = np.ones((window.length, 2*previous_HM.n_harm), dtype=complex)

    E[:, :previous_HM.n_harm] = np.exp(1j*phase_frame.T)

    # If the eaQHM algorithm was selected, ajust the exponential matrix with
    # the normalized magnitude.
    if eaQHM_flag:
        mag_center = previous_HM.H[:, 0, frame].reshape(previous_HM.n_harm, 1)
        mag_frame = previous_HM.H[:, 0, frame-window.N:frame+window.N+1] / \
                    mag_center
        E[:, :previous_HM.n_harm] = mag_frame.T*E[:, :previous_HM.n_harm]

    E[:, previous_HM.n_harm:] = E[:, :previous_HM.n_harm] * \
                                window.len_vec.reshape(window.length, 1)

    # Run the aQHM/eaQHM algorithm N_iter times.
    for k in range(N_iter):

        # Calculate the a and b coeficients via least-squares.
        coef = least_squares(E, E_windowed, windowed_data, window,
                             previous_HM.n_harm)

        # Updates the frequency values.
        freq, ro = freq_correction(coef[:previous_HM.n_harm],
                                   coef[previous_HM.n_harm:], freq, f0_ref,
                                   mag_ref, previous_HM.n_harm, max_step, fs)

        # Updates the complex exponentials matrix.
        E = E*exp_matrix(E_ro, ro/(2*np.pi), window, previous_HM.n_harm)

    # Compute the final coefficients values.
    coef = least_squares(E, E_windowed, windowed_data, window,
                         previous_HM.n_harm)

    # This part is a workaround not present in the original references [1-3].
    # It was created to detect and supress erroneous magnitude spikes, which
    # degradate the final synthsized signal and consequently, its SRER.
    # Alternatively, the magnitude signals could be smoothed after extraction.
    # For more details, check the README file.
    cond = (np.abs(coef[:previous_HM.n_harm]) < 5.5*mag_ref)
    if not cond.all():
        freq[~cond] = f0_ref[~cond]

        # Since that the troubling aQHM/eaQHM exponentials are degradating the
        # results, they are replaced by the QHM version, which is more stable.
        E[:, ~np.append(cond, cond)] = exp_matrix(E_ro, freq, window,
                              previous_HM.n_harm)[:, ~np.append(cond, cond)]

        # Recalculate the final coefficients.
        coef = least_squares(E, E_windowed, windowed_data, window,
                             previous_HM.n_harm)

    # Calculate the mean squared error between the original frame and the
    # synthsized one.
    err = error_calc(windowed_data, E, coef, window)

    return coef, freq, err

"""
--------------------------------------------
            Auxiliary Functions.
--------------------------------------------
"""

"""
Calculate the a and b coeficients via least-squares method.
"""

def least_squares(E, E_windowed, windowed_data, window, K):

    R = np.zeros((2*K, 2*K), dtype=complex)
    B = np.zeros((window.length, 1), dtype=complex)

    E_windowed[:, :] = E*window.data.reshape(window.length, 1)
    R = E_windowed.conj().T.dot(E_windowed)
    B = E_windowed.conj().T.dot(windowed_data)

    coef = np.linalg.solve(R, B)[:, 0]

    return coef

"""
Calculates the frequency mismatch and updates the frequency values.
"""

def freq_correction(a, b, freq, f0_ref, mag_ref, n_harm, max_step, fs):

    old_freq = np.zeros(n_harm)
    old_freq[:] = freq[:]

    ro = (a.real*b.imag-a.imag*b.real)/(np.abs(a)*np.abs(a))

    # If the mismatch is too high (>20Hz), the frequency update is satured to
    # this value. This avoids big fluctuations, which can spoil the algorithms
    # convergence as whole.
    over_ro = np.abs(ro) > max_step*2*np.pi/fs
    ro[over_ro] = np.sign(ro[over_ro])*(max_step*2*np.pi/fs)
    freq = freq+ro/(2*np.pi)

    # Checks whether each component frequency lies within its spectral band and
    # also checks whether there are magnitude spikes.
    cond = ((np.round(freq/f0_ref[0]) != np.arange(n_harm)+1) |
            (freq > 0.5) | (freq < 0) | (np.abs(a) > 5.5*mag_ref))

    freq[cond] = f0_ref[cond]

    return freq, (freq-old_freq)*(2*np.pi)

"""
Calculate the mean squared error between the original frame and the
synthsized one.
"""

def error_calc(windowed_data, E, coef, window):
    h = E.dot(coef)

    err = np.sum((windowed_data-2*h.real*window.data)**2)

    return err

"""
Mounts the complex exponentials matrix.
"""

def exp_matrix(E, freq, window, K):

    E[window.N+1:, :K] = np.exp(1j*np.pi*2*freq)
    E[window.N+1:, :K] = np.cumprod(E[window.N+1:, :K], axis=0)
    E[:window.N, :K] = np.conj(E[window.N+1:, :K][::-1, :])

    E[:, K:] = E[:, :K]*window.len_vec.reshape(window.length, 1)

    return E

"""
Some side functions found in reference [2].
"""

def g0(x, N):
    if x != 0:
        return np.sin((2*N+1)*x/2)/np.sin(x/2)
    else:
        return 2*N+1

def g1(x, N):
    if x != 0:
        return 1j*((np.sin(N*x)/(2*np.sin(x/2)**2)) -
                   N*(np.cos((2*N+1)*x/2)/np.sin(x/2)))
    else:
        return 0

def R_eq(delta_f, func, window):
    return (window.a0*func(2*np.pi*delta_f, window.N) +
            func(2*np.pi*(delta_f+1./(2*window.N)), window.N)*window.a1 +
            func(2*np.pi*(delta_f-1./(2*window.N)), window.N)*window.a1 +
            func(2*np.pi*(delta_f+1./window.N), window.N)*window.a2 +
            func(2*np.pi*(delta_f-1./window.N), window.N)*window.a2)