python source code of diarizationFunctions

# AUTHORS
# Jose PATINO, EURECOM, Sophia-Antipolis, France, 2019
# http://www.eurecom.fr/en/people/patino-jose
# Contact: patino[at]eurecom[dot]fr, josempatinovillar[at]gmail[dot]com

# DIARIZATION FUNCTIONS

import numpy as np
    
def extractFeatures(audioFile,framelength,frameshift,nfilters,ncoeff):
    import librosa
    y, sr = librosa.load(audioFile,sr=None)
    frame_length_inSample=framelength*sr
    hop = int(frameshift*sr)
    NFFT=int(2**np.ceil(np.log2(frame_length_inSample)))
    if sr>=16000:
        features=librosa.feature.mfcc(y=y, sr=sr,dct_type=2,n_mfcc=ncoeff,n_mels=nfilters,n_fft=NFFT,hop_length=hop,fmin=20,fmax=7600).T
    else:
        features=librosa.feature.mfcc(y=y, sr=sr,dct_type=2,n_mfcc=ncoeff,n_mels=nfilters,n_fft=NFFT,hop_length=hop).T
    return features

def getFeatures(featureFile):
    features = HTKFile()
    features.load(featureFile)
    allData = np.asarray(features.data)
    return allData

def readUEMfile(path,filename,ext,nFeatures,frameshift):    
    uemFile = path + filename + ext
    notEvaluatedFramesMask=np.zeros([1, nFeatures])    
    f=open(uemFile,'r')
    C=f.read().splitlines()
    f.close()    
    initTime = []
    endTime = []
    if C:
        if len(C[0])>1:
            for idx,x in enumerate(C):        
                initTime=np.append(initTime,float(x.strip().split(' ')[2]))
                endTime=np.append(endTime,float(x.strip().split(' ')[3]))
    else:
        print('UEM annotations file was empty. The complete file is considered evaluable.')
        
    if type(initTime) is list:
        notEvaluatedFramesMask = np.ones([1,nFeatures])
    else:
        initTime = np.floor(initTime/frameshift)        
        endTime = np.floor(endTime/frameshift)        
        for idxT,x in np.ndenumerate(initTime):
            it=initTime[idxT]        
            if endTime[idxT]>nFeatures:
                et = nFeatures
            else:
                et = endTime[idxT]        
            notEvaluatedFramesMask[0,int(it):int(et)]=1
    return notEvaluatedFramesMask
  
def readSADfile(path,filename,ext,nFeatures,frameshift, format):   
    sadFile = path + filename + ext
    notEvaluatedFramesMask=np.zeros([1, nFeatures])    
    f=open(sadFile,'r')
    C=f.read().splitlines()
    f.close()    
    initTime = []
    endTime = []
    if C:
        if len(C[0])>1:
            if format=='LBL':
                for idx,x in enumerate(C):        
                    initTime=np.append(initTime,float(x.strip().split(' ')[0]))
                    endTime=np.append(endTime,float(x.strip().split(' ')[1]))
            elif format=='RTTM':
                for idx,x in enumerate(C):        
                    initTime=np.append(initTime,float(x.strip().split(' ')[3]))
                    endTime=np.append(endTime,float(x.strip().split(' ')[3])+float(x.strip().split(' ')[4]))         
            elif format=='MDTM':
                for idx,x in enumerate(C):        
                    initTime=np.append(initTime,float(x.strip().split(' ')[2]))
                    endTime=np.append(endTime,float(x.strip().split(' ')[2])+float(x.strip().split(' ')[3]))
    else:
        print('SAD annotations file was empty. The complete file is considered speech.')
    
    if type(initTime) is list:
        notEvaluatedFramesMask = np.ones([1,nFeatures])        
    else:        
        initTime = np.round(initTime/frameshift)        
        endTime = np.round(endTime/frameshift)        
        for idxT,x in np.ndenumerate(initTime):
            it=initTime[idxT]        
            if endTime[idxT]>nFeatures:
                et = nFeatures
            else:
                et = endTime[idxT]        
            notEvaluatedFramesMask[0,int(it):int(et)]=1
    return notEvaluatedFramesMask

def getSADfile(config,filename,nFeatures):
    
    """ The following VAD applying procedure is adapted from the repository
    provided as part of the DIHARD II challenge speech enhancement system:    
    https://github.com/staplesinLA/denoising_DIHARD18/blob/master/main_get_vad.py"""
    
    import os
    from librosa import load
    data, sr = load(config['PATH']['audio']+filename+config['EXTENSION']['audio'],sr=None)    
    va_framed = py_webrtcvad(data, fs=sr, fs_vad=sr, hoplength=30, vad_mode=0)
    segments = get_py_webrtcvad_segments(va_framed,sr)
    if not os.path.isdir(config['PATH']['SAD']):
        os.mkdir(config['PATH']['SAD'])     
    if os.path.isfile(config['PATH']['SAD']+filename+'.lab'):
        os.remove(config['PATH']['SAD']+filename+'.lab')    
    output_file = open(config['PATH']['SAD']+filename+'.lab','w')
    for i in range(segments.shape[0]):
        start_time = segments[i][0]
        end_time =  segments[i][1]
        output_file.write("%.3f %.3f speech\n" % (start_time, end_time))
    output_file.close()
    maskSAD = readSADfile(config['PATH']['SAD'],filename,'.lab',nFeatures,config.getfloat('FEATURES','frameshift'),'LBL')   
    return maskSAD

def py_webrtcvad(data, fs, fs_vad, hoplength=30, vad_mode=0):    
    import webrtcvad
    from librosa.core import resample
    from librosa.util import frame
    """ Voice activity detection.
    This was implementioned for easier use of py-webrtcvad.
    Thanks to: https://github.com/wiseman/py-webrtcvad.git
    Parameters
    ----------
    data : ndarray
        numpy array of mono (1 ch) speech data.
        1-d or 2-d, if 2-d, shape must be (1, time_length) or (time_length, 1).
        if data type is int, -32768 < data < 32767.
        if data type is float, -1 < data < 1.
    fs : int
        Sampling frequency of data.
    fs_vad : int, optional
        Sampling frequency for webrtcvad.
        fs_vad must be 8000, 16000, 32000 or 48000.
        Default is 16000.
    hoplength : int, optional
        Step size[milli second].
        hoplength must be 10, 20, or 30.
        Default is 0.1.
    vad_mode : int, optional
        set vad aggressiveness.
        As vad_mode increases, it becomes more aggressive.
        vad_mode must be 0, 1, 2 or 3.
        Default is 0.
    Returns
    -------
    vact : ndarray
        voice activity. time length of vact is same as input data.
        If 0, it is unvoiced, 1 is voiced.
    """

    # check argument
    if fs_vad not in [8000, 16000, 32000, 48000]:
        raise ValueError('fs_vad must be 8000, 16000, 32000 or 48000.')

    if hoplength not in [10, 20, 30]:
        raise ValueError('hoplength must be 10, 20, or 30.')

    if vad_mode not in [0, 1, 2, 3]:
        raise ValueError('vad_mode must be 0, 1, 2 or 3.')

    # check data
    if data.dtype.kind == 'i':
        if data.max() > 2**15 - 1 or data.min() < -2**15:
            raise ValueError(
                'when data type is int, data must be -32768 < data < 32767.')
        data = data.astype('f')

    elif data.dtype.kind == 'f':
        if np.abs(data).max() >= 1:
            data = data / np.abs(data).max() * 0.9
            warnings.warn('input data was rescaled.')
        data = (data * 2**15).astype('f')
    else:
        raise ValueError('data dtype must be int or float.')

    data = data.squeeze()
    if not data.ndim == 1:
        raise ValueError('data must be mono (1 ch).')

    # resampling
    if fs != fs_vad:
        resampled = resample(data, fs, fs_vad)
    else:
        resampled = data

    resampled = resampled.astype('int16')

    hop = fs_vad * hoplength // 1000
    framelen = resampled.size // hop + 1
    padlen = framelen * hop - resampled.size
    paded = np.lib.pad(resampled, (0, padlen), 'constant', constant_values=0)
    framed = frame(paded, frame_length=hop, hop_length=hop).T    
    
    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)
    valist = [vad.is_speech(tmp.tobytes(), fs_vad) for tmp in framed]

    hop_origin = fs * hoplength // 1000
    va_framed = np.zeros([len(valist), hop_origin])
    va_framed[valist] = 1

    return va_framed.reshape(-1)[:data.size]

def get_py_webrtcvad_segments(vad_info,fs):
    vad_index = np.where( vad_info==1.0) # find the speech index   
    vad_diff = np.diff( vad_index)
            
    vad_temp = np.zeros_like(vad_diff)
    vad_temp[ np.where(vad_diff==1) ]  = 1       
    vad_temp =  np.column_stack( (np.array([0]), vad_temp, np.array([0]) ))
    final_index = np.diff(vad_temp)
            
    starts = np.where( final_index == 1)
    ends = np.where( final_index == -1)
      
    sad_info = np.column_stack( (starts[1], ends[1]) ) 
    vad_index = vad_index[0]
    
    segments = np.zeros_like(sad_info,dtype=np.float)
    for i in range(sad_info.shape[0]):
        segments[i][0] = float( vad_index [ sad_info[i][0] ] ) / fs
        segments[i][1] =  float( vad_index[ sad_info[i][1] ]  +1 ) / fs
 
    return  segments  # present in seconds
  
def getSegmentTable(mask, speechMapping, wLength, wIncr, wShift):
    changePoints,segBeg,segEnd,nSegs = unravelMask(mask)  
    segmentTable = np.empty([0,4])
    for i in range(nSegs):
        begs = np.arange(segBeg[i],segEnd[i],wShift)
        bbegs = np.maximum(segBeg[i],begs-wIncr)
        ends = np.minimum(begs+wLength-1,segEnd[i])
        eends = np.minimum(ends+wIncr,segEnd[i])
        segmentTable=np.vstack((segmentTable,np.vstack((bbegs, begs, ends, eends)).T))
    return segmentTable

def unravelMask(mask):
    changePoints = np.diff(1*mask)
    segBeg=np.where(changePoints==1)[0]+1
    segEnd=np.where(changePoints==-1)[0]    
    if mask[0]==1:
        segBeg = np.insert(segBeg,0,0)
    if mask[-1]==1:
        segEnd = np.append(segEnd,np.size(mask)-1)
    nSegs = np.size(segBeg)  
    return changePoints,segBeg,segEnd,nSegs  
    
def trainKBM(data, windowLength, windowRate, kbmSize):  
    from scipy.stats import multivariate_normal
    from scipy.spatial.distance import cdist           
    # Calculate number of gaussian components in the whole gaussian pool
    numberOfComponents = int(np.floor((np.size(data,0)-windowLength)/windowRate))
    # Add new array for storing the mvn objects
    gmPool = []
    likelihoodVector = np.zeros((numberOfComponents, 1))
    muVector = np.zeros((numberOfComponents,np.size(data,1)))
    sigmaVector = np.zeros((numberOfComponents,np.size(data,1)))    
    for i in range(numberOfComponents):
        mu = np.mean(data[np.arange((i*windowRate),(i*windowRate+windowLength),1,int)],axis=0)
        std = np.std(data[np.arange((i*windowRate),(i*windowRate+windowLength),1,int)],axis=0)
        muVector[i], sigmaVector[i] = mu, std
        mvn = multivariate_normal(mu,std)
        gmPool.append(mvn)        
        likelihoodVector[i] = -np.sum(mvn.logpdf(data[np.arange((i*windowRate),(i*windowRate+windowLength),1,int)]))        
    # Define the global dissimilarity vector
    v_dist = np.inf*np.ones((numberOfComponents,1))
    # Create the kbm itself, which is a vector of kbmSize size, and contains the gaussian IDs of the components
    kbm = np.zeros((kbmSize,1));    
    # As the stored likelihoods are negative, get the minimum likelihood
    bestGaussianID = np.where(likelihoodVector==np.min(likelihoodVector))[0]    
    currentGaussianID = bestGaussianID
    kbm[0]=currentGaussianID
    v_dist[currentGaussianID]=-np.inf
    # Compare the current gaussian with the remaining ones
    dpairsAll = cdist(muVector,muVector,metric='cosine')
    np.fill_diagonal(dpairsAll,-np.inf)
    for j in range(1,kbmSize):
        dpairs = dpairsAll[currentGaussianID]    
        v_dist=np.minimum(v_dist,dpairs.T)
        # Once all distances are computed, get the position with highest value
        # set this position to 1 in the binary KBM ponemos a 1 en el vector kbm
        # store the gaussian ID in the KBM
        currentGaussianID = np.where(v_dist==np.max(v_dist))[0]
        kbm[j]=currentGaussianID
        v_dist[currentGaussianID]=-np.inf  
    return [kbm, gmPool]
    
def getVgMatrix(data, gmPool, kbm, topGaussiansPerFrame):
    print('Calculating log-likelihood table... ',end='')
    logLikelihoodTable = getLikelihoodTable(data,gmPool,kbm)
    print('done')    
    print('Calculating Vg matrix... ',end='')
    Vg = np.argsort(-logLikelihoodTable)[:,0:topGaussiansPerFrame]
    print('done')
    return Vg

def getLikelihoodTable(data,gmPool,kbm):
    # GETLIKELIHOODTABLE computes the log-likelihood of each feature in DATA
    # against all the Gaussians of GMPOOL specified by KBM vector    
    # Inputs:
    #   DATA = matrix of feature vectors
    #   GMPOOL = pool of Gaussians of the kbm model
    #   KBM = vector of the IDs of the actual Gaussians of the KBM
    # Output:
    #   LOGLIKELIHOODTABLE = NxM matrix storing the log-likelihood of each of
    #   the N features given each of th M Gaussians in the KBM    
    kbmSize = np.size(kbm,0)
    logLikelihoodTable = np.zeros([np.size(data,0),kbmSize])    
    for i in range(kbmSize):
        logLikelihoodTable[:,i]=gmPool[int(kbm[i])].logpdf(data)
    return logLikelihoodTable
  
def getSegmentBKs(segmentTable, kbmSize, Vg, bitsPerSegmentFactor, speechMapping):
    # GETSEGMENTBKS converts each of the segments in SEGMENTTABLE into a binary key
    # and/or cumulative vector.

    # Inputs:
    #   SEGMENTTABLE = matrix containing temporal segments returned by 'getSegmentTable' function
    #   KBMSIZE = number of components in the kbm model
    #   VG = matrix of the top components per frame returned by 'getVgMatrix' function
    #   BITSPERSEGMENTFACTOR = proportion of bits that will be set to 1 in the binary keys
    # Output:
    #   SEGMENTBKTABLE = NxKBMSIZE matrix containing N binary keys for each N segments in SEGMENTTABLE
    #   SEGMENTCVTABLE = NxKBMSIZE matrix containing N cumulative vectors for each N segments in SEGMENTTABLE  
    
    numberOfSegments = np.size(segmentTable,0)
    segmentBKTable = np.zeros([numberOfSegments,kbmSize])
    segmentCVTable = np.zeros([numberOfSegments,kbmSize])    
    for i in range(numberOfSegments):
        # Conform the segment according to the segmentTable matrix       
        beginningIndex = int(segmentTable[i,0])
        endIndex = int(segmentTable[i,3])
        # Store indices of features of the segment
        # speechMapping is substracted one because 1-indexing is used for this variable
        A = np.arange(speechMapping[beginningIndex]-1,speechMapping[endIndex],dtype=int)
        segmentBKTable[i], segmentCVTable[i] = binarizeFeatures(kbmSize, Vg[A,:], bitsPerSegmentFactor)
    print('done')
    return segmentBKTable, segmentCVTable

def binarizeFeatures(binaryKeySize, topComponentIndicesMatrix, bitsPerSegmentFactor):
    # BINARIZEMATRIX Extracts a binary key and a cumulative vector from the the
    # rows of VG specified by vector A
    
    # Inputs:
    #   BINARYKEYSIZE = binary key size
    #   TOPCOMPONENTINDICESMATRIX = matrix of top Gaussians per frame
    #   BITSPERSEGMENTFACTOR = Proportion of positions of the binary key which will be set to 1
    # Output:
    #   BINARYKEY = 1xBINARYKEYSIZE binary key
    #   V_F = 1xBINARYKEYSIZE cumulative vector
    numberOfElementsBinaryKey = np.floor(binaryKeySize * bitsPerSegmentFactor)    
    # Declare binaryKey
    binaryKey = np.zeros([1, binaryKeySize])
    # Declare cumulative vector v_f
    v_f = np.zeros([1, binaryKeySize])    
    unique, counts = np.unique(topComponentIndicesMatrix, return_counts=True)    
    # Fill CV
    v_f[:,unique]=counts    
    # Fill BK
    binaryKey[0,np.argsort(-v_f)[0][0:int(numberOfElementsBinaryKey)]]=1    
    # CV normalization
    v_f = v_f/np.sum(v_f)    
    return binaryKey, v_f

def performClusteringLinkage(segmentBKTable, segmentCVTable, N_init, linkageCriterion,linkageMetric ):
    from scipy.cluster.hierarchy import linkage
    from scipy import cluster
    if linkageMetric == 'jaccard':
      observations = segmentBKTable
    elif linkageMetric == 'cosine':
      observations = segmentCVTable
    else:
      observations = segmentCVTable      
    clusteringTable = np.zeros([np.size(segmentCVTable,0),N_init]) 
    Z = linkage(observations,method=linkageCriterion,metric=linkageMetric)
    for i in np.arange(N_init):
      clusteringTable[:,i] = cluster.hierarchy.cut_tree(Z,N_init-i).T+1  
    k=N_init
    print('done')
    return clusteringTable, k
   
def performClustering( speechMapping, segmentTable, segmentBKTable, segmentCVTable, Vg, bitsPerSegmentFactor, kbmSize, N_init, initialClustering, clusteringMetric):
    numberOfSegments = np.size(segmentTable,0)
    clusteringTable = np.zeros([numberOfSegments, N_init])
    finalClusteringTable = np.zeros([numberOfSegments, N_init])
    activeClusters = np.ones([N_init,1]) 
    clustersBKTable = np.zeros([N_init, kbmSize])
    clustersCVTable = np.zeros([N_init, kbmSize])    
    clustersBKTable, clustersCVTable = calcClusters(clustersCVTable,clustersBKTable,activeClusters,initialClustering,N_init,segmentTable,kbmSize,speechMapping,Vg,bitsPerSegmentFactor)   
    ####### Here the clustering algorithm begins. Steps are:
    ####### 1. Reassign all data among all existing signatures and retrain them
    ####### using the new clustering
    ####### 2. Save the resulting clustering solution
    ####### 3. Compare all signatures with each other and merge those two with
    ####### highest similarity, creating a new signature for the resulting
    ####### cluster
    ####### 4. Back to 1 if #clusters > 1      
    for k in range(N_init):
    ####### 1. Data reassignment. Calculate the similarity between the current segment with all clusters and assign it to the one which maximizes
    ####### the similarity. Finally re-calculate binaryKeys for all cluster   
    # before doing anything, check if there are remaining clusters
    # if there is only one active cluster, break    
        if np.sum(activeClusters)==1:
            break          
        clustersStillActive=np.zeros([1,N_init])
        segmentToClustersSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric,segmentBKTable,segmentCVTable,clustersBKTable,clustersCVTable)
        # clusteringTable[:,k] = finalClusteringTable[:,k] = np.argmax(segmentToClustersSimilarityMatrix,axis=1)+1
        clusteringTable[:,k] = finalClusteringTable[:,k] = np.nanargmax(segmentToClustersSimilarityMatrix,axis=1)+1
        # clustersStillActive[:,np.unique(clusteringTable[:,k]).astype(int)-1] = 1
        clustersStillActive[:,np.unique(clusteringTable[:,k]).astype(int)-1] = 1       
        ####### update all binaryKeys for all new clusters        
        activeClusters = clustersStillActive
        clustersBKTable, clustersCVTable = calcClusters(clustersCVTable,clustersBKTable,activeClusters.T,clusteringTable[:,k].astype(int),N_init,segmentTable,kbmSize,speechMapping,Vg,bitsPerSegmentFactor)                
        ####### 2. Compare all signatures with each other and merge those two with highest similarity, creating a new signature for the resulting        
        clusterSimilarityMatrix = binaryKeySimilarity_cdist(clusteringMetric,clustersBKTable,clustersCVTable,clustersBKTable,clustersCVTable)        
        np.fill_diagonal(clusterSimilarityMatrix,np.nan)        
        value = np.nanmax(clusterSimilarityMatrix)
        location = np.nanargmax(clusterSimilarityMatrix)        
        R,C = np.unravel_index(location,(N_init,N_init))        
        ### Then we merge clusters R and C
        #print('Merging clusters',R+1,'and',C+1,'with a similarity score of',np.around(value,decimals=4))
        print('Merging clusters','%3s'%str(R+1),'and','%3s'%str(C+1),'with a similarity score of',np.around(value,decimals=4))
        activeClusters[0,C]=0        
        ### 3. Save the resulting clustering and go back to 1 if the number of clusters >1
        mergingClusteringIndices = np.where(clusteringTable[:,k]==C+1)
        # update clustering table
        clusteringTable[mergingClusteringIndices[0],k]=R+1
        # remove binarykey for removed cluster
        clustersBKTable[C,:]=np.zeros([1,kbmSize])
        clustersCVTable[C,:]=np.nan
        # prepare the vector with the indices of the features of thew new cluster and then binarize
        segmentsToBinarize = np.where(clusteringTable[:,k]==R+1)[0]
        M=[]
        for l in np.arange(np.size(segmentsToBinarize,0)):
            M = np.append(M,np.arange(int(segmentTable[segmentsToBinarize][:][l,1]),int(segmentTable[segmentsToBinarize][:][l,2])+1))
        clustersBKTable[R,:], clustersCVTable[R,:]=binarizeFeatures(kbmSize,Vg[np.array(speechMapping[np.array(M,dtype='int')],dtype='int')-1].T,bitsPerSegmentFactor)
    print('done')
    return clusteringTable, k
  
def calcClusters(clustersCVTable,clustersBKTable,activeClusters,clusteringTable,N_init,segmentTable,kbmSize,speechMapping,Vg,bitsPerSegmentFactor):    
    for i in np.arange(N_init):
        if activeClusters[i]==1:
            segmentsToBinarize = np.where(clusteringTable==i+1)[0]
            M = []
            for l in np.arange(np.size(segmentsToBinarize,0)):
                M = np.append(M,np.arange(int(segmentTable[segmentsToBinarize][:][l,1]),int(segmentTable[segmentsToBinarize][:][l,2])+1))
            clustersBKTable[i], clustersCVTable[i] = binarizeFeatures(kbmSize,Vg[np.array(speechMapping[np.array(M,dtype='int')],dtype='int')-1].T,bitsPerSegmentFactor)
        else:
            clustersBKTable[i]=np.zeros([1,kbmSize])
            clustersCVTable[i]=np.nan
    return clustersBKTable, clustersCVTable
  
def binaryKeySimilarity_cdist(clusteringMetric,bkT1, cvT1, bkT2, cvT2):
    from scipy.spatial.distance import cdist      
    if clusteringMetric == 'cosine':
      S = 1 - cdist(cvT1,cvT2,metric=clusteringMetric)
    elif clusteringMetric == 'jaccard':
      S = 1 - cdist(bkT1,bkT2,metric=clusteringMetric)
    else:
      print('Clustering metric must be cosine or jaccard')  
    return S
  
def getBestClustering(bestClusteringMetric, bkT, cvT, clusteringTable, n, maxNrSpeakers):
    from scipy.spatial.distance import cdist  
    wss = np.zeros([1,n])
    overallMean = np.mean(cvT,0)    
    if bestClusteringMetric == 'cosine':
        distances = cdist(np.expand_dims(overallMean,axis=0),cvT,bestClusteringMetric)
    elif bestClusteringMetric == 'jaccard':
        nBitsTol = np.sum(bkT[0,:])
        indices = np.argsort(-overallMean)
        overallMean = np.zeros([1,np.size(bkT,1)])
        overallMean[0,indices[np.arange(nBitsTol).astype(int)]]=1
        distances = cdist(overallMean,bkT,bestClusteringMetric)    
    distances2 = np.square(distances)    
    wss[0,n-1] = np.sum(distances2)    
    for i in np.arange(n-1):
        T = clusteringTable[:,i]
        clusterIDs = np.unique(T)
        variances = np.zeros([np.size(clusterIDs,0),1])
        for j in np.arange(np.size(clusterIDs,0)):
            clusterIDsIndex = np.where(T==clusterIDs[j])
            meanVector = np.mean(cvT[clusterIDsIndex,:],axis=1)
            if bestClusteringMetric=='cosine':
                distances = cdist(meanVector,cvT[clusterIDsIndex,:][0],bestClusteringMetric)
            elif bestClusteringMetric=='jaccard':
                indices = np.argsort(-meanVector[0,:])
                meanVector = np.zeros([1,np.size(bkT,1)])
                meanVector[0,indices[np.arange(nBitsTol).astype(int)]]=1
                distances = cdist(meanVector,bkT[clusterIDsIndex,:][0],bestClusteringMetric)
            distances2 = np.square(distances)    
            variances[j] = np.sum(distances2)
        wss[0,i]=np.sum(variances)        
    nPoints = np.size(wss,1)    
    allCoord = np.vstack((np.arange(1,nPoints+1),wss)).T
    firstPoint = allCoord[0,:]
    allCoord = allCoord[np.arange(np.where(allCoord[:,1]==np.min(allCoord[:,1]))[0],nPoints),:]
    nPoints = np.size(allCoord,0)
    lineVec = allCoord[-1,:] - firstPoint
    lineVecN = lineVec / np.sqrt(np.sum(np.square(lineVec)))
    vecFromFirst = np.subtract(allCoord,firstPoint)
    scalarProduct = vecFromFirst*lineVecN
    scalarProduct = scalarProduct[:,0]+scalarProduct[:,1]
    vecFromFirstParallel = np.expand_dims(scalarProduct,axis=1) * np.expand_dims(lineVecN,0)
    vecToLine = vecFromFirst - vecFromFirstParallel
    distToLine = np.sqrt(np.sum(np.square(vecToLine),axis=1))
    bestClusteringID = allCoord[np.argmax(distToLine)][0]
    nrSpeakersPerSolution = np.zeros((clusteringTable.shape[1]))
    for k in np.arange(clusteringTable.shape[1]):
        nrSpeakersPerSolution[k] = np.size(np.unique(clusteringTable[:,k]))
    bestClusteringID = np.maximum(np.where(nrSpeakersPerSolution==np.maximum(np.minimum(maxNrSpeakers,np.max(nrSpeakersPerSolution))-1,1))[0][0],bestClusteringID)
    return bestClusteringID
  
def getSpectralClustering(bestClusteringMetric,clusteringTable,N_init, bkT, cvT, n, sigma, percentile,maxNrSpeakers):
    from scipy.ndimage.filters import gaussian_filter
    simMatrix = binaryKeySimilarity_cdist(bestClusteringMetric,bkT,cvT,bkT,cvT)
    np.fill_diagonal(simMatrix,np.nan)       
    np.fill_diagonal(simMatrix,np.nanmax(simMatrix,1))    
    # Similarity matrix smoothed through Gaussian filter
    simMatrix_1 = gaussian_filter(simMatrix,sigma)
    # Similarity matrix thresholded to the percentile-th components leaving a small amount instead of 0 following Google's implementation
    thresholds = np.tile(percentile*np.nanmax(simMatrix,1)/100,(np.size(bkT,0),1)).T
    simMatrix_2 = np.copy(simMatrix_1)
    mask = simMatrix_2<thresholds
    simMatrix_2[np.where(mask==True)] = simMatrix_2[np.where(mask==True)]*0.01
    # Similarity matrix made symmetric
    simMatrix_3 = np.copy(simMatrix_2)
    for k in np.arange(np.size(simMatrix_3,0)):
        # for j in np.arange(np.size(simMatrix_3,)):
        maximum = np.maximum(simMatrix_3[:,k],simMatrix_3[k,:])
        simMatrix_3[:,k] = simMatrix_3[k,:] = maximum
    # Similarity matrix diffussion
    simMatrix_4 = np.dot(simMatrix_3,simMatrix_3)   
    # Row-wise max normalization
    simMatrix_5 = simMatrix_4 / np.tile(simMatrix_4.max(axis=0),(np.size(simMatrix_4,0),1)).T
    # Decomposition in eigenvalues, we don't use eigenvectors for the moment
    eigenvalues,eigenvectors = np.linalg.eigh(simMatrix_5)
    new_N_init = np.minimum(maxNrSpeakers,N_init)
    eigenvalues = np.flip(np.sort(eigenvalues),axis=0)[0:new_N_init]
    if new_N_init > 1:
        eigengaps = eigenvalues[0:new_N_init-1]/eigenvalues[1:new_N_init]
        eigengapssubstract = eigenvalues[0:new_N_init-1] - eigenvalues[1:new_N_init] 
    else:
        eigengaps = eigengapssubstract = eigenvalues       
    if eigengapssubstract[0] > 340:
        kclusters = 1
    else:
        eigengaps[0]=0  
        kclusters = np.flip(np.argsort(eigengaps),axis=0)[0]+1  
    
    nrElements = np.zeros([N_init,1])
    for k in np.arange(N_init):
        nrElements[k]=np.size(np.unique(clusteringTable[:,k]),0)
    distances = np.abs(nrElements-kclusters)    
    minidx = np.argmin(distances)
    bestClusteringID = minidx  
    return bestClusteringID

def smooth(a,WSZ):
    # a: NumPy 1-D array containing the data to be smoothed
    # WSZ: smoothing window size needs, which must be odd number,
    # as in the original MATLAB implementation
    #From https://stackoverflow.com/a/40443565
    out0 = np.convolve(a,np.ones(WSZ,dtype=int),'valid')/WSZ    
    r = np.arange(1,WSZ-1,2)
    start = np.cumsum(a[:WSZ-1])[::2]/r
    stop = (np.cumsum(a[:-WSZ:-1])[::2]/r)[::-1]
    return np.concatenate((start,out0,stop))
  
def performResegmentation(data, speechMapping,mask,finalClusteringTable,segmentTable,modelSize,nbIter,smoothWin,numberOfSpeechFeatures):        
    from sklearn import mixture
    np.random.seed(0)
    
    changePoints,segBeg,segEnd,nSegs = unravelMask(mask) 
    speakerIDs = np.unique(finalClusteringTable)
    trainingData = np.empty([2,0])      
    for i in np.arange(np.size(speakerIDs,0)):
        spkID = speakerIDs[i]
        speakerFeaturesIndxs = []
        idxs = np.where(finalClusteringTable==spkID)[0]
        for l in np.arange(np.size(idxs,0)):
            speakerFeaturesIndxs = np.append(speakerFeaturesIndxs,np.arange(int(segmentTable[idxs][:][l,1]),int(segmentTable[idxs][:][l,2])+1))
        formattedData = np.vstack((np.tile(spkID,(1,np.size(speakerFeaturesIndxs,0))),speakerFeaturesIndxs))
        trainingData = np.hstack((trainingData,formattedData))
    
    llkMatrix = np.zeros([np.size(speakerIDs,0),numberOfSpeechFeatures])
    for i in np.arange(np.size(speakerIDs,0)):
        spkIdxs = np.where(trainingData[0,:]==speakerIDs[i])[0]
        spkIdxs = speechMapping[trainingData[1,spkIdxs].astype(int)].astype(int)-1
        msize = np.minimum(modelSize,np.size(spkIdxs,0))
        w_init = np.ones([msize])/msize
        m_init = data[spkIdxs[np.random.randint(np.size(spkIdxs,0), size=(1, msize))[0]],:]
        gmm=mixture.GaussianMixture(n_components=msize,covariance_type='diag',weights_init=w_init,means_init=m_init,verbose=0)
        gmm.fit(data[spkIdxs,:])
        llkSpk = gmm.score_samples(data)
        llkSpkSmoothed = np.zeros([1,numberOfSpeechFeatures])      
        for jx in np.arange(nSegs):
            sectionIdx = np.arange(speechMapping[segBeg[jx]]-1,speechMapping[segEnd[jx]]).astype(int)
            sectionWin = np.minimum(smoothWin,np.size(sectionIdx))
            if sectionWin % 2 ==0:
                sectionWin = sectionWin - 1
            if sectionWin>=2:
                llkSpkSmoothed[0,sectionIdx] = smooth(llkSpk[sectionIdx], sectionWin)
            else:
                llkSpkSmoothed[0,sectionIdx]=llkSpk[sectionIdx]
        llkMatrix[i,:] = llkSpkSmoothed[0].T
    segOut = np.argmax(llkMatrix,axis=0)+1
    segChangePoints = np.diff(segOut)
    changes = np.where(segChangePoints!=0)[0]
    relSegEnds = speechMapping[segEnd]
    relSegEnds = relSegEnds[0:-1]
    changes = np.sort(np.unique(np.hstack((changes,relSegEnds))))    
    
    # Create the new segment and clustering tables
    currentPoint = 0
    finalSegmentTable = np.empty([0,4])
    finalClusteringTableResegmentation = np.empty([0,1])
    
    for i in np.arange(np.size(changes,0)):
        addedRow = np.hstack((np.tile(np.where(speechMapping==np.maximum(currentPoint,1))[0],(1,2)),  np.tile(np.where(speechMapping==np.maximum(1,changes[i].astype(int)))[0],(1,2))))
        finalSegmentTable = np.vstack((finalSegmentTable,addedRow[0]))    
        finalClusteringTableResegmentation = np.vstack((finalClusteringTableResegmentation,segOut[(changes[i]).astype(int)]))
        currentPoint = changes[i]+1
    addedRow = np.hstack((np.tile(np.where(speechMapping==currentPoint)[0],(1,2)),  np.tile(np.where(speechMapping==numberOfSpeechFeatures)[0],(1,2))))
    finalSegmentTable = np.vstack((finalSegmentTable,addedRow[0]))
    finalClusteringTableResegmentation = np.vstack((finalClusteringTableResegmentation,segOut[(changes[i]+1).astype(int)]))    
    return finalClusteringTableResegmentation,finalSegmentTable  

def getSegmentationFile(format, frameshift,finalSegmentTable, finalClusteringTable, showName, filename, outputPath, outputExt):
    numberOfSpeechFeatures = finalSegmentTable[-1,2].astype(int)+1
    solutionVector = np.zeros([1,numberOfSpeechFeatures])
    for i in np.arange(np.size(finalSegmentTable,0)):
        solutionVector[0,np.arange(finalSegmentTable[i,1],finalSegmentTable[i,2]+1).astype(int)]=finalClusteringTable[i]
    seg = np.empty([0,3]) 
    solutionDiff = np.diff(solutionVector)[0]
    first = 0
    for i in np.arange(0,np.size(solutionDiff,0)):
        if solutionDiff[i]:
            last = i+1
            seg1 = (first)*frameshift
            seg2 = (last-first)*frameshift
            seg3 = solutionVector[0,last-1]
            if seg3:
                seg = np.vstack((seg,[seg1,seg2,seg3]))
            first = i+1
    last = np.size(solutionVector,1)
    seg1 = (first-1)*frameshift
    seg2 = (last-first+1)*frameshift
    seg3 = solutionVector[0,last-1]
    seg = np.vstack((seg,[seg1,seg2,seg3]))    
    solution = []
    if format=='MDTM':
        for i in np.arange(np.size(seg,0)):
            solution.append(showName+' 1 '+str(np.around(seg[i,0],decimals=4))+' '+str(np.around(seg[i,1],decimals=4))+' speaker NA unknown speaker'+str(seg[i,2].astype(int)))+'\n'
    elif format=='RTTM':
        for i in np.arange(np.size(seg,0)):
            solution.append('SPEAKER '+showName+' 1 '+str(np.around(seg[i,0],decimals=4))+' '+str(np.around(seg[i,1],decimals=4))+' <NA> <NA> speaker'+str(seg[i,2].astype(int))+' <NA>\n')
    else:
        print('Output file format must be MDTM or RTTM.')
    solution[-1]=solution[-1][0:-1]        
        
    outf = open(outputPath+filename+outputExt,"a")    
    outf.writelines(solution)
    outf.write('\n')
    outf.close()
    
class HTKFile:
    
    #Code from: https://github.com/danijel3/PyHTK
    
    """ Class to load binary HTK file.

        Details on the format can be found online in HTK Book chapter 5.7.1.

        Not everything is implemented 100%, but most features should be supported.

        Not implemented:
            CRC checking - files can have CRC, but it won't be checked for correctness

            VQ - Vector features are not implemented.
    """

    data = None
    nSamples = 0
    nFeatures = 0
    sampPeriod = 0
    basicKind = None
    qualifiers = None

    def load(self, filename):
        import struct
        """ Loads HTK file.

            After loading the file you can check the following members:

                data (matrix) - data contained in the file

                nSamples (int) - number of frames in the file

                nFeatures (int) - number if features per frame

                sampPeriod (int) - sample period in 100ns units (e.g. fs=16 kHz -> 625)

                basicKind (string) - basic feature kind saved in the file

                qualifiers (string) - feature options present in the file

        """
        with open(filename, "rb") as f:

            header = f.read(12)
            self.nSamples, self.sampPeriod, sampSize, paramKind = struct.unpack(">iihh", header)
            basicParameter = paramKind & 0x3F

            if basicParameter is 0:
                self.basicKind = "WAVEFORM"
            elif basicParameter is 1:
                self.basicKind = "LPC"
            elif basicParameter is 2:
                self.basicKind = "LPREFC"
            elif basicParameter is 3:
                self.basicKind = "LPCEPSTRA"
            elif basicParameter is 4:
                self.basicKind = "LPDELCEP"
            elif basicParameter is 5:
                self.basicKind = "IREFC"
            elif basicParameter is 6:
                self.basicKind = "MFCC"
            elif basicParameter is 7:
                self.basicKind = "FBANK"
            elif basicParameter is 8:
                self.basicKind = "MELSPEC"
            elif basicParameter is 9:
                self.basicKind = "USER"
            elif basicParameter is 10:
                self.basicKind = "DISCRETE"
            elif basicParameter is 11:
                self.basicKind = "PLP"
            else:
                self.basicKind = "ERROR"

            self.qualifiers = []
            if (paramKind & 0o100) != 0:
                self.qualifiers.append("E")
            if (paramKind & 0o200) != 0:
                qualifiers.append("N")
            if (paramKind & 0o400) != 0:
                self.qualifiers.append("D")
            if (paramKind & 0o1000) != 0:
                self.qualifiers.append("A")
            if (paramKind & 0o2000) != 0:
                self.qualifiers.append("C")
            if (paramKind & 0o4000) != 0:
                self.qualifiers.append("Z")
            if (paramKind & 0o10000) != 0:
                self.qualifiers.append("K")
            if (paramKind & 0o20000) != 0:
                self.qualifiers.append("0")
            if (paramKind & 0o40000) != 0:
                self.qualifiers.append("V")
            if (paramKind & 0o100000) != 0:
                self.qualifiers.append("T")

            if "C" in self.qualifiers or "V" in self.qualifiers or self.basicKind is "IREFC" or self.basicKind is "WAVEFORM":
                self.nFeatures = sampSize // 2
            else:
                self.nFeatures = sampSize // 4

            if "C" in self.qualifiers:
                self.nSamples -= 4

            if "V" in self.qualifiers:
                raise NotImplementedError("VQ is not implemented")

            self.data = []
            if self.basicKind is "IREFC" or self.basicKind is "WAVEFORM":
                for x in range(self.nSamples):
                    s = f.read(sampSize)
                    frame = []
                    for v in range(self.nFeatures):
                        val = struct.unpack_from(">h", s, v * 2)[0] / 32767.0
                        frame.append(val)
                    self.data.append(frame)
            elif "C" in self.qualifiers:

                A = []
                s = f.read(self.nFeatures * 4)
                for x in range(self.nFeatures):
                    A.append(struct.unpack_from(">f", s, x * 4)[0])
                B = []
                s = f.read(self.nFeatures * 4)
                for x in range(self.nFeatures):
                    B.append(struct.unpack_from(">f", s, x * 4)[0])

                for x in range(self.nSamples):
                    s = f.read(sampSize)
                    frame = []
                    for v in range(self.nFeatures):
                        frame.append((struct.unpack_from(">h", s, v * 2)[0] + B[v]) / A[v])
                    self.data.append(frame)
            else:
                for x in range(self.nSamples):
                    s = f.read(sampSize)
                    frame = []
                    for v in range(self.nFeatures):
                        val = struct.unpack_from(">f", s, v * 4)
                        frame.append(val[0])
                    self.data.append(frame)

            if "K" in self.qualifiers:
                print("CRC checking not implememented...")