Python scipy.cluster.vq.whiten() Examples

The following are 9 code examples of scipy.cluster.vq.whiten(), drawn from open-source projects. The source file and project are noted above each example. You may also want to check out all available functions/classes of the module scipy.cluster.vq, or try the search function.
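
As a quick orientation before the project examples: whiten() divides each column of an observation matrix by its standard deviation, so every feature ends up with unit variance; k-means is scale-sensitive, so this is the usual pre-processing step. A minimal, self-contained sketch (the data here is made up for illustration):

import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq

obs = np.array([[1.0,  200.0],
                [0.9,  180.0],
                [5.1, 1000.0],
                [4.8,  990.0]])
wobs = whiten(obs)                # every column now has unit variance
codebook, distortion = kmeans(wobs, 2)
labels, _ = vq(wobs, codebook)    # assign each observation to its nearest centroid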
Example #1
Source File: knn.py From deepnl with GNU General Public License v3.0
def Kmeans(file, vocabfile, k):
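  # NOTE: this excerpt relies on names defined elsewhere in knn.py
  # (embeddings, word_id, id_word, representatives, l2_nearest), which are
  # built from `file`/`vocabfile`; it also assumes `import numpy as np` and
  # `from scipy.cluster.vq import whiten, kmeans, vq`.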
  np.random.seed((1000,2000))
  whitened = whiten(embeddings)
  codebook, distortion = kmeans(whitened, k)
  clusters = [l2_nearest(embeddings, c, representatives+1) for c in codebook]
  # output
  print(len(codebook), distortion)
  for centroid in codebook:
    print(' '.join([str(x) for x in centroid]))
  print()
  for cluster in clusters:
    print(' '.join([id_word[i] for i, d in cluster]).encode('utf-8'))
  print()
  # assign clusters to words
  codes, _ = vq(embeddings, codebook)
  for w, c in zip(word_id.keys(), codes):
    print(w, c) 
Example #2
Source File: clustering_kmeans_alternative.py From practicalDataAnalysisCookbook with GNU General Public License v2.0
def findClusters_kmeans(data):
    '''
        Cluster data using k-means
    '''
    # whiten the observations
    data_w = vq.whiten(data)

    # create the classifier object
    kmeans, labels = vq.kmeans2(
        data_w,
        k=4,
        iter=30
    )

    # return the centroids and each observation's cluster label
    return kmeans, labels
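
A hypothetical call with random data standing in for the dataset, assuming the module-level import scipy.cluster.vq as vq that the cookbook file uses:

import numpy as np
import scipy.cluster.vq as vq

centroids, labels = findClusters_kmeans(np.random.rand(200, 3))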

Example #3
Source File: segmenter.py From msaf with MIT License
def compute_labels_kmeans(fmcs, k):
    # Removing the higher frequencies seems to yield better results
    fmcs = fmcs[:, fmcs.shape[1] // 2:]

    # Pre-process
    fmcs = np.log1p(fmcs)
    wfmcs = vq.whiten(fmcs)

    # Make sure we are not using more clusters than existing segments
    if k > fmcs.shape[0]:
        k = fmcs.shape[0]

    # K-means
    kmeans = KMeans(n_clusters=k, n_init=100)
    kmeans.fit(wfmcs)

    return kmeans.labels_ 
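
A hypothetical invocation, with random data standing in for the features that segmenter.py actually passes in, and with the imports the function requires spelled out:

import numpy as np
import scipy.cluster.vq as vq
from sklearn.cluster import KMeans

labels = compute_labels_kmeans(np.random.rand(20, 16), k=4)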
Example #4
Source File: pairdist.py From link-prediction_with_deep-learning with MIT License
def argparser():
    try:
        import argparse
    except ImportError:
        import compat.argparse as argparse

    ap=argparse.ArgumentParser()
    ap.add_argument('vectors', nargs=1, metavar='FILE', help='word vectors')
    ap.add_argument('-a', '--approximate', default=False, action='store_true',
                    help='filter by approximate similarity (with -t)')
    ap.add_argument('-i', '--min-index', default=0, type=int,
                    help='index of first word (default 0)')
    ap.add_argument('-M', '--metric', default=DEFAULT_METRIC, 
                    choices=sorted(metrics.keys()),
                    help='distance metric to apply')
    ap.add_argument('-n', '--normalize', default=False, action='store_true',
                    help='normalize vectors to unit length')
    ap.add_argument('-r', '--max-rank', metavar='INT', default=None, 
                    type=int, help='only consider r most frequent words')
    ap.add_argument('-t', '--threshold', metavar='FLOAT', default=None,
                    type=float, help='only output distances <= t')
    ap.add_argument('-T', '--tolerance', metavar='FLOAT', default=0.1,
                    type=float, help='approximation tolerace (with -a)')
    ap.add_argument('-w', '--whiten', default=False, action='store_true',
                    help='normalize features to unit variance')
    ap.add_argument('-W', '--words',  default=False, action='store_true',
                    help='output words instead of indices')
    return ap 
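
A hypothetical parse, assuming the module-level metrics dict and DEFAULT_METRIC that pairdist.py defines (the vectors file name is made up):

options = argparser().parse_args(['vectors.bin', '--whiten', '-M', 'cosine'])
print(options.whiten)   # True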
Example #5
Source File: pairdist.py From link-prediction_with_deep-learning with MIT License
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening should be implemented in wvlib so that it can be used
        # together with approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options 
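
The help string above says whiten normalizes features to unit variance; a quick standalone check of that claim (not part of pairdist.py):

import numpy as np
from scipy.cluster.vq import whiten

X = np.random.rand(100, 3) * np.array([1.0, 10.0, 100.0])  # wildly different scales
assert np.allclose(whiten(X).std(axis=0), 1.0)             # every feature now has std 1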
Example #6
Source File: xmeans.py From msaf with MIT License
def run_kmeans(self, X, K):
        """Runs k-means and returns the labels assigned to the data."""
        wX = vq.whiten(X)
        means, dist = vq.kmeans(wX, K, iter=100)
        labels, dist = vq.vq(wX, means)
        return means, labels 
Example #7
Source File: xmeans.py From msaf with MIT License
def compute_bic(self, D, means, labels, K, R):
        """Computes the Bayesian Information Criterion."""
        D = vq.whiten(D)
        Rn = D.shape[0]
        M = D.shape[1]

        if R == K:
            return 1

        # Maximum likelihood estimate (MLE)
        mle_var = 0
        for k in range(len(means)):
            X = D[np.argwhere(labels == k)]
            X = X.reshape((X.shape[0], X.shape[-1]))
            for x in X:
                mle_var += distance.euclidean(x, means[k])
        mle_var /= float(R - K)

        # Log-likelihood of the data
        l_D = - Rn/2. * np.log(2*np.pi) - (Rn * M)/2. * np.log(mle_var) - \
            (Rn - K) / 2. + Rn * np.log(Rn) - Rn * np.log(R)

        # Params of BIC
        p = (K-1) + M * K + mle_var

        #print "BIC:", l_D, p, R, K

        # Return the bic
        return l_D - p / 2. * np.log(R) 
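
For reference, the last line implements a BIC score of the form below, where \hat{\ell}(D) is the log-likelihood l_D computed above, p the parameter term, and R the number of observations:

\mathrm{BIC} = \hat{\ell}(D) - \frac{p}{2}\,\log R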
Example #8
Source File: xmeans.py From msaf with MIT License
def test_kmeans(K=5):
    """Test k-means with the synthetic data."""
    X = XMeans.generate_2d_data(K=4)
    wX = vq.whiten(X)
    dic, dist = vq.kmeans(wX, K, iter=100)

    plt.scatter(wX[:, 0], wX[:, 1])
    plt.scatter(dic[:, 0], dic[:, 1], color="m")
    plt.show() 
Example #9
Source File: analysis.py From enlopy with BSD 3-Clause "New" or "Revised" License
def get_load_archetypes(Load, k=2, x='hour', y='dayofyear', plot_diagnostics=False):
    """Extract typical load profiles using k-means and vector quantization. the time scale of archetypes depend on the selected dimensions (x,y).
    For the default values daily archetypes will be extracted.

    Parameters:
        Load (pd.Series): timeseries
        k (int): number of archetypes to identify and extract
        x (str): This will define how the timeseries will be grouped by. Has to be an accessor of pd.DatetimeIndex
        y (str): similar to above for y axis.
        plot_diagnostics (bool): If true a figure is plotted showing an overview of the results
    Returns:
        np.ndarray: dimensions (k, len(x))
    """
    from scipy.cluster.vq import whiten, kmeans, vq

    df = reshape_timeseries(Load, x=x, y=y, aggfunc='mean').astype(float)
    df_white = whiten(df)
    clusters_center, __ = kmeans(df_white, k)
    clusters_center_dewhitened = clusters_center.T * np.array([df.std()] * k).T

    if plot_diagnostics:
        try:
            import matplotlib.pyplot as plt
            clusters, _ = vq(df_white, clusters_center)
            cm = _n_colors_from_colormap(k)
            ax1 = df.T.plot(legend=False, alpha=.1,
                            color=[cm[i] for i in clusters])
            # Add colored cluster centers as lines
            ax1.set_prop_cycle('color', cm)
            ax1.plot(clusters_center_dewhitened, linewidth=3, linestyle='--')
            plt.figure()  # FIXME: works only with weekdays
            day_clusters = pd.DataFrame({y: Load.resample('d').mean().index.weekday,
                                         'clusters': clusters,
                                         'val': 1})
            x_labels = "Mon Tue Wed Thu Fri Sat Sun".split()
            day_clusters.pivot_table(columns=y, index='clusters',
                                     aggfunc='count').T.plot.bar(stacked=True)
            plt.gca().set_xticklabels(x_labels)
        except Exception:  # FIXME: specify the exception to catch
            print('Works only with daily profile clustering')

    return clusters_center_dewhitened
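
One subtlety worth flagging: whiten() divides each column by its population standard deviation (np.std, i.e. ddof=0), while the pandas df.std() used above defaults to ddof=1, so the de-whitening is very slightly off for small samples. A sketch of de-whitening with the matching ddof, with df and clusters_center as defined in the function:

sigma = df.values.std(axis=0)        # population std, the one whiten() divides by
restored = clusters_center * sigma   # centroids back on the original scale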