Python scipy.cluster.vq.whiten() Examples

The following are code examples showing how to use scipy.cluster.vq.whiten(). They are taken from open source Python projects.
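
Before the project examples, here is a minimal sketch of what whiten() itself does (the array values below are made up for illustration): it rescales each column (feature) of an observation matrix to unit variance by dividing it by its standard deviation, which is the usual preprocessing step before kmeans(), kmeans2(), or vq().

import numpy as np
from scipy.cluster.vq import whiten

# Illustrative data only: three observations with three features each.
features = np.array([[1.9, 2.3, 1.7],
                     [1.5, 2.5, 2.2],
                     [0.8, 0.6, 3.0]])

# whiten() divides each column by its standard deviation so that every
# feature contributes on a comparable scale to k-means distance computations.
whitened = whiten(features)
print(whitened.std(axis=0))  # approximately [1. 1. 1.]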

Example 1
Project: GenefyHMM   Author: adit-39   File: train.py    MIT License
def vector_quantize(data_dict, vs, bins):
	# sp refers to scipy.cluster.vq (imported at module level in the original project)
	codebooks = {}
	vq_data = {}
	for size in vs.keys():
		all_size_data = []
		for disease in vs[size]:
			all_size_data.extend(data_dict[disease])
		#whitened = sp.whiten(all_size_data)
		#codebooks[size] = sp.kmeans(whitened, bins)[0]
		codebooks[size] = sp.kmeans(np.asarray(all_size_data), bins)[0]
	with open("all_codebooks.pkl", "wb") as f:
		pickle.dump(codebooks, f)
	for dis in data_dict.keys():
		n = len(data_dict[dis])
		m = len(data_dict[dis][0])
		# quantize each observation against the codebook that matches its dimensionality
		vq_data[dis] = list(map(str, sp.vq(np.reshape(data_dict[dis], (n, m)), codebooks[m])[0]))
	return vq_data
Example 2
Project: practicalDataAnalysisCookbook   Author: drabastomek   File: clustering_kmeans_alternative.py    GNU General Public License v2.0
def findClusters_kmeans(data):
    '''
        Cluster data using k-means
    '''
    # whiten the observations
    data_w = vq.whiten(data)

    # cluster the whitened observations with k-means
    # (kmeans2 returns the centroids and a label for each observation)
    kmeans, labels = vq.kmeans2(
        data_w,
        k=4,
        iter=30
    )

    # return the centroids and the cluster assignment of each observation
    return kmeans, labels

Example 3
Project: Malware-Image-Analysis   Author: skycckk   File: quantizer.py    MIT License
def cluster_all_features(feature_mat):
    """
    Run k-means to cluster the input feature vectors
    :param feature_mat: m-by-n ndarray
            M is the number of samples and N is dimensionality
    :return: dict mapping k -> (distortion, centroids)
            key: the value of k
            value: tuple (distortion, centroids), where centroids is a k-by-n ndarray
    """
    n_dims = feature_mat.shape[1]
    whitened = whiten(feature_mat.transpose())
    all_codebooks = dict()
    for k in range(n_dims, 0, -1):
        centroids, distortion = kmeans(whitened, k)
        all_codebooks[k] = (distortion, centroids)

    return all_codebooks 
Example 4
Project: senior-design   Author: james-tate   File: ex1.py    GNU General Public License v2.0
def cluster_data(data,cluster_cnt,iter=20,thresh=1e-5):
    """ Group data into a number of common clusters

        data -- 2D array of data points.  Each point is a row in the array.
        cluster_cnt -- The number of clusters to use
        iter -- number of iterations to use for kmeans algorithm
        thresh -- distortion threshold for kmeans algorithm

        return -- list of 2D arrays.  Each array contains the data points
                  that belong to a specific cluster.

        Uses kmeans algorithm to find the clusters.
    """
    wh_data = vq.whiten(data)
    code_book,dist = vq.kmeans(wh_data,cluster_cnt,iter,thresh)
    code_ids, distortion = vq.vq(wh_data,code_book)
    clusters = []
    for i in range(len(code_book)):
        # numpy's compress() selects the rows of `data` assigned to cluster i
        cluster = compress(code_ids == i, data, 0)
        clusters.append(cluster)
    return clusters 
Example 5
Project: link-prediction_with_deep-learning   Author: cambridgeltl   File: pairdist.py    MIT License
def argparser():
    try:
        import argparse
    except ImportError:
        import compat.argparse as argparse

    ap=argparse.ArgumentParser()
    ap.add_argument('vectors', nargs=1, metavar='FILE', help='word vectors')
    ap.add_argument('-a', '--approximate', default=False, action='store_true',
                    help='filter by approximate similarity (with -t)')
    ap.add_argument('-i', '--min-index', default=0, type=int,
                    help='index of first word (default 0)')
    ap.add_argument('-M', '--metric', default=DEFAULT_METRIC, 
                    choices=sorted(metrics.keys()),
                    help='distance metric to apply')
    ap.add_argument('-n', '--normalize', default=False, action='store_true',
                    help='normalize vectors to unit length')
    ap.add_argument('-r', '--max-rank', metavar='INT', default=None, 
                    type=int, help='only consider r most frequent words')
    ap.add_argument('-t', '--threshold', metavar='FLOAT', default=None,
                    type=float, help='only output distances <= t')
    ap.add_argument('-T', '--tolerance', metavar='FLOAT', default=0.1,
                    type=float, help='approximation tolerance (with -a)')
    ap.add_argument('-w', '--whiten', default=False, action='store_true',
                    help='normalize features to unit variance')
    ap.add_argument('-W', '--words',  default=False, action='store_true',
                    help='output words instead of indices')
    return ap 
Example 6
Project: link-prediction_with_deep-learning   Author: cambridgeltl   File: pairdist.py    MIT License
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening should be implemented in wvlib to support use together
        # with approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options 
Example 7
Project: Malware-Image-Analysis   Author: skycckk   File: quantizer.py    MIT License
def cluster_feature(feature_mat, k):
    """
    Apply K-means to get the clusters' centroid and distortion
    :param feature_mat: m-by-n ndarray
            M is the number of samples and N is dimensionality
    :param k: int
            Number of centroids
    :return: (centroids, distortion)
            centroids: k-by-n ndarray
            distortion: overall distortion for the k centroids
    """
    whitened = whiten(feature_mat.transpose())
    centroid, distortion = kmeans(whitened, k)

    return centroid, distortion 
Example 8
Project: segment   Author: algorithmlixuan   File: _utils.py    MIT License
def normalizeData(featureVectors, setMeanToZero, spatialWeight=1):

    means = []
    for col in range(0, len(featureVectors[0])):
        colMean = 0
        for row in range(0, len(featureVectors)):
            colMean += featureVectors[row][col]
        colMean /= len(featureVectors)
        means.append(colMean)

    for col in range(2, len(featureVectors[0])):
        for row in range(0, len(featureVectors)):
            featureVectors[row][col] -= means[col]
    copy = vq.whiten(featureVectors)
    if (setMeanToZero):
        for row in range(0, len(featureVectors)):
            for col in range(0, len(featureVectors[0])):
                copy[row][col] -= means[col]

    for row in range(0, len(featureVectors)):
        copy[row][0] *= spatialWeight
        copy[row][1] *= spatialWeight

    return copy

Example 9
Project: tomominer   Author: alberlab   File: kmeans.py    GNU General Public License v3.0
def kmeans_clustering(data, k):
  """
  TODO: add docs

  :param data:
    :param k:
  """

  from scipy.cluster.vq import kmeans, vq, whiten

  data = whiten(data)
  centroids, _ = kmeans(data, k)
  labels, _ = vq(data, centroids)

  return labels 
Example 10
Project: pdkit   Author: pdkit   File: clinical_updrs.py    MIT License
def __train(self, n_neighbors=3):
        """
            Train the classifier implementing the `k-nearest neighbors vote <http://scikit-learn.org/stable/modules/\
            generated/sklearn.neighbors.KNeighborsClassifier.html>`_

            :param n_neighbors: the number of neighbors used by the classifier
            :type n_neighbors: int
        """

        # m = self.labels.drop(['id','MDS_UPDRSIII'], axis=1).values
        # print(itemfreq(m))
        #
        # for i, row in enumerate(self.labels.drop(['id','MDS_UPDRSIII'], axis=1).values):
        #     print(np.bincount(row))

        try:
            for obs in self.observations:
                features, ids = self.__get_features_for_observation(observation=obs, skip_id=3497,
                                                                    last_column_is_id=True)
                normalised_data = whiten(features)

                x = pd.DataFrame(normalised_data)
                y = self.labels[obs].values
                # x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

                knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
                # knn.fit(x_train, y_train)
                knn.fit(x, y)

                # print('Accuracy of K-NN classifier: {:.2f}'.format(knn.score(x, y)))
                # print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(x_train, y_train)))
                # print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn.score(x_test, y_test)))
                # print('------')

                if not self.knns:
                    self.knns = [[obs, knn]]
                else:
                    self.knns.append([obs, knn])
        except IOError as e:
            ierr = "({}): {}".format(e.errno, e.strerror)
            logging.error("Error training Clinical UPDRS, file not found, I/O error %s", ierr)

        except ValueError as verr:
            logging.error("Error training Clinical UPDRS ValueError ->%s", verr)

        except:
            logging.error("Unexpected error on training Clinical UPDRS init: %s", sys.exc_info()[0]) 
Example 11
Project: pdkit   Author: pdkit   File: updrs.py    MIT License
def __train(self, n_clusters=4):
        """
            Calculate cluster's centroids and standard deviations. If there are at least the number of threshold rows \
            then:

            * Observations will be normalised.

            * Standard deviations will be returned.

            * Clusters will be returned.

            * Centroids are ordered based on their distance from an arbitrary -100, -100 point.

            If there are not enough Observations, then centroids and standard deviations will be set to the empty list.

            General strategy: Use numpy.array for calculations. Keep everything in float. Convert arrays back to lists \
            at the end.

            :param n_clusters: the number of clusters
            :type n_clusters: int
        """

        try:
            for obs in self.observations:
                features, ids = self.__get_features_for_observation(observation=obs, last_column_is_id=True)
                # the last column is the observation id
                normalised_data = whiten(features)

                # skip any rows that contain just zero values... they create nans
                first_safe_row = pdkit.utils.non_zero_index(normalised_data)
                observation_ids = features.tolist()
                sd = features[first_safe_row] / normalised_data[first_safe_row]

                # Calculate centroids and sort result
                centroids_array, _ = kmeans(normalised_data, n_clusters)
                sorted_centroids = pdkit.utils.centroid_sort(centroids_array)

                if not self.clusters:
                    self.clusters = [[obs, sd.tolist(), sorted_centroids.tolist()]]
                else:
                    self.clusters.append([obs, sd.tolist(),sorted_centroids.tolist()])
        except IOError as e:
            ierr = "({}): {}".format(e.errno, e.strerror)
            logging.error("Error training UPDRS, file not found, I/O error %s", ierr)

        except ValueError as verr:
            logging.error("Error training UPDRS ValueError ->%s", verr)

        except:
            logging.error("Unexpected error on training UPDRS init: %s", sys.exc_info()[0]) 
Example 12
Project: color-clustering   Author: thobbs   File: color_clustering.py    Apache License 2.0
def analyze(filename, num_means, rounds):
    """
    Returns a tuple of two objects:
      * A list of the means in the form [(h, s, v), ...].  Each of the
        (h, s, v) values are in the range [0, 1].
      * A list of the same length containing the number of pixels that
        are closest to the mean at the same index in the first list.
    """

    # open the image
    current_dir = os.path.dirname(os.path.realpath(__file__))
    img = Image.open(os.path.join(current_dir, filename))

    # load pixels into array
    flat_img = np.asarray(img)

    # convert from rgb to hsv (all values in range [0, 1])
    flat_img = np.apply_along_axis(
        lambda a: (a[0] / 255.0, a[1] / 255.0, a[2] / 255.0), 2, flat_img)
    flat_img = matplotlib.colors.rgb_to_hsv(flat_img)

    # reshape to an Nx3 array
    img = np.reshape(flat_img, (len(flat_img) * len(flat_img[0]), 3))

    # perform k-means clustering
    stdev = get_stdev(img)
    whitened = whiten(img)
    means, _ = kmeans(whitened, num_means, iter=rounds)
    unwhitened = means * stdev

    # map() returns an iterator in Python 3, so materialise the tuples before sorting
    unwhitened = sorted(map(tuple, unwhitened))

    # count the number of pixels that are closest to each centroid
    match_counts = [0] * len(unwhitened)
    for i, row in enumerate(flat_img):
        for a in row:
            distances = [dist(a, b) for b in unwhitened]
            min_index = distances.index(min(distances))
            match_counts[min_index] += 1

    return unwhitened, match_counts