Python scipy.cluster.vq.kmeans() Examples

The following are code examples showing how to use scipy.cluster.vq.kmeans(). They are drawn from open source Python projects.
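Before the project examples, here is a minimal self-contained sketch of the usual whiten → kmeans → vq pipeline. The data is made up and the cluster count is arbitrary; it only illustrates the call signatures.

import numpy as np
from scipy.cluster.vq import whiten, kmeans, vq

# Two synthetic 2-D blobs, 100 points each (made-up data for illustration)
rng = np.random.default_rng(0)
pts = np.vstack([rng.normal(0.0, 1.0, (100, 2)),
                 rng.normal(5.0, 1.0, (100, 2))])

whitened = whiten(pts)                      # rescale each column to unit variance
codebook, distortion = kmeans(whitened, 2)  # codebook: 2x2 array of centroids
labels, dists = vq(whitened, codebook)      # nearest-centroid index per observation

Note that kmeans() returns the centroids and the mean distortion, not per-point labels; vq() performs the assignment step.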

Example 1
Project: GenefyHMM   Author: adit-39   File: train.py    MIT License
def vector_quantize(data_dict, vs, bins):
	codebooks = {}
	vq_data = {}
	for size in vs.keys():
		all_size_data = []
		for disease in vs[size]:
			all_size_data.extend(data_dict[disease])
		#whitened = sp.whiten(all_size_data)
		#codebooks[size] = sp.kmeans(whitened, bins)[0]
		codebooks[size] = sp.kmeans(np.asarray(all_size_data), bins)[0]
	pickle.dump(codebooks,open("all_codebooks.pkl","wb"))
	for dis in data_dict.keys():
		n = len(data_dict[dis])
		m = len(data_dict[dis][0])
		vq_data[dis] = list(map(str, sp.vq(np.reshape(data_dict[dis], (n, m)), codebooks[m])[0]))  # list() needed under Python 3, where map() returns an iterator
	return vq_data 
Example 2
Project: Computable   Author: ktraunmueller   File: test_vq.py    MIT License
def test_large_features(self):
        # Generate a data set with large values, and run kmeans on it
        # (regression test for #1077).
        d = 300
        n = 100

        m1 = np.random.randn(d)
        m2 = np.random.randn(d)
        x = 10000 * np.random.randn(n, d) - 20000 * m1
        y = 10000 * np.random.randn(n, d) + 20000 * m2

        data = np.empty((x.shape[0] + y.shape[0], d), np.double)
        data[:x.shape[0]] = x
        data[x.shape[0]:] = y

        kmeans(data, 2) 
Example 3
Project: Computable   Author: ktraunmueller   File: test_vq.py    MIT License
def test_kmeans_lost_cluster(self):
        """This will cause kmean to have a cluster with no points."""
        data = np.fromfile(DATAFILE1, sep=", ")
        data = data.reshape((200, 2))
        initk = np.array([[-1.8127404, -0.67128041],
                         [2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        res = kmeans(data, initk)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            res = kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise') 
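The missing= keyword exercised above controls how kmeans2 reacts when a cluster ends up with no points. A hedged sketch of the two behaviours, with arbitrary data and deliberately bad initial centroids:

import numpy as np
from scipy.cluster.vq import kmeans2, ClusterError

data = np.random.rand(50, 2)
bad_init = np.array([[0.5, 0.5], [50.0, 50.0], [99.0, 99.0]])  # two seeds placed far from the data

centroids, labels = kmeans2(data, bad_init, missing='warn')  # warns and keeps the empty cluster
try:
    kmeans2(data, bad_init, missing='raise')                 # raises ClusterError instead
except ClusterError:
    pass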
Example 4
Project: PyME   Author: vikramsunkara   File: util_FSP.py    GNU Affero General Public License v3.0
def seperate_via_kmeans(state_space,p,K,tau=0.1):
	from scipy.cluster.vq import kmeans	
	#centres= np.floor(kmeans2(state_space,K)[0]) # these are the original lines
	
	# the following are being added as hacks
	
	#_all_cores_filled_ = False
	#while(_all_cores_filled_ == False):
		#centres, distributed = kmeans(state_space,K)
		#print("going into k means" + "we only have " + str(np.max(distributed)))
		#if np.max(distributed) == K-1:
			#_all_cores_filled_ = True
	
	# hack to make just the kmeans call work
	
	centres, stuff = kmeans(state_space,K)
	
	# hack ends here	
	
	#proportions = partition_algo_distances(state_space,centres,tau)
	proportions = partition_algo_distances_tight(state_space,centres,tau)
	sub_state_space, sub_prob = seperate_via_proportions(state_space,proportions, p)
	return sub_state_space, sub_prob, centres 
Example 5
Project: Malware-Image-Analysis   Author: skycckk   File: quantizer.py    MIT License
def cluster_all_features(feature_mat):
    """
    Run k-means to cluster the input feature vectors
    :param feature_mat: m-by-n ndarray
            M is the number of samples and N is dimensionality
    :return: dictionary<k, (distortion, centroids)>
            This dictionary tells the distortion with what centroids and what's K
            key: k value
            Value: tuple with <distortion, centroids> where centroids are k-by-n ndarray
    """
    n_dims = feature_mat.shape[1]
    whitened = whiten(feature_mat.transpose())
    all_codebooks = dict()
    for k in range(n_dims, 0, -1):
        centroids, distortion = kmeans(whitened, k)
        all_codebooks[k] = (distortion, centroids)

    return all_codebooks 
Example 6
Project: colorz   Author: metakirby5   File: colorz.py    MIT License
def colorz(fd, n=DEFAULT_NUM_COLORS, min_v=DEFAULT_MINV, max_v=DEFAULT_MAXV,
           bold_add=DEFAULT_BOLD_ADD, order_colors=True):
    """
    Get the n most dominant colors of an image.
    Clamps value to between min_v and max_v.

    Creates bold colors using bold_add.
    Total number of colors returned is 2*n, optionally ordered by hue.
    Returns as a list of pairs of RGB triples.

    For terminal colors, the hue order is:
    red, yellow, green, cyan, blue, magenta
    """
    img = Image.open(fd)
    img.thumbnail(THUMB_SIZE)

    obs = get_colors(img)
    clamped = [clamp(color, min_v, max_v) for color in obs]
    clusters, _ = kmeans(array(clamped).astype(float), n)
    colors = order_by_hue(clusters) if order_colors else clusters
    return list(zip(colors, [brighten(c, bold_add) for c in colors])) 
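A hypothetical invocation of the function above; the image path is an assumption, and the module's DEFAULT_* constants supply the remaining arguments:

# Hypothetical usage (the file name is made up)
with open('photo.jpg', 'rb') as fd:
    for base, bold in colorz(fd, n=6):
        print(base, bold)  # each pair is (RGB triple, brightened RGB triple)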
Example 7
Project: senior-design   Author: james-tate   File: test_vq.py    GNU General Public License v2.0
def test_large_features(self):
        # Generate a data set with large values, and run kmeans on it
        # (regression test for #1077).
        d = 300
        n = 100  # int, since array shapes must be integers

        m1 = np.random.randn(d)
        m2 = np.random.randn(d)
        x = 10000 * np.random.randn(n, d) - 20000 * m1
        y = 10000 * np.random.randn(n, d) + 20000 * m2

        data = np.empty((x.shape[0] + y.shape[0], d), np.double)
        data[:x.shape[0]] = x
        data[x.shape[0]:] = y

        res = kmeans(data, 2) 
Example 8
Project: senior-design   Author: james-tate   File: test_vq.py    GNU General Public License v2.0
def test_kmeans_lost_cluster(self):
        """This will cause kmean to have a cluster with no points."""
        data = np.fromfile(open(DATAFILE1), sep=", ")
        data = data.reshape((200, 2))
        initk = np.array([[-1.8127404, -0.67128041],
                         [ 2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        res = kmeans(data, initk)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            res = kmeans2(data, initk, missing='warn')

        try:
            res = kmeans2(data, initk, missing='raise')
            raise AssertionError("Exception not raised! Should not happen")
        except ClusterError:
            pass
Example 9
Project: senior-design   Author: james-tate   File: test_vq.py    GNU General Public License v2.0
def test_kmeans_0k(self):
        """Regression test for #546: fail when k arg is 0."""
        try:
            kmeans(X, 0)
            raise AssertionError("kmeans with 0 clusters should fail.")
        except ValueError:
            pass

        try:
            kmeans2(X, 0)
            raise AssertionError("kmeans2 with 0 clusters should fail.")
        except ValueError:
            pass

        try:
            kmeans2(X, np.array([]))
            raise AssertionError("kmeans2 with 0 clusters should fail.")
        except ValueError:
            pass 
Example 10
Project: senior-design   Author: james-tate   File: ex1.py    GNU General Public License v2.0
def cluster_data(data,cluster_cnt,iter=20,thresh=1e-5):
    """ Group data into a number of common clusters

        data -- 2D array of data points.  Each point is a row in the array.
        cluster_cnt -- The number of clusters to use
        iter -- number of iterations to use for kmeans algorithm
        thresh -- distortion threshold for kmeans algorithm

        return -- list of 2D arrays.  Each array contains the data points
                  that belong to a specific cluster.

        Uses kmeans algorithm to find the clusters.
    """
    wh_data = vq.whiten(data)
    code_book,dist = vq.kmeans(wh_data,cluster_cnt,iter,thresh)
    code_ids, distortion = vq.vq(wh_data,code_book)
    clusters = []
    for i in range(len(code_book)):
        cluster = compress(code_ids == i,data,0)
        clusters.append(cluster)
    return clusters 
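A usage sketch for the helper above, with made-up data; each returned array holds the original (unwhitened) points of one cluster:

import numpy as np

# Made-up 2-D data: three loose groups
offsets = [(0, 0), (5, 5), (0, 5)]
data = np.vstack([np.random.randn(50, 2) + off for off in offsets])
clusters = cluster_data(data, cluster_cnt=3)
for i, c in enumerate(clusters):
    print('cluster %d: %d points' % (i, len(c)))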
Example 11
Project: NIPS2015   Author: sparseMCMC   File: image_demo.py    GNU General Public License v2.0
def load_data(seed, ntrain, datasetName, num_inducing):
    d = io.loadmat('benchmarks.mat')[datasetName][0, 0]
    x, y = d[0], d[1]
    y = np.where(y == 1, 1, 0)  # data is stored as +-1, we use 1, 0

    # split into train, test sets
    np.random.seed(seed)
    index = np.random.permutation(x.shape[0])
    itrain, itest = index[:ntrain], index[ntrain:]
    xtrain, xtest = x[itrain], x[itest]
    ytrain, ytest = y[itrain], y[itest]

    # normalize using training data mean, std
    xmean, xstd = xtrain.mean(0), xtrain.std(0)
    xstd = np.where(xstd > 1e-6, xstd, 1.)
    xtrain, xtest = (xtrain-xmean)/xstd, (xtest-xmean)/xstd
    Z, _ = scipy_kmeans(xtrain, num_inducing)
    return dict(Xtrain=xtrain, Ytrain=ytrain, Xtest=xtest, Ytest=ytest, Z=Z) 
Example 12
Project: ColorRecognition   Author: bijilap   File: ColorDetector.py    Apache License 2.0
def analyzeImage(self, image):
        color_band = scipy.misc.fromimage(image)  # removed in SciPy 1.2; np.asarray(image) is the modern equivalent
        shape = color_band.shape
        color_band = color_band.reshape(scipy.product(shape[:2]), shape[2])

        self.log('generating clusters')
        codes, dist = kmeans(color_band, self.NUM_OF_CLUSTERS)
        self.log('Here are the cluster centres:')
        self.log(codes)

        vecs, dist = vq(color_band, codes)         # assign codes
        counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences

        return (codes, counts) 
Example 13
Project: peri   Author: peri-source   File: tiling.py    MIT License
def cluster_tiles_by_volume(tiles, volumes, nclusters, nattempts, max_pix=None):
    max_pix = max_pix or np.inf

    # cluster the tiles by volume
    logvol = np.log10(volumes)
    centers = vq.kmeans(logvol, nclusters)[0]
    labels = vq.vq(logvol, centers)[0]
    ids = np.arange(len(centers))  # one id per centroid; arange(labels.max()) dropped the last one

    # get the centers in order so we can walk down the list
    centers, ids = (
        list(t) for t in zip(*sorted(zip(centers, ids), reverse=True))
    )

    # get the groups that are viable based on memory constraints
    grouped_labels = [
        labels[labels==i] for i in ids if volumes[labels==i].max() < max_pix
    ]
    return grouped_labels
    """

    # do hierarchical clustering starting with the largest sizes
    for 
    grp = groups[centers.argmax()]
    return tiles, volumes
    """ 
Example 14
Project: DominantColor   Author: michaelrhyndress   File: DominantColor.py    GNU Lesser General Public License v3.0
def generate_scalar(self):
        """
        breaks the image into a multi-dimensional array -> (ar)
        then breaks the rows up by NUM_CLUSTERS & color -> (codes)
        """
        ar = fromimage(self.im)
        shape = ar.shape
        ar = ar.reshape(product(shape[:2]), shape[2])
        float_ar = ar+0.0
        codes, dist = kmeans(float_ar, self.NUM_CLUSTERS)
        return ar, codes 
Example 15
Project: Computable   Author: ktraunmueller   File: test_vq.py    MIT License
def test_kmeans_simple(self):
        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
        code = initc.copy()
        code1 = kmeans(X, code, iter=1)[0]

        assert_array_almost_equal(code1, CODET2) 
Example 16
Project: Computable   Author: ktraunmueller   File: test_vq.py    MIT License
def test_kmeans_0k(self):
        """Regression test for #546: fail when k arg is 0."""
        assert_raises(ValueError, kmeans, X, 0)
        assert_raises(ValueError, kmeans2, X, 0)
        assert_raises(ValueError, kmeans2, X, np.array([])) 
Example 17
Project: ControllableVideoGen   Author: zekunhao1995   File: rp_reader.py    Apache License 2.0
def filter_trajs_kmeans(trajs, num_centroids):
    num_trajs = trajs.shape[0]
    len_trajs = trajs.shape[1]
    traj_vec_stor = np.empty((num_trajs, (len_trajs-1)*2), np.float32)
    disp_stor = np.empty((num_trajs,), np.float32)
        
    for ii in range(num_trajs):
        traj = trajs[ii,:,:]  # n-by-2
        traj_vec_stor[ii,:] = (traj[1:,:] - traj[0,:]).flatten()  # subtract start point
        disp_stor[ii] = np.sum(np.sqrt(np.sum((traj[1:,:]-traj[0:-1,:])**2,1)))
    # Remove trajectories that have very low displacement
    good_trajs = np.flatnonzero(disp_stor>0.4)
    traj_vec_stor = traj_vec_stor[good_trajs,:]
    
    if traj_vec_stor.shape[0] < num_centroids: # too few points
        #print("kmeans: TOO FEW USABLE KEYPOINTS")
        return good_trajs[np.arange(traj_vec_stor.shape[0])]  # use all of them; arange(0, n-1) dropped the last one
        
    # k-means on vectors
    #num_centroids = 10
    #centroids,_ = kmeans(traj_vec_stor,k_or_guess=num_centroids, iter=100)
    centroids, _ = kmeans(traj_vec_stor, num_centroids, iter=20)  # kmeans returns (codebook, mean distortion), not labels
    
    # Sample
    # Find the nearest vectors to centroids
    rep = np.argmin(np.sum((traj_vec_stor[:,np.newaxis,:]-centroids[:,:])**2,2),0)  # one index per centroid
    
    rep = good_trajs[rep]
    
    return rep # return the index of K most representative trajectories 
Example 18
Project: ControllableVideoGen   Author: zekunhao1995   File: ucf_reader.py    Apache License 2.0
def filter_trajs_kmeans(trajs, num_centroids):
    num_trajs = trajs.shape[0]
    len_trajs = trajs.shape[1]
    traj_vec_stor = np.empty((num_trajs, (len_trajs-1)*2), np.float32)
    disp_stor = np.empty((num_trajs,), np.float32)
        
    for ii in range(num_trajs):
        traj = trajs[ii,:,:]  # n-by-2
        traj_vec_stor[ii,:] = (traj[1:,:] - traj[0,:]).flatten()  # subtract start point
        disp_stor[ii] = np.sum(np.sqrt(np.sum((traj[1:,:]-traj[0:-1,:])**2,1)))
    # Remove trajectories that have very low displacement
    good_trajs = np.flatnonzero(disp_stor>0.4)
    traj_vec_stor = traj_vec_stor[good_trajs,:]
    
    if traj_vec_stor.shape[0] < num_centroids: # too few points
        #print("kmeans: TOO FEW USABLE KEYPOINTS")
        return good_trajs[np.arange(traj_vec_stor.shape[0])]  # use all of them; arange(0, n-1) dropped the last one
        
    # k-means on vectors
    #num_centroids = 10
    #centroids,_ = kmeans(traj_vec_stor,k_or_guess=num_centroids, iter=100)
    centroids, _ = kmeans(traj_vec_stor, num_centroids, iter=20)  # kmeans returns (codebook, mean distortion), not labels
    
    # Sample
    # Find the nearest vectors to centroids
    rep = np.argmin(np.sum((traj_vec_stor[:,np.newaxis,:]-centroids[:,:])**2,2),0)  # one index per centroid
    
    rep = good_trajs[rep]
    
    return rep # return the index of K most representative trajectories 
Example 19
Project: ControllableVideoGen   Author: zekunhao1995   File: kitti_reader.py    Apache License 2.0
def filter_trajs_kmeans(trajs, num_centroids):
    num_trajs = trajs.shape[0]
    len_trajs = trajs.shape[1]
    traj_vec_stor = np.empty((num_trajs, (len_trajs-1)*2), np.float32)
    disp_stor = np.empty((num_trajs,), np.float32)
        
    for ii in range(num_trajs):
        traj = trajs[ii,:,:]  # n-by-2
        traj_vec_stor[ii,:] = (traj[1:,:] - traj[0,:]).flatten()  # subtract start point
        disp_stor[ii] = np.sum(np.sqrt(np.sum((traj[1:,:]-traj[0:-1,:])**2,1)))
    # Remove trajectories that have very low displacement
    good_trajs = np.flatnonzero(disp_stor>0.4)
    traj_vec_stor = traj_vec_stor[good_trajs,:]
    
    if traj_vec_stor.shape[0] < num_centroids: # too few points
        #print("kmeans: TOO FEW USABLE KEYPOINTS")
        return good_trajs[np.arange(traj_vec_stor.shape[0])]  # use all of them; arange(0, n-1) dropped the last one
        
    # k-means on vectors
    #num_centroids = 10
    #centroids,_ = kmeans(traj_vec_stor,k_or_guess=num_centroids, iter=100)
    centroids, _ = kmeans(traj_vec_stor, num_centroids, iter=20)  # kmeans returns (codebook, mean distortion), not labels
    
    # Sample
    # Find the nearest vectors to centroids
    rep = np.argmin(np.sum((traj_vec_stor[:,np.newaxis,:]-centroids[:,:])**2,2),0)  # one index per centroid
    
    rep = good_trajs[rep]
    
    return rep # return the index of K most representative trajectories 
Example 20
Project: ControllableVideoGen   Author: zekunhao1995   File: batch_process_dataset.py    Apache License 2.0
def filter_trajs_kmeans(trajs, dec_frames, num_centroids):
    num_trajs = len(trajs)
    traj_vec_stor = np.empty((num_trajs, (dec_frames-1)*2), np.float32)
    disp_stor = np.empty((num_trajs,), np.float32)
        
    for ii in range(num_trajs):
        traj = trajs[ii,0:dec_frames,:]  # n-by-2
        traj_vec_stor[ii,:] = (traj[1:,:] - traj[0,:]).flatten()  # subtract start point
        disp_stor[ii] = np.sum(np.sqrt(np.sum((traj[1:,:]-traj[0:-1,:])**2,1)))
    # Remove trajectories that have very low displacement
    good_trajs = np.flatnonzero(disp_stor>0.4)
    traj_vec_stor = traj_vec_stor[good_trajs,:]
    
    if traj_vec_stor.shape[0] < num_centroids: # too few points
        print("kmeans: TOO FEW USABLE KEYPOINTS")
        return good_trajs[np.arange(traj_vec_stor.shape[0])]  # use all of them; arange(0, n-1) dropped the last one
        
    # k-means on vectors
    #num_centroids = 10
    #centroids,_ = kmeans(traj_vec_stor,k_or_guess=num_centroids, iter=100)
    centroids,_ = kmeans(traj_vec_stor,num_centroids, iter=100)
    
    # Find the nearest vectors to centroids
    rep = np.argmin(np.sum((traj_vec_stor[:,np.newaxis,:]-centroids[:,:])**2,2),0)  # one index per centroid
    
    rep = good_trajs[rep]
    
    return rep # return the index of K most representative trajectories
    
# ==========================================================================

# This time we don't do clustering
# Setting parameters 
Example 21
Project: simple-linear-regression   Author: williamd4112   File: preprocess.py    MIT License
def kmeans(x, k):
    # assumes: from scipy.cluster.vq import kmeans as _kmeans, vq
    centroids, dist = _kmeans(x, k)
    idx, _ = vq(x, centroids)
    return idx, centroids, dist
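Because this wrapper reuses the name kmeans, it shadows SciPy's function inside the module and returns labels, centroids, and distortion in one call. A quick sketch with made-up data:

import numpy as np

x = np.random.rand(100, 3)
idx, centroids, dist = kmeans(x, 5)  # the wrapper above, not scipy.cluster.vq.kmeans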
Example 22
Project: NoteShrinker   Author: ghandic   File: noteshrinker.py    MIT License
def set_palette(self, samples, kmeans_iter=40):
        '''Extract the palette for the set of sampled RGB values. The first
        palette entry is always the background color; the rest are determined
        from foreground pixels by running K-means clustering. Returns the
        palette, as well as a mask corresponding to the foreground pixels.'''

        self.bg_color = get_bg_color(samples)

        self.fg_mask = get_fg_mask(self.bg_color, samples, self.value_threshold, self.sat_threshold)

        self.centers, _ = kmeans(self.samples[self.fg_mask].astype(np.float32),
                            self.num_colors - 1,
                            iter=kmeans_iter)

        self.palette = np.vstack((self.bg_color, self.centers)).astype(np.uint8) 
Example 23
Project: kusanagi   Author: mcgillmrl   File: SPGP.py    MIT License
def init_pseudo_inputs(self):
        msg = "Dataset must have more than n_inducing [ %n ] to enable"
        msg += " inference with sparse pseudo inputs"
        assert self.N >= self.n_inducing, msg % (self.n_inducing)
        self.should_recompile = True
        # pick initial cluster centers from dataset
        X = self.X.get_value()
        X_sp_ = utils.kmeanspp(X, self.n_inducing)

        # perform kmeans to get initial cluster centers
        utils.print_with_stamp('Initialising pseudo inputs', self.name)
        X_sp_, dist = kmeans(X, X_sp_, iter=200, thresh=1e-9)
        # initialize symbolic tensor variable if necessary
        # (this will create the self.X_sp attribute)
        self.set_params({'X_sp': X_sp_}) 
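This example leans on the fact that kmeans() accepts an array of initial centroids in place of an integer k. A minimal sketch (data and seeding are made up; recent SciPy ignores iter when an initial codebook array is supplied):

import numpy as np
from scipy.cluster.vq import kmeans

X = np.random.rand(200, 4)
guess = X[np.random.choice(len(X), 10, replace=False)]  # stand-in for the kmeanspp seeding used above
centers, distortion = kmeans(X, guess, iter=200, thresh=1e-9)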
Example 24
Project: dvt   Author: distant-viewing   File: cielab.py    GNU General Public License v2.0
def _get_cielab_dominant(img, num_dominant):
    img_flat = img.reshape(-1, 3).astype(float32)

    # increasing iter would give 'better' clustering, at the cost of speed
    dominant_colors, _ = kmeans(img_flat, num_dominant, iter=5)

    if dominant_colors.shape[0] != num_dominant:         # pragma: no cover
        diff = num_dominant - dominant_colors.shape[0]
        dominant_colors = vstack([
            dominant_colors,
            zeros((diff, dominant_colors.shape[1]))
        ])

    return dominant_colors.astype(uint8) 
Example 25
Project: Malware-Image-Analysis   Author: skycckk   File: quantizer.py    MIT License
def cluster_feature(feature_mat, k):
    """
    Apply K-means to get the clusters' centroid and distortion
    :param feature_mat: m-by-n ndarray
            M is the number of samples and N is dimensionality
    :param k: int
            Number of centroids
    :return: <centroids, distortion>
            centroids: k-by-n ndarray
            distortion: overall distortion for k centroids
    """
    whitened = whiten(feature_mat.transpose())
    centroid, distortion = kmeans(whitened, k)

    return centroid, distortion 
Example 26
Project: color-clustering   Author: thobbs   File: color_clustering.py    Apache License 2.0
def parse_options():
    parser = OptionParser()
    parser.add_option('-k', '--kmeans', type='int', default=36,
                      help='number of means for clustering [default: %default]')
    parser.add_option('-r', '--rounds', type='int', default=10,
                      help='number of clustering rounds; higher values increase accuracy [default: %default]')
    parser.add_option('-s', '--size', type='int', default=1000,
                      help='Output image size in pixels (NxN) [default: %default]')
    parser.add_option('-f', '--filename', default=None,
                      help='Output file')

    options, args = parser.parse_args()

    if options.kmeans <= 0:
        print("--kmeans must have a positive value", file=sys.stderr)
        sys.exit(1)
    if options.rounds <= 0:
        print("--rounds must have a positive value", file=sys.stderr)
        sys.exit(1)
    if options.size <= 0:
        print("--size must have a positive value", file=sys.stderr)
        sys.exit(1)
    if len(args) > 1:
        print("Expected one argument, but got %d" % len(args), file=sys.stderr)
        sys.exit(1)

    return options, args 
Example 27
Project: color-clustering   Author: thobbs   File: color_clustering.py    Apache License 2.0
def main():
    options, args = parse_options()
    means, match_counts = analyze(args[0], options.kmeans, options.rounds)

    figure, ax = prep_figure(options.size)
    draw_color_patches(means, match_counts, options.size, ax)

    if options.filename is None:
        outfile = create_outfile_name(args[0])
    else:
        outfile = options.filename

    save_file(figure, options.size, outfile) 
Example 28
Project: senior-design   Author: james-tate   File: test_vq.py    GNU General Public License v2.0
def test_kmeans_simple(self):
        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
        code = initc.copy()
        code1 = kmeans(X, code, iter=1)[0]

        assert_array_almost_equal(code1, CODET2) 
Example 29
Project: tomominer   Author: alberlab   File: kmeans.py    GNU General Public License v3.0
def kmeans_clustering(data, k):
  """
  TODO: add docs

  :param data:
    :param k:
  """

  from scipy.cluster.vq import kmeans, vq, whiten

  data = whiten(data)
  centroids, _ = kmeans(data, k)
  labels,  _ = vq(data, centroids)

  return labels 
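A short usage sketch for the helper above, with arbitrary data:

import numpy as np

pts = np.random.rand(60, 5)
labels = kmeans_clustering(pts, 4)  # one cluster index per row of pts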
Example 30
Project: NIPS2015   Author: sparseMCMC   File: setup_multiclass.py    GNU General Public License v2.0
def select_Z(dataset, OPTION_NZ):
    from scipy.cluster.vq import kmeans as scipy_kmeans
    
    np.random.seed(seed=149221)
    Z, _ = scipy_kmeans(dataset.xtrain, OPTION_NZ)

    return (Z)
## ******************************


## ********** Create and optimise GP classifier 
Example 31
Project: NIPS2015   Author: sparseMCMC   File: setup_image.py    GNU General Public License v2.0
def select_Z(dataset, OPTION_NZ):
    from scipy.cluster.vq import kmeans as scipy_kmeans
    
    np.random.seed(seed=149221)
    Z, _ = scipy_kmeans(dataset.xtrain, OPTION_NZ)

    return (Z)
## ******************************


## ********** Create and optimise GP classifier 
Example 32
Project: NIPS2015   Author: sparseMCMC   File: setup_coal.py    GNU General Public License v2.0
def select_Z(dataset, OPTION_NZ):
    from scipy.cluster.vq import kmeans as scipy_kmeans
    
    np.random.seed(seed=149221)
    Z, _ = scipy_kmeans(dataset.xtrain, OPTION_NZ)

    return (Z)
## ******************************


## ********** Create and optimise GP classifier 
Example 33
Project: crikey   Author: kastnerkyle   File: kdllib.py    BSD 3-Clause "New" or "Revised" License
def fetch_fruitspeech_softmax():
    fs, d, wav_names = fetch_sample_speech_fruit()
    def matcher(name):
        return name.split("/")[1]

    classes = [matcher(wav_name) for wav_name in wav_names]
    all_chars = [c for c in sorted(list(set("".join(classes))))]
    char2code = {v: k for k, v in enumerate(all_chars)}
    vocabulary_size = len(char2code.keys())
    y = []
    for n, cl in enumerate(classes):
        y.append(tokenize_ind(cl, char2code))

    # Is it kosher to kmeans on all the data?
    X, _apply, _re = apply_lpc_softmax_preproc(d)

    """
    for n, Xi in enumerate(X[::8]):
        di = _re(Xi)
        wavfile.write("t_%i.wav" % n, fs, soundsc(di))

    raise ValueError()
    """

    speech = {}
    speech["vocabulary_size"] = vocabulary_size
    speech["vocabulary"] = char2code
    speech["sample_rate"] = fs
    speech["data"] = X
    speech["target"] = y
    speech["reconstruct"] = _re
    return speech 
Example 34
Project: pdkit   Author: pdkit   File: updrs.py    MIT License
def __train(self, n_clusters=4):
        """
            Calculate cluster centroids and standard deviations. If there are at least the threshold number of \
            rows, then:

            * Observations will be normalised.

            * Standard deviations will be returned.

            * Clusters will be returned.

            * Centroids are ordered based on their distance from an arbitrary -100, -100 point.

            If there are not enough Observations, then centroids and standard deviations will be set to the empty list.

            General strategy: Use numpy.array for calculations. Keep everything in float. Convert arrays back to lists \
            at the end.

            :param n_clusters: the number of clusters
            :type n_clusters: int
        """

        try:
            for obs in self.observations:
                features, ids = self.__get_features_for_observation(observation=obs, last_column_is_id=True)
                # the last column is the observation id
                normalised_data = whiten(features)

                # skip any rows that contain just zero values... they create nans
                first_safe_row = pdkit.utils.non_zero_index(normalised_data)
                observation_ids = features.tolist()
                sd = features[first_safe_row] / normalised_data[first_safe_row]

                # Calculate centroids and sort result
                centroids_array, _ = kmeans(normalised_data, n_clusters)
                sorted_centroids = pdkit.utils.centroid_sort(centroids_array)

                if not self.clusters:
                    self.clusters = [[obs, sd.tolist(), sorted_centroids.tolist()]]
                else:
                    self.clusters.append([obs, sd.tolist(),sorted_centroids.tolist()])
        except IOError as e:
            ierr = "({}): {}".format(e.errno, e.strerror)
            logging.error("Error training UPDRS, file not found, I/O error %s", ierr)

        except ValueError as verr:
            logging.error("Error training UPDRS ValueError ->%s", verr.message)

        except Exception:
            logging.error("Unexpected error on training UPDRS init: %s", sys.exc_info()[0]) 
Example 35
Project: color-clustering   Author: thobbs   File: color_clustering.py    Apache License 2.0
def analyze(filename, num_means, rounds):
    """
    Returns a tuple of two objects:
      * A list of the means in the form [(h, s, v), ...].  Each of the
        (h, s, v) values are in the range [0, 1].
      * A list of the same length containing the number of pixels that
        are closest to the mean at the same index in the first list.
    """

    # open the image
    current_dir = os.path.dirname(os.path.realpath(__file__))
    img = Image.open(os.path.join(current_dir, filename))

    # load pixels into array
    flat_img = np.asarray(img)

    # convert from rgb to hsv (all values in range [0, 1])
    flat_img = np.apply_along_axis(
        lambda a: (a[0] / 255.0, a[1] / 255.0, a[2] / 255.0), 2, flat_img)
    flat_img = matplotlib.colors.rgb_to_hsv(flat_img)

    # reshape to an Nx3 array
    img = np.reshape(flat_img, (len(flat_img) * len(flat_img[0]), 3))

    # perform k-means clustering
    stdev = get_stdev(img)
    whitened = whiten(img)
    means, _ = kmeans(whitened, num_means, iter=rounds)
    unwhitened = means * stdev

    unwhitened = sorted(map(tuple, unwhitened))  # sorted() also materializes the Python 3 map iterator

    # count the number of pixels that are closest to each centroid
    match_counts = [0] * len(unwhitened)
    for i, row in enumerate(flat_img):
        for a in row:
            distances = [dist(a, b) for b in unwhitened]
            min_index = distances.index(min(distances))
            match_counts[min_index] += 1

    return unwhitened, match_counts 
Example 36
Project: attributeBasedClustering   Author: eduard-kazakov   File: attribute_based_clustering_dialog.py    GNU General Public License v2.0
def kmeansClustering(self, vectorLayer, attributesList, normalize, clusterNumber, iterations, threshold, outputFieldName):
        from scipy.cluster.vq import kmeans, vq
        from numpy import array

        fullObjectsList = []
        features = vectorLayer.getFeatures()

        for feature in features:
            fullObjectsList.append([])
            for attribute in attributesList:
                if feature[attribute[0]]:
                    fullObjectsList[len(fullObjectsList) - 1].append(feature[attribute[0]])
                else:
                    fullObjectsList[len(fullObjectsList) - 1].append(0)

        # NORMALIZING
        if normalize:
            i = 0
            maxValues = []
            while i < len(attributesList):
                maxValues.append(max(abs(item[i]) for item in fullObjectsList))
                i += 1

            j = 0
            while j < len(fullObjectsList):
                i = 0
                while i < len(fullObjectsList[j]):
                    fullObjectsList[j][i] = (fullObjectsList[j][i] * 1.0) / (maxValues[i] * 1.0)
                    i += 1
                j += 1

        data = array(fullObjectsList)

        centroids, _ = kmeans(data, clusterNumber, iter=iterations, thresh=threshold)
        idx, _ = vq(data, centroids)
        idx = idx.tolist()

        vectorLayerDataProvider = vectorLayer.dataProvider()

        # Create the field if it does not exist
        if vectorLayer.fields().indexFromName(outputFieldName) == -1:
            vectorLayerDataProvider.addAttributes([QgsField(outputFieldName, QVariant.Int)])

        vectorLayer.updateFields()
        vectorLayer.startEditing()
        attrIdx = vectorLayer.fields().indexFromName(outputFieldName)
        features = vectorLayer.getFeatures()

        i = 0
        for feature in features:
            vectorLayer.changeAttributeValue(feature.id(), attrIdx, int(idx[i]))
            i += 1

        vectorLayer.updateFields()
        vectorLayer.commitChanges()

        # Performs K-Means clustering. Updates vector layer field.