Python sklearn.cluster Examples

The following are 30 code examples of the sklearn.cluster module, drawn from open-source projects. Each example notes its original project, source file, and license. You may also want to check out all available functions and classes of the sklearn module, or try the search function.
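Before the project-specific examples, here is a minimal, self-contained sketch of the module's most common entry point, sklearn.cluster.KMeans; the data and parameter values below are illustrative only.

import numpy as np
from sklearn.cluster import KMeans

# Illustrative data: 100 random points in 2-D.
X = np.random.RandomState(0).rand(100, 2)

# Fit k-means with 3 clusters and read back the assignments and centroids.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
print(kmeans.labels_[:10])      # cluster index per sample
print(kmeans.cluster_centers_)  # one centroid per cluster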
Example #1
Source File: cluster.py    From acai with Apache License 2.0
def error(cluster, target_cluster, k):
    """ Compute error between cluster and target cluster
    :param cluster: proposed cluster
    :param target_cluster: target cluster
    :return: error
    """
    n = np.shape(target_cluster)[0]
    M = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            M[i][j] = np.sum(np.logical_and(cluster == i, target_cluster == j))
    m = Munkres()
    indexes = m.compute(-M)
    corresp = []
    for i in range(k):
        corresp.append(indexes[i][1])
    pred_corresp = [corresp[int(predicted)] for predicted in cluster]
    acc = np.sum(pred_corresp == target_cluster) / float(len(target_cluster))
    return acc 
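A hedged usage sketch for error(): the labels below are made up, and the function assumes numpy (np) and the munkres package are imported as in the original module. It matches predicted cluster ids to target ids with the Hungarian algorithm and reports the resulting accuracy.

import numpy as np
from munkres import Munkres

# Hypothetical labels for 6 samples and k=2 clusters; ids are permuted between the two clusterings.
predicted = np.array([0, 0, 1, 1, 1, 0])
target = np.array([1, 1, 0, 0, 0, 1])
print(error(predicted, target, k=2))  # 1.0: the clusterings agree up to relabeling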
Example #2
Source File: dbscan.py    From link-prediction_with_deep-learning with MIT License
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options 
Example #3
Source File: auxiliaries.py    From Deep-Metric-Learning-Baselines with Apache License 2.0
def run_kmeans(features, n_cluster):
    """
    Run kmeans on a set of features to find <n_cluster> cluster.

    Args:
        features:  np.ndarrary [n_samples x embed_dim], embedding training/testing samples for which kmeans should be performed.
        n_cluster: int, number of cluster.
    Returns:
        cluster_assignments: np.ndarray [n_samples x 1], per sample provide the respective cluster label it belongs to.
    """
    n_samples, dim = features.shape
    kmeans = faiss.Kmeans(dim, n_cluster)
    kmeans.n_iter, kmeans.min_points_per_centroid, kmeans.max_points_per_centroid = 20,5,1000000000
    kmeans.train(features)
    _, cluster_assignments = kmeans.index.search(features,1)
    return cluster_assignments 
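A usage sketch, assuming faiss is installed and the features are float32 (which faiss expects); the embeddings below are random placeholders.

import numpy as np

# Illustrative embeddings: 1000 samples with 128-dimensional features.
features = np.random.rand(1000, 128).astype(np.float32)
cluster_assignments = run_kmeans(features, n_cluster=10)
print(cluster_assignments.shape)  # (1000, 1): one cluster id per sample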
Example #4
Source File: cluster.py    From acai with Apache License 2.0
def cluster(train_latents, train_labels, test_latents, test_labels):
    num_classes = np.shape(train_labels)[-1]
    labels_hot = np.argmax(test_labels, axis=-1)
    train_latents = np.reshape(train_latents,
                               newshape=[train_latents.shape[0], -1])
    test_latents = np.reshape(test_latents,
                              newshape=[test_latents.shape[0], -1])
    kmeans = KMeans(init='random', n_clusters=num_classes,
                    random_state=0, max_iter=1000, n_init=FLAGS.n_init,
                    n_jobs=FLAGS.n_jobs)
    kmeans.fit(train_latents)
    print(kmeans.cluster_centers_)
    print('Train/Test k-means objective = %.4f / %.4f' %
          (-kmeans.score(train_latents), -kmeans.score(test_latents)))
    print('Train/Test accuracy %.4f / %.3f' %
          (error(np.argmax(train_labels, axis=-1), kmeans.predict(train_latents), k=num_classes),
           error(np.argmax(test_labels, axis=-1), kmeans.predict(test_latents), k=num_classes)))
    return error(labels_hot, kmeans.predict(test_latents), k=num_classes) 
Example #5
Source File: cluster.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def __call__(self, features: np.array, term_index: list, use_tfidf: bool = True, **options):
        """
        Call the activated class instance to cluster the data.
        :param features: np.array - term frequency matrix
        :param term_index:  list - list of term frequency matrix indexes
        :param use_tfidf: bool - whether to use TF IDF Transformer
        :param options: **dict - unpacked cluster algorithm options
        :return: ClusterEngine instance with attributes listed in __init__
        """
        self.features = features
        self.term_index = term_index
        self.num_records = features.shape[0]
        self.use_tfidf = use_tfidf
        self.user_options = options
        self.n_clusters = options.get('n_clusters')
        self.cluster_model = self.get_model()
        return self.cluster() 
Example #6
Source File: main2.py    From msaf with MIT License
def cluster(evecs, Cnorm, k, in_bound_idxs=None):
    X = evecs[:, :k] / (Cnorm[:, k - 1:k] + 1e-5)
    KM = sklearn.cluster.KMeans(n_clusters=k, n_init=50, max_iter=500)
    seg_ids = KM.fit_predict(X)

    ###############################################################
    # Locate segment boundaries from the label sequence
    if in_bound_idxs is None:
        bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])

        # Count beat 0 as a boundary
        bound_idxs = librosa.util.fix_frames(bound_beats, x_min=0)
    else:
        bound_idxs = in_bound_idxs

    # Compute the segment label for each boundary
    bound_segs = list(seg_ids[bound_idxs])

    # Tack on the end-time
    bound_idxs = list(np.append(bound_idxs, len(Cnorm) - 1))

    return bound_idxs, bound_segs 
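The boundary-detection step above can be illustrated in isolation; the per-beat label sequence below is made up.

import numpy as np

# Illustrative per-beat segment labels; a boundary is placed wherever the label changes.
seg_ids = np.array([0, 0, 1, 1, 1, 2, 2])
bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])
print(bound_beats)  # [2 5]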
Example #7
Source File: main2.py    From msaf with MIT License
def do_segmentation(C, M, config, in_bound_idxs=None):
    embedding = embed_beats(C, M, config)
    Cnorm = np.cumsum(embedding ** 2, axis=1) ** 0.5

    if config["hier"]:
        est_idxs = []
        est_labels = []
        for k in range(1, config["num_layers"] + 1):
            est_idx, est_label = cluster(embedding, Cnorm, k)
            est_idxs.append(est_idx)
            est_labels.append(np.asarray(est_label, dtype=int))

    else:
        est_idxs, est_labels = cluster(embedding, Cnorm, config["scluster_k"], in_bound_idxs)
        est_labels = np.asarray(est_labels, dtype=int)

    return est_idxs, est_labels, Cnorm 
Example #8
Source File: main.py    From scTDA with GNU General Public License v3.0
def save(self, name, resolution, gain, equalize=True, cluster='agglomerative', statistics='db', max_K=5):
        """
        Generates a topological representation using the Mapper algorithm with resolution and gain specified by the
        parameters 'resolution' and 'gain'. When equalize is set to True, patches are chosen such that they
        contain the same number of points. The parameter 'cluster' specifies the clustering method ('agglomerative' or
        'kmeans'). The parameter 'statistics' specifies the criterion for choosing the optimal number of clusters
        ('db' for Davies-Bouldin index, or 'gap' for the gap statistic). The parameter 'max_K' specifies the maximum
        number of clusters to be considered within each patch. The topological representation is stored in the files
        'name.gexf' and 'name.json'. It returns a dictionary with the patches.
        """
        G, all_clusters, patches = sakmapper.mapper_graph(self.df, lens_data=self.lens_data_mds,
                                                          resolution=resolution,
                                                          gain=gain, equalize=equalize, clust=cluster,
                                                          stat=statistics, max_K=max_K)
        dic = {}
        for n, rs in enumerate(all_clusters):
            dic[str(n)] = [int(x) for x in rs]
        with open(name + '.json', 'w') as handle3:
            json.dump(dic, handle3)
        networkx.write_gexf(G, name + '.gexf')
        return patches 
Example #9
Source File: main.py    From scTDA with GNU General Public License v3.0
def cellular_subpopulations(self, threshold=0.05, min_cells=5, clus_thres=0.65):
        """
        Identifies potential transient cellular subpopulations. The parameter
        'threshold' sets an upper bound of the q-value of the genes that are considered in the analysis.
        The parameter 'min_cells' sets the minimum number of cells on which each of the genes considered in the
        analysis is expressed. Cellular subpopulations are determined by clustering the Jensen-Shannon distance
        matrix of the genes that pass all the constraints. The number of clusters is controlled in this case by
        the parameter 'clus_thres'. In both cases a list with the genes associated with each cluster is returned.
        It requires the presence of the file 'name.genes.tsv', produced by the method RootedGraph.save().
        """
        con = []
        dis = []
        nam = []
        f = open(self.name + '.genes.tsv', 'r')
        for n, line in enumerate(f):
            if n > 0:
                sp = line[:-1].split('\t')
                if float(sp[7]) < threshold and float(sp[1]) > min_cells:
                    nam.append(sp[0])
        f.close()
        mat2 = self.JSD_matrix(nam)
        return [[nam[xx] for xx in m]
                for m in find_clusters(hierarchical_clustering(mat2, labels=nam,
                                                               cluster_distance=True, thres=clus_thres)).values()]
Example #10
Source File: TICC.py    From TICC with BSD 2-Clause "Simplified" License
def computeF1_macro(confusion_matrix,matching, num_clusters):
	"""
	computes the macro F1 score
	confusion matrix : requres permutation
	matching according to which matrix must be permuted
	"""
	##Permute the matrix columns
	permuted_confusion_matrix = np.zeros([num_clusters,num_clusters])
	for cluster in xrange(num_clusters):
		matched_cluster = matching[cluster]
 		permuted_confusion_matrix[:,cluster] = confusion_matrix[:,matched_cluster]
 	##Compute the F1 score for every cluster
 	F1_score = 0
 	for cluster in xrange(num_clusters):
 		TP = permuted_confusion_matrix[cluster,cluster]
 		FP = np.sum(permuted_confusion_matrix[:,cluster]) - TP
 		FN = np.sum(permuted_confusion_matrix[cluster,:]) - TP
 		precision = TP/(TP + FP)
 		recall = TP/(TP + FN)
 		f1 = stats.hmean([precision,recall])
 		F1_score += f1
 	F1_score /= num_clusters
 	return F1_score 
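A hedged usage sketch for computeF1_macro(), assuming numpy and scipy.stats are imported as np and stats as in the original module; the confusion matrix and matching below are made up.

import numpy as np
from scipy import stats

# Hypothetical 2x2 confusion matrix and an identity matching (cluster i maps to column i).
confusion_matrix = np.array([[8.0, 2.0],
                             [1.0, 9.0]])
matching = [0, 1]
print(computeF1_macro(confusion_matrix, matching, num_clusters=2))  # ~0.85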
Example #11
Source File: network_accuracy.py    From TICC with BSD 2-Clause "Simplified" License
def computeF1_macro(confusion_matrix,matching, num_clusters):
	"""
	computes the macro F1 score
	confusion matrix : requres permutation
	matching according to which matrix must be permuted
	"""
	##Permute the matrix columns
	permuted_confusion_matrix = np.zeros([num_clusters,num_clusters])
	for cluster in xrange(num_clusters):
		matched_cluster = matching[cluster]
 		permuted_confusion_matrix[:,cluster] = confusion_matrix[:,matched_cluster]
 	##Compute the F1 score for every cluster
 	F1_score = 0
 	for cluster in xrange(num_clusters):
 		TP = permuted_confusion_matrix[cluster,cluster]
 		FP = np.sum(permuted_confusion_matrix[:,cluster]) - TP
 		FN = np.sum(permuted_confusion_matrix[cluster,:]) - TP
 		precision = TP/(TP + FP)
 		recall = TP/(TP + FN)
 		f1 = stats.hmean([precision,recall])
 		F1_score += f1
 	F1_score /= num_clusters
 	return F1_score 
Example #12
Source File: test_monkeypatch.py    From daal4py with Apache License 2.0
def test_monkey_patching(self):
        _tokens = daal4py.sklearn.sklearn_patch_names()
        self.assertTrue(isinstance(_tokens, list) and len(_tokens) > 0)
        for t in _tokens:
            daal4py.sklearn.unpatch_sklearn(t)
        for t in _tokens:
            daal4py.sklearn.patch_sklearn(t)

        import sklearn
        for a in [(sklearn.decomposition, 'PCA'),
                  (sklearn.linear_model, 'Ridge'),
                  (sklearn.linear_model, 'LinearRegression'),
                  (sklearn.cluster, 'KMeans'),
                  (sklearn.svm, 'SVC'),]:
            class_module = getattr(a[0], a[1]).__module__
            self.assertTrue(class_module.startswith('daal4py')) 
Example #13
Source File: baselines.py    From AirBnbPricePrediction with MIT License
def kmeans(X_train, y_train, X_val, y_val):
    n_clusters = 10
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, verbose=0, n_jobs=int(0.8*n_cores)).fit(X_train)
    c_train = kmeans.predict(X_train)
    c_pred = kmeans.predict(X_val)
    centroids = kmeans.cluster_centers_
    for i in range(n_clusters):
        print('--------analyzing cluster %d--------' %i)
        train_mask = c_train==i
        std_train = np.std(y_train[train_mask])
        mean_train = np.mean(y_train[train_mask])
        print("# examples & price mean & std for training set within cluster %d is:(%d, %.2f, %.2f)" %(i, train_mask.sum(), np.float(mean_train), np.float(std_train)))
        pred_mask = c_pred==i
        std_pred = np.std(y_val[pred_mask])
        mean_pred = np.mean(y_val[pred_mask])
        print("# examples & price mean & std for validation set within cluster %d is:(%d, %.2f, %.2f)" %(i, pred_mask.sum(), np.float(mean_pred), np.float(std_pred)))
        if pred_mask.sum() == 0:
            print('Zero membered test set! Skipping the test and training validation.')
            continue
        LinearModel(X_train[train_mask], y_train[train_mask], X_val[pred_mask], y_val[pred_mask])
        print('--------Finished analyzing cluster %d--------' %i)
    
    
    return c_pred, centroids 
Example #14
Source File: k_means.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.
        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, "cluster_centers_")
        X = self._check_array(X)
        labels = pairwise_distances_argmin_min(X, self.cluster_centers_)[0].astype(
            np.int32
        )
        return labels 
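The "code book" analogy in the docstring can be shown directly with scikit-learn's pairwise_distances_argmin_min, which is the helper this method calls; the centers and samples below are made up.

import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

# Hypothetical code book: 3 cluster centers in 2-D, plus 3 new samples to encode.
centers = np.array([[0.0, 0.0], [5.0, 5.0], [10.0, 0.0]])
X = np.array([[0.2, -0.1], [4.8, 5.3], [9.5, 0.4]])

labels, distances = pairwise_distances_argmin_min(X, centers)
print(labels)     # index of the closest center ("code") per sample: [0 1 2]
print(distances)  # distance from each sample to its closest center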
Example #15
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def process_options(args):    
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')

    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS

    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words())/2)**0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options 
Example #16
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_ 
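A usage sketch with made-up vectors, assuming scikit-learn is importable so the module-level with_sklearn flag is True.

import numpy as np

# Illustrative word vectors: 5000 "words" with 50 dimensions each.
vectors = np.random.rand(5000, 50)
labels = minibatch_kmeans(vectors, k=20)
print(len(labels), labels[:10])  # one cluster id per input vector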
Example #17
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def kmeans(vectors, k, jobs=1):
    vectors = numpy.array(vectors)
    if with_sklearn:
        if jobs == 1:
            kmeans = sklearn.cluster.KMeans(k)
        else:
            kmeans = sklearn.cluster.KMeans(k, n_jobs=jobs) # sklearn > 0.10
        kmeans.fit(vectors)
        return kmeans.labels_
    else:
        codebook, distortion = scipy.cluster.vq.kmeans(vectors, k)
        cluster_ids, dist = scipy.cluster.vq.vq(vectors, codebook)
        return cluster_ids 
Example #18
Source File: kmeans.py    From link-prediction_with_deep-learning with MIT License
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""

    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'

    if out is None:
        out = sys.stdout
    for word, cid in zip(words, cluster_ids):
        print('%s\t%d' % (word, cid), file=out)
Example #19
Source File: clustering.py    From metric-learning-divide-and-conquer with GNU Lesser General Public License v3.0
def get_cluster_labels(model, data_loader, use_penultimate, nb_clusters,
       gpu_id=None, backend='faiss'):
    is_dry_run = (nb_clusters == 1)
    if not is_dry_run:
        if not use_penultimate:
            logging.debug('Using the final layer for clustering')
        X_all, T_all, I_all = utils.predict_batchwise(
            model=model,
            dataloader=data_loader,
            use_penultimate=use_penultimate,
            is_dry_run=is_dry_run
        )
        perm = np.argsort(I_all)
        X_all = X_all[perm]
        I_all = I_all[perm]
        T_all = T_all[perm]
        if backend == 'torch+sklearn':
            clustering_algorithm = sklearn.cluster.KMeans(
                n_clusters=nb_clusters)
            C = clustering_algorithm.fit(X_all).labels_
        else:
            C = faissext.do_clustering(
                X_all,
                num_clusters = nb_clusters,
                gpu_ids = None if backend != 'faiss-gpu'
                    else torch.cuda.current_device(),
                niter=100,
                nredo=5,
                verbose=0
            )
    else:
        T_all = np.array(data_loader.dataset.ys)
        I_all = np.array(data_loader.dataset.I)
        C = np.zeros(len(T_all), dtype=int)
    return C, T_all, I_all 
Example #20
Source File: script_smk.py    From ibeis with Apache License 2.0
def hyrule_vocab_test():
    from yael.yutils import load_ext
    from os.path import join
    import sklearn.cluster

    dbdir = ut.truepath('/raid/work/Oxford/')
    datadir = dbdir + '/smk_data_iccv_2013/data/'

    # Files storing descriptors/geometry for Oxford5k dataset
    test_sift_fname = join(datadir, 'oxford_sift.uint8')
    # test_nf_fname = join(datadir, 'oxford_nsift.uint32')
    all_vecs = load_ext(test_sift_fname, ndims=128, verbose=True).astype(np.float32)
    print(ut.print_object_size(all_vecs))
    # nfeats_list = load_ext(test_nf_fname, verbose=True)

    with ut.embed_on_exception_context:
        rng = np.random.RandomState(13421421)
        # init_size = int(config['num_words'] * 8)
        num_words = int(2 ** 16)
        init_size = num_words * 4
        # converged after 26043 iterations
        minibatch_params = dict(
            n_clusters=num_words,
            init='k-means++',
            # init='random',
            init_size=init_size,
            n_init=1,
            max_iter=100,
            batch_size=1000,
            tol=0.0,
            max_no_improvement=10,
            reassignment_ratio=0.01,
        )
        clusterer = sklearn.cluster.MiniBatchKMeans(
            compute_labels=False, random_state=rng, verbose=1,
            **minibatch_params)
        clusterer.fit(all_vecs)
        words = clusterer.cluster_centers_
        print(words.shape) 
Example #21
Source File: preproc_occurrence.py    From ibeis with Apache License 2.0
def ibeis_compute_occurrences(ibs, gid_list, config=None, verbose=None):
    """
    clusters occurrences together (by time, not yet space)
    An occurrence is a meeting, localized in time and space between a camera
    and a group of animals.
    Animals are identified within each occurrence.

    Does not modify database state, just returns cluster ids

    Args:
        ibs (IBEISController):  ibeis controller object
        gid_list (list):

    Returns:
        tuple: (flat_imgsetids, flat_gids)

    CommandLine:
        python -m ibeis --tf ibeis_compute_occurrences:0 --show
        TODO: FIXME: good example of autogen doctest return failure
    """
    if config is None:
        config = {'use_gps': False, 'seconds_thresh': 600}
        #from ibeis.algo import Config
        #config = Config.OccurrenceConfig().asdict()
    occur_labels, occur_gids = compute_occurrence_groups(ibs, gid_list, config,
                                                         verbose=verbose)
    if True:
        gid2_label = {gid: label for label, gids in zip(occur_labels, occur_gids)
                      for gid in gids}
        # Assert that each gid only belongs to one occurrence
        flat_imgsetids = ut.dict_take(gid2_label, gid_list)
        flat_gids = gid_list
    else:
        # Flatten gids list by encounter
        flat_imgsetids, flat_gids = ut.flatten_membership_mapping(occur_labels, occur_gids)
    return flat_imgsetids, flat_gids 
Example #22
Source File: preproc_occurrence.py    From ibeis with Apache License 2.0
def agglomerative_cluster_occurrences(X_data, thresh_sec):
    """
    Agglomerative occurrence clustering algorithm

    Args:
        X_data (ndarray):  Length N array of data to cluster
        thresh_sec (float):

    Returns:
        ndarray: (label_arr) - Length N array of cluster indexes

    CommandLine:
        python -m ibeis.algo.preproc.preproc_occurrence --exec-agglomerative_cluster_occurrences

    References:
        https://docs.scipy.org/doc/scipy-0.9.0/reference/generated/scipy.cluster.hierarchy.fclusterdata.html#scipy.cluster.hierarchy.fclusterdata
        http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.preproc.preproc_occurrence import *  # NOQA
        >>> X_data = '?'
        >>> thresh_sec = '?'
        >>> (occur_ids, occur_gids) = agglomerative_cluster_occurrences(X_data, thresh_sec)
        >>> result = ('(occur_ids, occur_gids) = %s' % (str((occur_ids, occur_gids)),))
        >>> print(result)
    """
    label_arr = scipy.cluster.hierarchy.fclusterdata(
        X_data, thresh_sec, criterion='distance')
    return label_arr 
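The doctest above is disabled and leaves X_data and thresh_sec as placeholders; a hedged, runnable illustration with made-up capture times (one column, in seconds) might look like this.

import numpy as np
import scipy.cluster.hierarchy

# Hypothetical capture times in seconds: two bursts of images separated by a large gap.
X_data = np.array([[0.0], [30.0], [60.0], [4000.0], [4030.0]])
label_arr = agglomerative_cluster_occurrences(X_data, thresh_sec=600)
print(label_arr)  # e.g. [1 1 1 2 2]: images within 600 s of each other share an occurrence label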
Example #23
Source File: preproc_occurrence.py    From ibeis with Apache License 2.0
def group_images_by_label(label_arr, gid_arr):
    """
    Input: Length N list of labels and ids
    Output: Length M list of unique labels, and length M list of lists of ids
    """
    # Reverse the image to cluster index mapping
    import vtool_ibeis as vt
    labels_, groupxs_ = vt.group_indices(label_arr)
    sortx = np.array(list(map(len, groupxs_))).argsort()[::-1]
    labels  = labels_.take(sortx, axis=0)
    groupxs = ut.take(groupxs_, sortx)
    label_gids = vt.apply_grouping(gid_arr, groupxs)
    return labels, label_gids 
Example #24
Source File: Transform_KM_Features.py    From Auto_ViML with Apache License 2.0
def transform(self, X, y=None):
        """Output the closest cluster id for each input data point.
        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_data_points, n_features)
        y : vector of length n_data_points, optional, default None
            Target vector is ignored even if provided.
        Returns
        -------
        cluster_ids : array, shape[n_data_points,1]
        """
        clusters = self.km_model.predict(X)
        return clusters[:,np.newaxis] 
Example #25
Source File: Transform_KM_Features.py    From Auto_ViML with Apache License 2.0
def Transform_KM_Features(training_data, training_labels, test_data, km_max=0):
    seed = 99
    preds = list(training_data)
    target = training_labels.name
    train_index =  training_data.index
    test_index =  test_data.index
    if km_max <= 2:
        k_max = 2
    else:
        k_max = copy.deepcopy(km_max)
    ### Calculate the target scale here => the higher the number the better for target accuracy
    try:
        if training_labels.dtype in [np.float64,np.float32,np.float16]:
            target_range = float(abs(training_labels.max() - training_labels.min()))
        elif training_labels.dtype in [object,bool]:
            target_range = int(len(Counter(training_labels)) + 3)
        else:
            target_range = int(abs(training_labels.max() - training_labels.min()))
    except:
        target_range = 5.0
    kmf =  KMeansFeaturizer(k=k_max, target_scale=target_range, random_state=seed)
    kmf_hint = kmf.fit(training_data, training_labels)
    ### Just return it with the cluster column => no need to return the data frame ###
    training_cluster_features = kmf_hint.transform(training_data)
    test_cluster_features = kmf_hint.transform(test_data)
    npx = np.c_[training_data, training_labels.values]
    training_with_cluster = np.c_[npx,training_cluster_features]
    test_with_cluster = np.c_[test_data, test_cluster_features]
    ### We are going to just return the cluster values ######
    train_with_cluster_df = training_with_cluster[:,-1]
    test_with_cluster_df = test_with_cluster[:,-1]
    #train_with_cluster_df = pd.DataFrame(training_with_cluster,index=train_index,
    #                                  columns=preds+[target,'cluster'])
    #test_with_cluster_df = pd.DataFrame(test_with_cluster,index=test_index,
    #                                  columns=preds+['cluster'])
    return train_with_cluster_df, test_with_cluster_df 
Example #26
Source File: kitti_usage.py    From pydriver with MIT License
def vocabularyGenerator(dimensions, featureName):
    voc = pydriver.detectors.vocabularies.Vocabulary(
        dimensions,
        preprocessors=[
            sklearn.cluster.MiniBatchKMeans(n_clusters=100, batch_size=1000, max_iter=100),
            ],
        classifier=sklearn.ensemble.AdaBoostClassifier(n_estimators=75),
        storageGenerator=storageGenerator,
        balanceNegatives=True,
        )
    return voc

# initialize detector that will perform learning and recognition 
Example #27
Source File: cluster.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def get_model(self):
        """
        Activates the cluster model with the filled-in options
        :return: activated cluster model, e.g. KMeans(**options)
        """
        options = self.get_engine_options()
        return self.engine(**options) 
Example #28
Source File: cluster.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def cluster(self):
        """
        Central method to process incoming data with the chosen activated clustering model
        :return: ClusterEngine instance with attributes listed in __init__
        """

        # Toggle TF-IDF and cluster
        if self.use_tfidf:
            self.features = TfidfTransformer().fit_transform(self.features).toarray()

        # Return labels and representative points/centers
        self.cluster_labels = self.cluster_model.fit_predict(self.features).tolist()

        # for DBSCAN (it produces -1 label)
        if -1 in self.cluster_labels:
            self.cluster_labels = [i + 1 for i in self.cluster_labels]

        self.cluster_centers = self.get_cluster_centers()

        pca = PCA(n_components=2).fit(self.features)
        self.data2d = pca.transform(self.features)

        self.cluster_label_set = set(self.cluster_labels)

        try:
            order_centroids = self.cluster_centers.argsort()[:, ::-1]
            self.cluster_terms = [[self.term_index[ind] for ind in order_centroids[i, :10]] for i in
                                  range(max(self.cluster_label_set) + 1)]
            self.centers2d = pca.transform(self.cluster_centers)
        except Exception as e:
            print(e)

        return self 
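For readers who want the same flow outside the ClusterEngine class, here is a minimal sketch with plain scikit-learn; the term-frequency matrix and term names below are made up.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer

# Hypothetical term-frequency matrix: 4 documents x 3 terms.
features = np.array([[3, 0, 1], [2, 0, 0], [0, 4, 1], [0, 3, 2]])
term_index = ['contract', 'lease', 'party']

# TF-IDF weighting, clustering, and top terms per cluster ordered by centroid weight.
features = TfidfTransformer().fit_transform(features).toarray()
model = KMeans(n_clusters=2, n_init=10, random_state=0)
cluster_labels = model.fit_predict(features).tolist()
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
cluster_terms = [[term_index[ind] for ind in order_centroids[i, :2]] for i in range(2)]

# 2-D PCA projection of documents and cluster centers, as used for plotting.
pca = PCA(n_components=2).fit(features)
data2d = pca.transform(features)
centers2d = pca.transform(model.cluster_centers_)
print(cluster_labels, cluster_terms)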
Example #29
Source File: cluster.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def get_cluster_centers(self):
        """
        Default method to locate cluster centers
        :return: list of cluster centers
        """
        return self.cluster_model.cluster_centers_ 
Example #30
Source File: cluster.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def create_db_cluster_object(self, cluster_label_id, cluster_terms, cluster_item_id_list):
        """
        Store a Cluster in DB, set M2M relation from cluster_item_id_list
        :param cluster_label_id: str
        :param cluster_terms: list of top cluster terms used to build the cluster's self name
        :param cluster_item_id_list: list of cluster item indexes
        :return: DB object pk
        """
        cluster_title = self.name or self.get_db_cluster_title(cluster_label_id)
        cluster_self_name = '-'.join([str(c) for c in cluster_terms[:5]]) if cluster_terms else None

        db_cluster_obj = self.db_cluster_model.objects.create(
            cluster_id=cluster_label_id,
            name=cluster_title,
            self_name=cluster_self_name,
            description=self.description or cluster_title,
            cluster_by=self.cluster_by_str,
            using=self.cluster_algorithm,
            created_date=self.start_date)

        # set m2m
        getattr(db_cluster_obj, self.db_cluster_model_m2m_name).set(cluster_item_id_list)

        # set default cluster name
        if self.use_default_name:
            db_cluster_obj.name = "Cluster #{}".format(db_cluster_obj.pk)
            db_cluster_obj.save()

        return db_cluster_obj