Python sklearn.cluster() Examples
The following are 30 code examples of the sklearn.cluster module.
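Before the project examples below, here is a minimal, self-contained sketch (toy data, not taken from any of the projects listed) of the most common entry point in sklearn.cluster, KMeans:

import numpy as np
import sklearn.cluster

# Toy data: two obvious blobs around (0, 0) and (5, 5).
X = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 5])

# Fit k-means with two clusters; n_init is set explicitly because its
# default changed across scikit-learn versions.
kmeans = sklearn.cluster.KMeans(n_clusters=2, n_init=10, random_state=0)
labels = kmeans.fit_predict(X)

print(labels[:5], labels[-5:])      # cluster id per sample
print(kmeans.cluster_centers_)      # one centroid per cluster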

Example #1
Source File: cluster.py From acai with Apache License 2.0 | 7 votes |
def error(cluster, target_cluster, k):
    """ Compute error between cluster and target cluster
    :param cluster: proposed cluster
    :param target_cluster: target cluster
    :param k: number of clusters
    :return: accuracy of the proposed clustering after optimal label matching
    """
    n = np.shape(target_cluster)[0]
    M = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            M[i][j] = np.sum(np.logical_and(cluster == i, target_cluster == j))
    m = Munkres()
    indexes = m.compute(-M)
    corresp = []
    for i in range(k):
        corresp.append(indexes[i][1])
    pred_corresp = [corresp[int(predicted)] for predicted in cluster]
    acc = np.sum(pred_corresp == target_cluster) / float(len(target_cluster))
    return acc
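The Munkres (Hungarian) step above matches predicted cluster ids to ground-truth labels before scoring. As a hedged illustration of the same matching idea without the munkres dependency (an alternative sketch, not the acai project's code), scipy.optimize.linear_sum_assignment can build the correspondence:

import numpy as np
from scipy.optimize import linear_sum_assignment

def matched_accuracy(cluster, target_cluster, k):
    # Contingency matrix: overlap between each predicted and true cluster.
    M = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            M[i, j] = np.sum(np.logical_and(cluster == i, target_cluster == j))
    # Maximize total overlap (equivalent to minimizing -M).
    row_ind, col_ind = linear_sum_assignment(-M)
    corresp = dict(zip(row_ind, col_ind))
    pred = np.array([corresp[int(c)] for c in cluster])
    return np.mean(pred == target_cluster)

cluster = np.array([0, 0, 1, 1, 2, 2])
target = np.array([1, 1, 0, 0, 2, 2])
print(matched_accuracy(cluster, target, k=3))  # 1.0: labels differ only by a permutation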
Example #2
Source File: dbscan.py From link-prediction_with_deep-learning with MIT License | 6 votes |
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #3
Source File: auxiliaries.py From Deep-Metric-Learning-Baselines with Apache License 2.0 | 6 votes |
def run_kmeans(features, n_cluster):
    """
    Run kmeans on a set of features to find <n_cluster> clusters.

    Args:
        features:  np.ndarray [n_samples x embed_dim], embedding of training/testing samples
                   for which kmeans should be performed.
        n_cluster: int, number of clusters.
    Returns:
        cluster_assignments: np.ndarray [n_samples x 1], the respective cluster label
                             each sample belongs to.
    """
    n_samples, dim = features.shape
    kmeans = faiss.Kmeans(dim, n_cluster)
    kmeans.n_iter, kmeans.min_points_per_centroid, kmeans.max_points_per_centroid = 20, 5, 1000000000
    kmeans.train(features)
    _, cluster_assignments = kmeans.index.search(features, 1)
    return cluster_assignments
Example #4
Source File: cluster.py From acai with Apache License 2.0 | 6 votes |
def cluster(train_latents, train_labels, test_latents, test_labels):
    num_classes = np.shape(train_labels)[-1]
    labels_hot = np.argmax(test_labels, axis=-1)
    train_latents = np.reshape(train_latents,
                               newshape=[train_latents.shape[0], -1])
    test_latents = np.reshape(test_latents,
                              newshape=[test_latents.shape[0], -1])
    kmeans = KMeans(init='random', n_clusters=num_classes,
                    random_state=0, max_iter=1000, n_init=FLAGS.n_init,
                    n_jobs=FLAGS.n_jobs)
    kmeans.fit(train_latents)
    print(kmeans.cluster_centers_)
    print('Train/Test k-means objective = %.4f / %.4f' %
          (-kmeans.score(train_latents), -kmeans.score(test_latents)))
    print('Train/Test accuracy %.4f / %.3f' %
          (error(np.argmax(train_labels, axis=-1), kmeans.predict(train_latents), k=num_classes),
           error(np.argmax(test_labels, axis=-1), kmeans.predict(test_latents), k=num_classes)))
    return error(labels_hot, kmeans.predict(test_latents), k=num_classes)
Example #5
Source File: cluster.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 6 votes |
def __call__(self, features: np.array, term_index: list, use_tfidf: bool = True, **options):
    """
    Just call activated class instance to cluster data.
    :param features: np.array - term frequency matrix
    :param term_index: list - list of term frequency matrix indexes
    :param use_tfidf: bool - whether to use TF IDF Transformer
    :param options: **dict - unpacked cluster algorithm options
    :return: ClusterEngine instance with attributes listed in __init__
    """
    self.features = features
    self.term_index = term_index
    self.num_records = features.shape[0]
    self.use_tfidf = use_tfidf
    self.user_options = options
    self.n_clusters = options.get('n_clusters')
    self.cluster_model = self.get_model()
    return self.cluster()
Example #6
Source File: main2.py From msaf with MIT License | 6 votes |
def cluster(evecs, Cnorm, k, in_bound_idxs=None):
    X = evecs[:, :k] / (Cnorm[:, k - 1:k] + 1e-5)
    KM = sklearn.cluster.KMeans(n_clusters=k, n_init=50, max_iter=500)
    seg_ids = KM.fit_predict(X)

    ###############################################################
    # Locate segment boundaries from the label sequence
    if in_bound_idxs is None:
        bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])

        # Count beat 0 as a boundary
        bound_idxs = librosa.util.fix_frames(bound_beats, x_min=0)
    else:
        bound_idxs = in_bound_idxs

    # Compute the segment label for each boundary
    bound_segs = list(seg_ids[bound_idxs])

    # Tack on the end-time
    bound_idxs = list(np.append(bound_idxs, len(Cnorm) - 1))

    return bound_idxs, bound_segs
Example #7
Source File: main2.py From msaf with MIT License | 6 votes |
def do_segmentation(C, M, config, in_bound_idxs=None):
    embedding = embed_beats(C, M, config)
    Cnorm = np.cumsum(embedding ** 2, axis=1) ** 0.5

    if config["hier"]:
        est_idxs = []
        est_labels = []
        for k in range(1, config["num_layers"] + 1):
            est_idx, est_label = cluster(embedding, Cnorm, k)
            est_idxs.append(est_idx)
            est_labels.append(np.asarray(est_label, dtype=np.int))
    else:
        est_idxs, est_labels = cluster(embedding, Cnorm, config["scluster_k"], in_bound_idxs)
        est_labels = np.asarray(est_labels, dtype=np.int)

    return est_idxs, est_labels, Cnorm
Example #8
Source File: main.py From scTDA with GNU General Public License v3.0 | 6 votes |
def save(self, name, resolution, gain, equalize=True, cluster='agglomerative', statistics='db', max_K=5):
    """
    Generates a topological representation using the Mapper algorithm with resolution and gain specified by the
    parameters 'resolution' and 'gain'. When equalize is set to True, patches are chosen such that they contain
    the same number of points. The parameter 'cluster' specifies the clustering method ('agglomerative' or
    'kmeans'). The parameter 'statistics' specifies the criterion for choosing the optimal number of clusters
    ('db' for the Davies-Bouldin index, or 'gap' for the gap statistic). The parameter 'max_K' specifies the
    maximum number of clusters to be considered within each patch. The topological representation is stored in
    the files 'name.gexf' and 'name.json'. It returns a dictionary with the patches.
    """
    G, all_clusters, patches = sakmapper.mapper_graph(self.df, lens_data=self.lens_data_mds,
                                                      resolution=resolution,
                                                      gain=gain, equalize=equalize, clust=cluster,
                                                      stat=statistics, max_K=max_K)
    dic = {}
    for n, rs in enumerate(all_clusters):
        dic[str(n)] = map(lambda x: int(x), rs)
    with open(name + '.json', 'wb') as handle3:
        json.dump(dic, handle3)
    networkx.write_gexf(G, name + '.gexf')
    return patches
Example #9
Source File: main.py From scTDA with GNU General Public License v3.0 | 6 votes |
def cellular_subpopulations(self, threshold=0.05, min_cells=5, clus_thres=0.65):
    """
    Identifies potential transient cellular subpopulations. The parameter 'threshold' sets an upper bound on the
    q-value of the genes that are considered in the analysis. The parameter 'min_cells' sets the minimum number
    of cells in which each of the genes considered in the analysis must be expressed. Cellular subpopulations are
    determined by clustering the Jensen-Shannon distance matrix of the genes that pass all the constraints. The
    number of clusters is controlled in this case by the parameter 'clus_thres'. In both cases a list with the
    genes associated to each cluster is returned. It requires the presence of the file 'name.genes.tsv', produced
    by the method RootedGraph.save().
    """
    con = []
    dis = []
    nam = []
    f = open(self.name + '.genes.tsv', 'r')
    for n, line in enumerate(f):
        if n > 0:
            sp = line[:-1].split('\t')
            if float(sp[7]) < threshold and float(sp[1]) > min_cells:
                nam.append(sp[0])
    f.close()
    mat2 = self.JSD_matrix(nam)
    return [map(lambda xx: nam[xx], m) for m in
            find_clusters(hierarchical_clustering(mat2, labels=nam,
                                                  cluster_distance=True,
                                                  thres=clus_thres)).values()]
Example #10
Source File: TICC.py From TICC with BSD 2-Clause "Simplified" License | 6 votes |
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    Computes the macro F1 score.
    confusion_matrix: requires permutation
    matching: permutation according to which the matrix must be permuted
    """
    ## Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = confusion_matrix[:, matched_cluster]
    ## Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score
Example #11
Source File: network_accuracy.py From TICC with BSD 2-Clause "Simplified" License | 6 votes |
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    Computes the macro F1 score.
    confusion_matrix: requires permutation
    matching: permutation according to which the matrix must be permuted
    """
    ## Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = confusion_matrix[:, matched_cluster]
    ## Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score
Example #12
Source File: test_monkeypatch.py From daal4py with Apache License 2.0 | 6 votes |
def test_monkey_patching(self):
    _tokens = daal4py.sklearn.sklearn_patch_names()
    self.assertTrue(isinstance(_tokens, list) and len(_tokens) > 0)
    for t in _tokens:
        daal4py.sklearn.unpatch_sklearn(t)
    for t in _tokens:
        daal4py.sklearn.patch_sklearn(t)

    import sklearn
    for a in [(sklearn.decomposition, 'PCA'),
              (sklearn.linear_model, 'Ridge'),
              (sklearn.linear_model, 'LinearRegression'),
              (sklearn.cluster, 'KMeans'),
              (sklearn.svm, 'SVC'), ]:
        class_module = getattr(a[0], a[1]).__module__
        self.assertTrue(class_module.startswith('daal4py'))
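The test above exercises daal4py's scikit-learn patching. A hedged usage sketch (assuming daal4py is installed; not part of the test suite itself) of switching sklearn.cluster.KMeans to the accelerated implementation might look like:

import daal4py.sklearn

# Patch scikit-learn so that supported estimators are backed by daal4py.
daal4py.sklearn.patch_sklearn()

import sklearn.cluster
print(sklearn.cluster.KMeans.__module__)  # expected to start with 'daal4py'

# Restore the stock scikit-learn implementations.
daal4py.sklearn.unpatch_sklearn()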
Example #13
Source File: baselines.py From AirBnbPricePrediction with MIT License | 6 votes |
def kmeans(X_train, y_train, X_val, y_val):
    n_clusters = 10
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, verbose=0,
                    n_jobs=int(0.8 * n_cores)).fit(X_train)
    c_train = kmeans.predict(X_train)
    c_pred = kmeans.predict(X_val)
    centroids = kmeans.cluster_centers_

    for i in range(n_clusters):
        print('--------analyzing cluster %d--------' % i)
        train_mask = c_train == i
        std_train = np.std(y_train[train_mask])
        mean_train = np.mean(y_train[train_mask])
        print("# examples & price mean & std for training set within cluster %d is:(%d, %.2f, %.2f)"
              % (i, train_mask.sum(), np.float(mean_train), np.float(std_train)))
        pred_mask = c_pred == i
        std_pred = np.std(y_val[pred_mask])
        mean_pred = np.mean(y_val[pred_mask])
        print("# examples & price mean & std for validation set within cluster %d is:(%d, %.2f, %.2f)"
              % (i, pred_mask.sum(), np.float(mean_pred), np.float(std_pred)))
        if pred_mask.sum() == 0:
            print('Zero membered test set! Skipping the test and training validation.')
            continue
        LinearModel(X_train[train_mask], y_train[train_mask],
                    X_val[pred_mask], y_val[pred_mask])
        print('--------Finished analyzing cluster %d--------' % i)

    return c_pred, centroids
Example #14
Source File: k_means.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        New data to predict.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self, "cluster_centers_")
    X = self._check_array(X)
    labels = pairwise_distances_argmin_min(X, self.cluster_centers_)[0].astype(
        np.int32
    )
    return labels
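The dask-ml predict method above assigns each sample to its nearest learned center. A hedged end-to-end sketch (assuming dask and dask-ml are installed; toy data, not from the dask-ml test suite) could be:

import dask.array as da
from dask_ml.cluster import KMeans

# Toy data as a chunked dask array: 1000 points in 10 dimensions.
X = da.random.random((1000, 10), chunks=(250, 10))

km = KMeans(n_clusters=4, random_state=0)
km.fit(X)

labels = km.predict(X)          # lazy dask array of cluster indices
print(labels[:10].compute())    # materialize the first few assignments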
Example #15
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License | 5 votes |
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')
    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS
    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.k is None:
        options.k = int(math.ceil((len(wv.words()) / 2) ** 0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)

    return words, vectors, options
Example #16
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License | 5 votes |
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10.
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_
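For comparison with the snippet above, a minimal standalone sketch (toy random vectors, not part of wvlib) of MiniBatchKMeans versus full KMeans:

import numpy as np
import sklearn.cluster

vectors = np.random.rand(5000, 50).astype(np.float32)
k = 20

mbk = sklearn.cluster.MiniBatchKMeans(n_clusters=k, batch_size=1000, n_init=10)
mbk_labels = mbk.fit_predict(vectors)

km = sklearn.cluster.KMeans(n_clusters=k, n_init=10)
km_labels = km.fit_predict(vectors)

# Inertia (within-cluster sum of squares) is usually slightly worse for the
# mini-batch variant, in exchange for a much shorter fit time.
print(mbk.inertia_, km.inertia_)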
Example #17
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License | 5 votes |
def kmeans(vectors, k, jobs=1):
    vectors = numpy.array(vectors)
    if with_sklearn:
        if jobs == 1:
            kmeans = sklearn.cluster.KMeans(k)
        else:
            kmeans = sklearn.cluster.KMeans(k, n_jobs=jobs)  # sklearn > 0.10
        kmeans.fit(vectors)
        return kmeans.labels_
    else:
        codebook, distortion = scipy.cluster.vq.kmeans(vectors, k)
        cluster_ids, dist = scipy.cluster.vq.vq(vectors, codebook)
        return cluster_ids
Example #18
Source File: kmeans.py From link-prediction_with_deep-learning with MIT License | 5 votes |
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""

    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'

    if out is None:
        out = sys.stdout
    for word, cid in izip(words, cluster_ids):
        print >> out, '%s\t%d' % (word, cid)
Example #19
Source File: clustering.py From metric-learning-divide-and-conquer with GNU Lesser General Public License v3.0 | 5 votes |
def get_cluster_labels(model, data_loader, use_penultimate, nb_clusters, gpu_id=None, backend='faiss'):
    is_dry_run = (nb_clusters == 1)
    if not is_dry_run:
        if not use_penultimate:
            logging.debug('Using the final layer for clustering')
        X_all, T_all, I_all = utils.predict_batchwise(
            model=model,
            dataloader=data_loader,
            use_penultimate=use_penultimate,
            is_dry_run=is_dry_run
        )
        perm = np.argsort(I_all)
        X_all = X_all[perm]
        I_all = I_all[perm]
        T_all = T_all[perm]
        if backend == 'torch+sklearn':
            clustering_algorithm = sklearn.cluster.KMeans(
                n_clusters=nb_clusters)
            C = clustering_algorithm.fit(X_all).labels_
        else:
            C = faissext.do_clustering(
                X_all,
                num_clusters=nb_clusters,
                gpu_ids=None if backend != 'faiss-gpu' else torch.cuda.current_device(),
                niter=100,
                nredo=5,
                verbose=0
            )
    else:
        T_all = np.array(data_loader.dataset.ys)
        I_all = np.array(data_loader.dataset.I)
        C = np.zeros(len(T_all), dtype=int)
    return C, T_all, I_all
Example #20
Source File: script_smk.py From ibeis with Apache License 2.0 | 5 votes |
def hyrule_vocab_test():
    from yael.yutils import load_ext
    from os.path import join
    import sklearn.cluster

    dbdir = ut.truepath('/raid/work/Oxford/')
    datadir = dbdir + '/smk_data_iccv_2013/data/'

    # Files storing descriptors/geometry for Oxford5k dataset
    test_sift_fname = join(datadir, 'oxford_sift.uint8')
    # test_nf_fname = join(datadir, 'oxford_nsift.uint32')
    all_vecs = load_ext(test_sift_fname, ndims=128, verbose=True).astype(np.float32)
    print(ut.print_object_size(all_vecs))
    # nfeats_list = load_ext(test_nf_fname, verbose=True)

    with ut.embed_on_exception_context:
        rng = np.random.RandomState(13421421)
        # init_size = int(config['num_words'] * 8)
        num_words = int(2 ** 16)
        init_size = num_words * 4
        # converged after 26043 iterations
        minibatch_params = dict(
            n_clusters=num_words,
            init='k-means++',
            # init='random',
            init_size=init_size,
            n_init=1,
            max_iter=100,
            batch_size=1000,
            tol=0.0,
            max_no_improvement=10,
            reassignment_ratio=0.01,
        )
        clusterer = sklearn.cluster.MiniBatchKMeans(
            compute_labels=False, random_state=rng, verbose=1,
            **minibatch_params)
        clusterer.fit(all_vecs)
        words = clusterer.cluster_centers_
        print(words.shape)
Example #21
Source File: preproc_occurrence.py From ibeis with Apache License 2.0 | 5 votes |
def ibeis_compute_occurrences(ibs, gid_list, config=None, verbose=None):
    """
    Clusters occurrences together (by time, not yet space).

    An occurrence is a meeting, localized in time and space between a camera
    and a group of animals. Animals are identified within each occurrence.

    Does not modify database state, just returns cluster ids.

    Args:
        ibs (IBEISController): ibeis controller object
        gid_list (list):

    Returns:
        tuple: (None, None)

    CommandLine:
        python -m ibeis --tf ibeis_compute_occurrences:0 --show

    TODO: FIXME: good example of autogen doctest return failure
    """
    if config is None:
        config = {'use_gps': False, 'seconds_thresh': 600}
        # from ibeis.algo import Config
        # config = Config.OccurrenceConfig().asdict()
    occur_labels, occur_gids = compute_occurrence_groups(ibs, gid_list, config, verbose=verbose)
    if True:
        gid2_label = {gid: label
                      for label, gids in zip(occur_labels, occur_gids)
                      for gid in gids}
        # Assert that each gid only belongs to one occurrence
        flat_imgsetids = ut.dict_take(gid2_label, gid_list)
        flat_gids = gid_list
    else:
        # Flatten gids list by encounter
        flat_imgsetids, flat_gids = ut.flatten_membership_mapping(occur_labels, occur_gids)
    return flat_imgsetids, flat_gids
Example #22
Source File: preproc_occurrence.py From ibeis with Apache License 2.0 | 5 votes |
def agglomerative_cluster_occurrences(X_data, thresh_sec):
    """
    Agglomerative occurrence clustering algorithm

    Args:
        X_data (ndarray): Length N array of data to cluster
        thresh_sec (float):

    Returns:
        ndarray: (label_arr) - Length N array of cluster indexes

    CommandLine:
        python -m ibeis.algo.preproc.preproc_occurrence --exec-agglomerative_cluster_occurrences

    References:
        https://docs.scipy.org/doc/scipy-0.9.0/reference/generated/scipy.cluster.hierarchy.fclusterdata.html#scipy.cluster.hierarchy.fclusterdata
        http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.preproc.preproc_occurrence import *  # NOQA
        >>> X_data = '?'
        >>> thresh_sec = '?'
        >>> (occur_ids, occur_gids) = agglomerative_cluster_occurrences(X_data, thresh_sec)
        >>> result = ('(occur_ids, occur_gids) = %s' % (str((occur_ids, occur_gids)),))
        >>> print(result)
    """
    label_arr = scipy.cluster.hierarchy.fclusterdata(
        X_data, thresh_sec, criterion='distance')
    return label_arr
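The docstring's doctest leaves X_data and thresh_sec as '?' placeholders. A hedged, self-contained sketch of the same scipy call on toy one-dimensional timestamps (seconds; illustrative values, not ibeis data) might be:

import numpy as np
import scipy.cluster.hierarchy

# Toy timestamps in seconds: two bursts of images roughly an hour apart.
X_data = np.array([0.0, 30.0, 90.0, 3600.0, 3620.0, 3700.0]).reshape(-1, 1)
thresh_sec = 600.0

label_arr = scipy.cluster.hierarchy.fclusterdata(
    X_data, thresh_sec, criterion='distance')
print(label_arr)  # e.g. [1 1 1 2 2 2]: one cluster per burst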
Example #23
Source File: preproc_occurrence.py From ibeis with Apache License 2.0 | 5 votes |
def group_images_by_label(label_arr, gid_arr):
    """
    Input: Length N list of labels and ids
    Output: Length M list of unique labels, and length M list of lists of ids
    """
    # Reverse the image to cluster index mapping
    import vtool_ibeis as vt
    labels_, groupxs_ = vt.group_indices(label_arr)
    sortx = np.array(list(map(len, groupxs_))).argsort()[::-1]
    labels = labels_.take(sortx, axis=0)
    groupxs = ut.take(groupxs_, sortx)
    label_gids = vt.apply_grouping(gid_arr, groupxs)
    return labels, label_gids
Example #24
Source File: Transform_KM_Features.py From Auto_ViML with Apache License 2.0 | 5 votes |
def transform(self, X, y=None):
    """Output the closest cluster id for each input data point.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_data_points, n_features)

    y : vector of length n_data_points, optional, default None
        Target vector is ignored even if provided.

    Returns
    -------
    cluster_ids : array, shape[n_data_points,1]
    """
    clusters = self.km_model.predict(X)
    return clusters[:, np.newaxis]
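The KMeansFeaturizer class itself is not shown in this excerpt; as a rough illustration of the same idea (using cluster ids as an extra feature column), a minimal sketch with plain scikit-learn could be:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(200, 5)

# Fit k-means on the training features and use the predicted cluster id
# as a single additional column, mirroring transform() above.
km_model = KMeans(n_clusters=8, n_init=10, random_state=99).fit(X)
cluster_ids = km_model.predict(X)[:, np.newaxis]   # shape (200, 1)

X_with_cluster = np.hstack([X, cluster_ids])
print(X_with_cluster.shape)  # (200, 6)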
Example #25
Source File: Transform_KM_Features.py From Auto_ViML with Apache License 2.0 | 5 votes |
def Transform_KM_Features(training_data, training_labels, test_data, km_max=0):
    seed = 99
    preds = list(training_data)
    target = training_labels.name
    train_index = training_data.index
    test_index = test_data.index
    if km_max <= 2:
        k_max = 2
    else:
        k_max = copy.deepcopy(km_max)
    ### Calculate the target scale here => the higher the number the better for target accuracy
    try:
        if training_labels.dtype in [np.float64, np.float32, np.float16]:
            target_range = float(abs(training_labels.max() - training_labels.min()))
        elif training_labels.dtype in [object, bool]:
            target_range = int(len(Counter(training_labels)) + 3)
        else:
            target_range = int(abs(training_labels.max() - training_labels.min()))
    except:
        target_range = 5.0
    kmf = KMeansFeaturizer(k=k_max, target_scale=target_range, random_state=seed)
    kmf_hint = kmf.fit(training_data, training_labels)
    ### Just return it with the cluster column => no need to return the data frame ###
    training_cluster_features = kmf_hint.transform(training_data)
    test_cluster_features = kmf_hint.transform(test_data)
    npx = np.c_[training_data, training_labels.values]
    training_with_cluster = np.c_[npx, training_cluster_features]
    test_with_cluster = np.c_[test_data, test_cluster_features]
    ### We are going to just return the cluster values ######
    train_with_cluster_df = training_with_cluster[:, -1]
    test_with_cluster_df = test_with_cluster[:, -1]
    #train_with_cluster_df = pd.DataFrame(training_with_cluster, index=train_index,
    #                                     columns=preds+[target, 'cluster'])
    #test_with_cluster_df = pd.DataFrame(test_with_cluster, index=test_index,
    #                                    columns=preds+['cluster'])
    return train_with_cluster_df, test_with_cluster_df
Example #26
Source File: kitti_usage.py From pydriver with MIT License | 5 votes |
def vocabularyGenerator(dimensions, featureName):
    voc = pydriver.detectors.vocabularies.Vocabulary(
        dimensions,
        preprocessors=[
            sklearn.cluster.MiniBatchKMeans(n_clusters=100, batch_size=1000, max_iter=100),
        ],
        classifier=sklearn.ensemble.AdaBoostClassifier(n_estimators=75),
        storageGenerator=storageGenerator,
        balanceNegatives=True,
    )
    return voc

# initialize detector that will perform learning and recognition
Example #27
Source File: cluster.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 5 votes |
def get_model(self):
    """
    Activate the cluster model with filled options
    :return: activated cluster model, e.g. KMeans(**options)
    """
    options = self.get_engine_options()
    return self.engine(**options)
Example #28
Source File: cluster.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 5 votes |
def cluster(self):
    """
    Central method to process incoming data with the chosen activated clustering model
    :return: ClusterEngine instance with attributes listed in __init__
    """
    # Toggle TF-IDF and cluster
    if self.use_tfidf:
        self.features = TfidfTransformer().fit_transform(self.features).toarray()

    # Return labels and representative points/centers
    self.cluster_labels = self.cluster_model.fit_predict(self.features).tolist()
    # for DBSCAN (it produces -1 label)
    if -1 in self.cluster_labels:
        self.cluster_labels = [i + 1 for i in self.cluster_labels]
    self.cluster_centers = self.get_cluster_centers()

    pca = PCA(n_components=2).fit(self.features)
    self.data2d = pca.transform(self.features)
    self.cluster_label_set = set(self.cluster_labels)

    try:
        order_centroids = self.cluster_centers.argsort()[:, ::-1]
        self.cluster_terms = [[self.term_index[ind] for ind in order_centroids[i, :10]]
                              for i in range(max(self.cluster_label_set) + 1)]
        self.centers2d = pca.transform(self.cluster_centers)
    except Exception as e:
        print(e)

    return self
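The -1 shift above exists because DBSCAN labels noise points as -1, while the rest of the pipeline expects non-negative cluster ids. A small standalone sketch of that behaviour (toy data, not the contraxsuite engine):

import numpy as np
from sklearn.cluster import DBSCAN

# Two dense blobs plus one far-away outlier that DBSCAN will mark as noise.
X = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 10, [[100.0, 100.0]]])

labels = DBSCAN(eps=1.5, min_samples=3).fit_predict(X).tolist()
print(sorted(set(labels)))            # typically [-1, 0, 1]

if -1 in labels:
    labels = [l + 1 for l in labels]  # shift so noise becomes cluster 0
print(sorted(set(labels)))            # typically [0, 1, 2]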
Example #29
Source File: cluster.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 5 votes |
def get_cluster_centers(self):
    """
    Default method to locate cluster centers
    :return: list of cluster centers
    """
    return self.cluster_model.cluster_centers_
Example #30
Source File: cluster.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 5 votes |
def create_db_cluster_object(self, cluster_label_id, cluster_terms, cluster_item_id_list):
    """
    Store a Cluster in DB, set M2M relation from cluster_item_id_list
    :param cluster_label_id: str
    :param cluster_terms: list of top terms for the cluster
    :param cluster_item_id_list: list of cluster item indexes
    :return: DB object pk
    """
    cluster_title = self.name or self.get_db_cluster_title(cluster_label_id)
    cluster_self_name = '-'.join([str(c) for c in cluster_terms[:5]]) if cluster_terms else None
    db_cluster_obj = self.db_cluster_model.objects.create(
        cluster_id=cluster_label_id,
        name=cluster_title,
        self_name=cluster_self_name,
        description=self.description or cluster_title,
        cluster_by=self.cluster_by_str,
        using=self.cluster_algorithm,
        created_date=self.start_date)
    # set m2m
    getattr(db_cluster_obj, self.db_cluster_model_m2m_name).set(cluster_item_id_list)
    # set default cluster name
    if self.use_default_name:
        db_cluster_obj.name = "Cluster #{}".format(db_cluster_obj.pk)
        db_cluster_obj.save()
    return db_cluster_obj