Python sklearn.cluster.DBSCAN Examples
The following are 30 code examples of sklearn.cluster.DBSCAN(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.cluster, or try the search function.
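Before the project-specific examples, here is a minimal, self-contained sketch of the call pattern most of them share (the eps and min_samples values are illustrative, not recommendations):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Toy data: three Gaussian blobs, standardized so eps is scale-free.
X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
X = StandardScaler().fit_transform(X)

db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels = db.labels_  # noise points are labeled -1
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("clusters:", n_clusters, "noise points:", list(labels).count(-1))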
Example #1
Source File: __init__.py From dials with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _dbscan_clustering(self):
    from sklearn.preprocessing import StandardScaler

    X = self.coords_reduced.as_numpy_array()
    X = StandardScaler().fit_transform(X)

    # Perform cluster analysis
    from sklearn.cluster import DBSCAN

    db = DBSCAN(
        eps=self.params.cluster.dbscan.eps,
        min_samples=self.params.cluster.dbscan.min_samples,
    ).fit(X)
    import numpy as np

    return flex.int(db.labels_.astype(np.int32))
Example #2
Source File: km.py From kepler-mapper with MIT License | 6 votes |
def __init__(self, cluster_algorithm=cluster.DBSCAN(eps=0.5, min_samples=3), nr_cubes=10,
             overlap_perc=0.1, scaler=preprocessing.MinMaxScaler(), reducer=None,
             color_function="distance_origin", link_local=False, verbose=1):
    self.clf = cluster_algorithm
    self.nr_cubes = nr_cubes
    self.overlap_perc = overlap_perc
    self.scaler = scaler
    self.color_function = color_function
    self.verbose = verbose
    self.link_local = link_local
    self.reducer = reducer
    self.chunk_dist = []
    self.overlap_dist = []
    self.d = []
    if self.verbose > 0:
        print("\nnr_cubes = %s \n\noverlap_perc = %s\n\nlink_local = %s\n\nClusterer = %s\n\nScaler = %s\n\n"
              % (self.nr_cubes, overlap_perc, self.link_local, str(self.clf), str(self.scaler)))
Example #3
Source File: find_parser.py From karonte with BSD 2-Clause "Simplified" License | 6 votes |
def get_cluster(self):
    scores = sorted([(b, max(i['stats'])) for b, i in self.all_candidates.items()],
                    key=lambda x: x[1], reverse=True)
    data = [s[1] for s in scores]
    X = np.matrix(ParserFinder.get_matrix(data))
    labels = list(DBSCAN(metric='precomputed').fit(X).labels_)

    clusters = []
    new_c = []
    old_l = 0
    index = 0

    for l in labels:
        b = scores[index][0]
        if old_l != l:
            clusters.append(new_c)
            new_c = []
        new_c.append(b)
        old_l = l
        index += 1

    self.clusters = list(clusters)
Example #4
Source File: find_parser.py From karonte with BSD 2-Clause "Simplified" License | 6 votes |
def get_cluster(self):
    scores = sorted([(b, max(i['stats'])) for b, i in self.all_candidates.items()],
                    key=lambda x: x[1], reverse=True)
    data = [s[1] for s in scores]
    X = np.matrix(ParserFinder.get_matrix(data))
    labels = list(DBSCAN(metric='precomputed').fit(X).labels_)

    clusters = []
    new_c = []
    old_l = 0
    index = 0

    for l in labels:
        b = scores[index][0]
        if old_l != l:
            clusters.append(new_c)
            new_c = []
        new_c.append(b)
        old_l = l
        index += 1

    self.clusters = list(clusters)
Example #5
Source File: pointsClustering.py From python-urbanPlanning with MIT License | 6 votes |
def affinityPropagationForPoints(dataArray, epsValue):
    # print("--------------------Clustering")
    data = dataArray
    a_T = datetime.datetime.now()
    # meter = degree * (2 * math.pi * 6378137.0) / 360; degree = 50 / (2 * math.pi * 6378137.0) * 360.
    # eps is the neighborhood distance threshold. Since the data are latitude/longitude
    # coordinates, the formula above converts between meters and degrees to ease tuning;
    # eps=0.0008 is roughly 90 m, so POI points within 90 m of each other fall into one cluster.
    # min_samples is the number of neighborhood samples a point needs to become a core point.
    # Tune both parameters against the data at hand until the clustering looks reasonable.
    db = cluster.DBSCAN(eps=epsValue, min_samples=3, metric='euclidean')
    y_db = db.fit_predict(data)  # get the predicted cluster labels
    b_T = datetime.datetime.now()
    # print("time span:", b_T - a_T)
    # print("_" * 50)
    pred = y_db
    # print(pred, len(np.unique(pred)))  # print the predicted labels and the number of clusters
    # print("-------------------cluster Finishing")
    return pred, np.unique(pred)  # return the DBSCAN predictions and the unique cluster labels

# convert point .shp to raster: write the point data as a raster using raster.SetGeoTransform;
# based on the official GDAL sample code
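The translated comment above reduces to a simple conversion rule between meters and degrees. A small sketch of that rule (the 6378137.0 m equatorial radius comes from the comment itself):

import math

EARTH_RADIUS_M = 6378137.0  # WGS84 equatorial radius, as used in the comment

def meters_to_degrees(meters):
    # degree = meter / (2 * pi * R) * 360
    return meters / (2 * math.pi * EARTH_RADIUS_M) * 360

def degrees_to_meters(degrees):
    # meter = degree * (2 * pi * R) / 360
    return degrees * (2 * math.pi * EARTH_RADIUS_M) / 360

print(meters_to_degrees(90))  # ~0.0008, the eps value the comment suggests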
Example #6
Source File: rasterPTSextraction_statistic_poi.py From python-urbanPlanning with MIT License | 6 votes |
def affinityPropagationForPoints(dataArray, epsValue):
    print("--------------------Clustering")
    data = dataArray
    t1 = time.time()
    # meter = degree * (2 * math.pi * 6378137.0) / 360; degree = 50 / (2 * math.pi * 6378137.0) * 360.
    # eps is the neighborhood distance threshold. Since the data are latitude/longitude
    # coordinates, the formula above converts between meters and degrees to ease tuning;
    # eps=0.0008 is roughly 90 m, so POI points within 90 m of each other form one cluster.
    # min_samples is the number of neighborhood samples a point needs to become a core point.
    # Tune both parameters against the data until the clustering looks reasonable.
    db = cluster.DBSCAN(eps=epsValue, min_samples=3, metric='euclidean')
    y_db = db.fit_predict(data)  # get the predicted cluster labels
    t2 = time.time()
    tDiff_af = t2 - t1  # time taken by the clustering
    print(tDiff_af)
    pred = y_db
    print(pred, len(np.unique(pred)))  # print the predicted labels and the number of clusters
    # t3 = time.time()
    # plt.close('all')
    # plt.figure(1, figsize=(20, 20))
    # plt.clf()
    # cm = plt.cm.get_cmap('nipy_spectral')  # built-in colormap
    # plt.scatter(data[..., 0], data[..., 1], s=10, alpha=0.8, c=pred, cmap=cm)  # color points by predicted label
    # plt.show()
    # t4 = time.time()
    # tDiff_plt = t4 - t3  # time taken to draw the plot
    # print(tDiff_plt)
    print("-------------------cluster Finishing")
    return pred, np.unique(pred)  # return the DBSCAN predictions and the unique cluster labels
Example #7
Source File: vegetationCluster.py From python-urbanPlanning with MIT License | 6 votes |
def affinityPropagationForPoints(data):
    t1 = time.time()
    db = cluster.DBSCAN(eps=16, min_samples=3, metric='euclidean')  # tune eps and min_samples for a suitable clustering
    y_db = db.fit_predict(data)  # get the predicted cluster labels
    t2 = time.time()
    tDiff_af = t2 - t1  # time taken by the clustering
    print("model training time:", tDiff_af)
    pred = y_db
    print("predicted labels and number of clusters:", pred, len(np.unique(pred)))
    t3 = time.time()
    plt.close('all')
    plt.figure(1, figsize=(15, 15))
    plt.clf()
    cm = plt.cm.get_cmap('nipy_spectral')  # built-in colormap
    sc = plt.scatter(data[..., 0], data[..., 1], s=10, alpha=0.8, c=pred, cmap=cm)  # color points by predicted label
    plt.show()
    t4 = time.time()
    tDiff_plt = t4 - t3  # time taken to draw the plot
    print("plot display time:", tDiff_plt)
    return pred, np.unique(pred)  # return the DBSCAN predictions and the unique cluster labels
Example #8
Source File: DBSCAN.py From cn-text-classifier with GNU General Public License v3.0 | 6 votes |
def plot_res(labels: list, n_cluster: int, num: int):
    colors = plt.cm.Spectral(np.linspace(0, 1, len(set(labels))))
    for k, col in zip(set(labels), colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        class_member_mask = (labels == k)
        xy = trainingData[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=10)
        xy = trainingData[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
    plt.title('DBSCAN')
    plt.savefig(settings.PLOT_DIR + 'db-%d-%d.png' % (n_cluster, num))
    plt.show()
Example #9
Source File: DBSCAN.py From aggregation with Apache License 2.0 | 6 votes |
def createRanges(pts):
    X_ = np.array(pts)
    db_ = DBSCAN(eps=step + 0.05, min_samples=1).fit(X_)
    labels = db_.labels_
    ranges = []
    for k in set(labels):
        class_member_mask = (labels == k)
        xy = X_[class_member_mask]
        epsilon_l, minPts = zip(*list(X_[class_member_mask]))
        epsilon_min, epsilon_max = min(epsilon_l), max(epsilon_l)
        assert min(minPts) == max(minPts)
        ranges.append((minPts[0], epsilon_min, epsilon_max))
    return ranges
Example #10
Source File: new_jungle.py From aggregation with Apache License 2.0 | 6 votes |
def get_user_pts(markings):
    user_pts = []
    X = np.asarray(markings)
    db = DBSCAN(eps=10, min_samples=3).fit(X)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # print n_clusters_
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            continue
        class_member_mask = (labels == k)
        xy = X[class_member_mask]
        user_pts.append(xy)
        x, y = zip(*xy)
        # plt.plot(x, y, "o")
    return user_pts
Example #11
Source File: cluster_tools.py From SUPPA with MIT License | 6 votes |
def cluster_analysis(dpsi, psivec, sig_threshold, dpsi_threshold, eps, minpts, metric,
                     indexes, clustering, separation, output):
    path = os.path.dirname(os.path.realpath(dpsi))
    os.chdir(path)
    psi_matrix, eventid_lst = process_cluster_input(dpsi, psivec, sig_threshold,
                                                    dpsi_threshold, indexes)
    if clustering == "DBSCAN":
        # eventid_labels_dict holds the clustering label of each event
        eventid_labels_dict, labels = DBSCAN_cluster(psi_matrix, eventid_lst, eps,
                                                     minpts, metric)
        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)
    else:
        # OPTICS
        points_list = create_points_list(psi_matrix, eventid_lst)  # transform the rows of psi_matrix into Points from optics.py
        optics = Optics(points_list, eps, minpts)  # maximum radius to be considered; cluster size >= 2 points
        optics.run()  # run the algorithm
        clusters = optics.cluster(separation)  # minimum threshold for clustering (upper limit separating the clusters)
        eventid_labels_dict, labels = generate_labels(clusters, eventid_lst)
        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)
Example #12
Source File: cluster_tools.py From SUPPA with MIT License | 6 votes |
def DBSCAN_cluster(psi_matrix, eventid_lst, dist, minpts, metric):
    # Setting logging preferences
    logger = logging.getLogger(__name__)

    # The "cosine" metric works only with the "brute" algorithm
    if metric == "cosine":
        alg = 'brute'
    else:
        alg = 'auto'

    try:
        db = DBSCAN(eps=dist, min_samples=minpts, metric=metric, algorithm=alg).fit(psi_matrix)
        labels = db.labels_
    except:
        logger.error("Unknown error: {}".format(sys.exc_info()))
        sys.exit(1)

    eventid_labels_dict = {k: v for k, v in zip(eventid_lst, labels)}
    return eventid_labels_dict, labels
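The fallback above reflects a scikit-learn constraint: the kd-tree and ball-tree neighbor searches do not support the cosine metric, so cosine distances have to go through the brute-force search (algorithm='auto' should also resolve to brute in this case). A minimal sketch with illustrative parameters:

from sklearn.cluster import DBSCAN

# eps and min_samples here are illustrative, not recommendations
db = DBSCAN(eps=0.3, min_samples=5, metric='cosine', algorithm='brute')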
Example #13
Source File: gen_sklearn.py From lmatools with BSD 2-Clause "Simplified" License | 6 votes |
def identify_clusters(self, data):
    """ For data with shape (N, D) in D dimensions, return a vector of labels
        of length N. min_points is the minimum number of points required to
        form a cluster. For the DBSCAN algorithm, this is min_samples for a
        core cluster.

        This function adopts the convention that clusters labeled with an ID
        of -1 are singleton points not belonging to a cluster, consistent
        with the convention of sklearn.cluster.DBSCAN.
    """
    db = DBSCAN(eps=1.0, min_samples=self.min_points, metric='euclidean')
    clusters = db.fit(data)
    labels = clusters.labels_.astype(int)
    return labels
Example #14
Source File: shifted_delta_cepstra.py From hunspeech with MIT License | 6 votes |
def get_classer(self, algo_name, classer, algo_dir):
    if not os.path.exists(algo_dir):
        os.mkdir(algo_dir)
    classer_fn = '{}_classer.npy'.format(os.path.join(algo_dir, algo_name))
    trafoed_fn = '{}_trafoed.npy'.format(os.path.join(algo_dir, algo_name))
    if os.path.isfile(classer_fn):
        return pickle.load(open(classer_fn, mode='rb'))
    else:
        if algo_name == 'DBSCAN':
            self.loop_estimate_bandwidth()
        logger.info('clustering all speech with {}'.format(algo_name))
        if hasattr(classer, 'fit') and hasattr(classer, 'predict'):
            classer.fit(self.sdc_all_speech)
        elif hasattr(classer, 'fit_transform'):  # TSNE
            all_speech_trafoed = classer.fit_transform(self.sdc_all_speech)
            np.save(open(trafoed_fn, mode='wb'), all_speech_trafoed)
        else:  # DBSCAN
            classer.fit_predict(self.sdc_all_speech)
        logger.info(classer.get_params())
        logger.info('dumping classifier')
        pickle.dump(classer, open(classer_fn, mode='wb'))
        return classer
Example #15
Source File: outlierdenstream.py From outlierdenstream with MIT License | 6 votes |
def initDBScan(self):
    """Init with DBSCAN"""
    db = DBSCAN(eps=0.05, min_samples=2)
    db.fit(self.buffer)
    labels = pd.DataFrame(db.labels_ + 1)  # shift labels so that noise (-1) becomes 0
    for x in range(1, labels[0].max() + 1):
        samples = self.buffer[labels[labels[0] == x].index]

        sample = Sample(samples[0], 0)
        sample.setTimestamp(1)
        mc = MicroCluster(1, self.lamb, self.pMicroCluster.N + 1)

        for sampleNumber in range(0, len(samples)):
            sample = Sample(samples[sampleNumber], sampleNumber)
            sample.setTimestamp(sampleNumber + 1)
            mc.insertSample(sample, self.currentTimestamp)

        self.pMicroCluster.insert(mc)
Example #16
Source File: test_keyed_models.py From spark-sklearn with Apache License 2.0 | 6 votes |
def test_correct_estimator_type(self):
    self.checkEstimatorType(KeyedEstimator(sklearnEstimator=PCA()), "transformer")
    self.checkEstimatorType(KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y"), "predictor")
    self.checkEstimatorType(KeyedEstimator(sklearnEstimator=DBSCAN()), "clusterer")
    self.checkEstimatorType(KeyedEstimator(sklearnEstimator=KMeans()), "clusterer")
    ke = KeyedEstimator(sklearnEstimator=KMeans(), estimatorType="transformer")
    self.checkEstimatorType(ke, "transformer")
    custom = KeyedModelTests._CustomClusterer()
    ke = KeyedEstimator(sklearnEstimator=custom)
    self.checkEstimatorType(ke, "clusterer")
    ke = KeyedEstimator(sklearnEstimator=custom, estimatorType="transformer")
    self.checkEstimatorType(ke, "transformer")
    custom = KeyedModelTests._CustomTransformer()
    self.checkEstimatorType(KeyedEstimator(sklearnEstimator=custom), "transformer")
Example #17
Source File: lexrankr.py From lexrankr with MIT License | 5 votes |
def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan',
             tagger='twitter',
             useful_tags=['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction',
                          'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number'],
             delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko,
             no_below_word_count=2, no_above_word_portion=0.85, max_dictionary_size=None,
             min_cluster_size=2, similarity_threshold=0.85, matrix_smoothing=False,
             n_clusters=None, compactify=True, **kwargs):
    self.decay_window = decay_window
    self.decay_alpha = decay_alpha
    if similarity == 'cosine':  # very, very slow :(
        self.vectorizer = DictVectorizer()
        self.uniform_sim = self._sim_cosine
    elif similarity == 'jaccard':
        self.uniform_sim = self._sim_jaccard
    elif similarity == 'normalized_cooccurrence':
        self.uniform_sim = self._sim_normalized_cooccurrence
    else:
        raise LexRankError("available similarity functions are: cosine, jaccard, normalized_cooccurrence")
    self.sim = lambda sentence1, sentence2: self.decay(sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
    self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters,
                                   min_token_length=min_token_length, stopwords=stopwords, **kwargs)
    if clustering == 'birch':
        self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
        self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
    elif clustering == 'dbscan':
        self._dbscan = DBSCAN()
        self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
    elif clustering == 'affinity':
        self._affinity = AffinityPropagation()
        self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
    elif clustering is None:
        self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
    else:
        raise LexRankError("available clustering algorithms are: birch, dbscan, affinity, no-clustering(use `None`)")
    self.no_below_word_count = no_below_word_count
    self.no_above_word_portion = no_above_word_portion
    self.max_dictionary_size = max_dictionary_size
    self.similarity_threshold = similarity_threshold
    self.min_cluster_size = min_cluster_size
    self.matrix_smoothing = matrix_smoothing
    self.compactify = compactify
Example #18
Source File: lma.py From lmatools with BSD 2-Clause "Simplified" License | 5 votes |
def cluster_chunk_pairs(clustered_output_target):
    """Receive chunks, and process overlapping pairs"""
    db = DBSCAN(eps=1.0, min_samples=10, metric='euclidean')
    chunk1 = (yield)
    try:
        while True:
            chunk2 = (yield)
            len1 = chunk1.shape[0]
            len2 = chunk2.shape[0]
            print(len1 + len2)

            # do stuff with chunk 1 and 2
            clusters = db.fit(np.vstack((chunk1, chunk2)))
            labels = clusters.labels_
            clustered_output_target.send((chunk1, labels[:len1]))

            # pull data out of chunk2 that was clustered as part of chunk 1
            chunk1_labelset = set(labels[:len1])
            if -1 in chunk1_labelset:
                chunk1_labelset.remove(-1)  # remove the singleton cluster ID - we want to retain these from chunk 2.
            clustered_in_chunk2 = np.fromiter(
                (True if label in chunk1_labelset else False
                 for i, label in enumerate(labels[len1:])), dtype=bool)
            clustered_output_target.send((chunk2[clustered_in_chunk2],
                                          labels[len1:][clustered_in_chunk2]))
            residuals = chunk2[clustered_in_chunk2 == False]

            # prepare for another chunk
            if len(residuals) == 0:
                residuals = chunk1[0:0, :]  # empty array that preserves the number of dimensions in the data vector - no obs.
            del chunk1
            chunk1 = np.asarray(residuals)
            del residuals
    except GeneratorExit:
        clusters = db.fit(chunk1)
        labels = clusters.labels_
        clustered_output_target.send((chunk1, labels))
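Because this is a coroutine rather than a plain function, it has to be primed before it can accept data. A hypothetical driver loop (chunks and sink are stand-ins for the caller's data stream and a downstream primed coroutine):

pairs = cluster_chunk_pairs(sink)  # sink: a downstream coroutine, already primed
next(pairs)                        # advance to the first (yield)
for chunk in chunks:               # chunks: an iterable of (N, D) numpy arrays
    pairs.send(chunk)
pairs.close()                      # raises GeneratorExit inside, flushing the final chunk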
Example #19
Source File: sklearn_cluster.py From learn-to-cluster with MIT License | 5 votes |
def knn_dbscan(feats, eps, min_samples, prefix, name, knn_method, knn, th_sim, **kwargs):
    knn_prefix = os.path.join(prefix, 'knns', name)
    knns = build_knns(knn_prefix, feats, knn_method, knn)
    sparse_affinity = fast_knns2spmat(knns, knn, th_sim, use_sim=False)
    db = cluster.DBSCAN(eps=eps,
                        min_samples=min_samples,
                        n_jobs=mp.cpu_count(),
                        metric='precomputed').fit(sparse_affinity)
    return db.labels_
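The sparse precomputed pattern used here is worth isolating: scikit-learn treats only the stored entries of a sparse distance matrix as neighbor candidates, so memory scales with the kNN graph rather than with n². A reduced sketch with illustrative values:

from scipy.sparse import csr_matrix
from sklearn.cluster import DBSCAN

# 4 samples; distances stored only for kNN pairs (kept symmetric)
rows = [0, 1, 1, 2, 2, 3]
cols = [1, 0, 2, 1, 3, 2]
dists = [0.2, 0.2, 0.9, 0.9, 0.1, 0.1]
D = csr_matrix((dists, (rows, cols)), shape=(4, 4))

labels = DBSCAN(eps=0.5, min_samples=2, metric='precomputed').fit(D).labels_
print(labels)  # expected: [0 0 1 1] -- the two close pairs each form a cluster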
Example #20
Source File: sklearn_cluster.py From learn-to-cluster with MIT License | 5 votes |
def dbscan(feat, eps, min_samples, **kwargs):
    db = cluster.DBSCAN(eps=eps,
                        min_samples=min_samples,
                        n_jobs=mp.cpu_count()).fit(feat)
    return db.labels_
Example #21
Source File: density.py From trajminer with MIT License | 5 votes |
def __init__(self, eps=0.5, min_samples=5, measure='precomputed', n_jobs=1):
    self.dbscan = skDBSCAN(eps=eps, min_samples=min_samples,
                           metric='precomputed', n_jobs=n_jobs)
    self.eps = eps
    self.min_samples = min_samples
    self.measure = measure
    self.n_jobs = n_jobs
Example #22
Source File: sklearn_sycl.py From daal4py with Apache License 2.0 | 5 votes |
def dbscan():
    print("DBSCAN")
    X = np.array([[1., 2.], [2., 2.], [2., 3.],
                  [8., 7.], [8., 8.], [25., 80.]])
    clustering = DBSCAN(eps=3, min_samples=2).fit(X)
    print("clustering.labels_")
    print(clustering.labels_)
    print("clustering")
    print(clustering)
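This mirrors the toy array used in scikit-learn's own DBSCAN docstring, so the expected result is known: the first three points form one cluster, the next two another, and [25., 80.] is flagged as noise (-1). With a recent scikit-learn the run prints roughly:

DBSCAN
clustering.labels_
[ 0  0  0  1  1 -1]
clustering
DBSCAN(eps=3, min_samples=2)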
Example #23
Source File: active_weather.py From aggregation with Apache License 2.0 | 5 votes |
def __dbscan_threshold__(img):
    ink_pixels = np.where(img > 0)
    X = np.asarray(zip(ink_pixels[1], ink_pixels[0]))  # Python 2: zip returns a list; wrap in list() under Python 3
    print("doing dbscan: " + str(X.shape))
    db = DBSCAN(eps=1, min_samples=5).fit(X)
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

    return_image = np.zeros(gray.shape, np.uint8)
    return_image.fill(255)

    print("going through dbscan results")
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            continue
        class_member_mask = (labels == k)
        # temp = np.zeros(X.shape)
        xy = X[class_member_mask]

        max_value = gray[xy[:, 1], xy[:, 0]].max()
        median = np.median(gray[xy[:, 1], xy[:, 0]])
        mean = np.mean(gray[xy[:, 1], xy[:, 0]])
        # print(max_value, median, mean)
        if True:  # median > 120:
            x_max, y_max = np.max(xy, axis=0)
            x_min, y_min = np.min(xy, axis=0)
            if min(x_max - x_min, y_max - y_min) >= 10:
                return_image[xy[:, 1], xy[:, 0]] = gray[xy[:, 1], xy[:, 0]]
Example #24
Source File: echoDoc0.1.py From EchoBurst with MIT License | 5 votes |
def newDBSCANModel(vectorFile, outputFile):
    model = Doc2Vec.load("Models\\" + vectorFile)
    vecs = []
    for doc in range(0, len(model.docvecs)):
        doc_vec = model.docvecs[doc]
        # print doc_vec
        vecs.append(doc_vec.reshape((1, 300)))
    doc_vecs = np.array(vecs, dtype='float')  # TSNE expects float type values
    # print doc_vecs
    docs = []
    for i in doc_vecs:
        docs.append(i[0])
    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)

    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()

    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")

    print('Estimated number of clusters: %d' % n_clusters_)
Example #25
Source File: DBSCAN.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, options):
    self.handle_options(options)
    out_params = convert_params(
        options.get('params', {}),
        floats=['eps'],
    )
    self.estimator = _DBSCAN(**out_params)
Example #26
Source File: sliding2.py From aggregation with Apache License 2.0 | 5 votes |
def get_window_size():
    non_white_points = np.where(img[:, :500] != 255)
    non_white_points = np.asarray(zip(non_white_points[0], non_white_points[1]))  # Python 2: zip returns a list; wrap in list() under Python 3
    print(non_white_points.shape)
    db = DBSCAN(eps=1, min_samples=5).fit(non_white_points)
    labels = db.labels_
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    print("here")
    heights = []
    widths = []
    for k, col in zip(unique_labels, colors):
        if k == -1:
            continue
        # print(k)
        class_member_mask = (labels == k)
        xy = non_white_points[class_member_mask]
        min_y, min_x = np.min(xy, axis=0)  # uncommented: min_x and min_y are used below
        max_y, max_x = np.max(xy, axis=0)
        if min(max_x - min_x, max_y - min_y) <= 1:
            continue
        heights.append(max_y - min_y)
        widths.append(max_x - min_x)
Example #27
Source File: clusterer.py From yelp with GNU Lesser General Public License v2.1 | 5 votes |
def dbscan(matrix):
    dbscan = skcluster.DBSCAN(eps=0.3, min_samples=50, metric='euclidean')
    # dbscan = skcluster.DBSCAN(eps=0.3, min_samples=50,
    #                           metric=nltk.cosine_distance)
    dbscan.fit(matrix)
    labels = dbscan.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters:', n_clusters_)
    return labels

# OK
Example #28
Source File: run_center_clustering.py From pyImSegm with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cluster_center_candidates(points, max_dist=100, min_samples=1):
    """ cluster center candidates by given density clustering

    :param [[float]] points: points
    :param float max_dist: maximal distance among points
    :param int min_samples: minimal number of samples
    :return (ndarray, [int]):
    """
    points = np.array(points)
    if not list(points):
        return points, []
    dbscan = cluster.DBSCAN(eps=max_dist, min_samples=min_samples)
    dbscan.fit(points)
    labels = dbscan.labels_.copy()
    centers = []
    for i in range(max(labels) + 1):
        clust = points[labels == i]
        if len(clust) > 0:
            center = np.mean(clust, axis=0)
            centers.append(center)
    return np.array(centers), labels
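A hypothetical call, to make the return shape concrete (toy points; with min_samples=1 every point is a core point, so nothing is labeled noise):

pts = [[0, 0], [1, 1], [200, 200], [201, 199]]
centers, labels = cluster_center_candidates(pts, max_dist=10, min_samples=1)
# labels  -> [0, 0, 1, 1]
# centers -> [[  0.5,   0.5],
#             [200.5, 199.5]]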
Example #29
Source File: dbscan_analysis.py From ns4_chatbot with Apache License 2.0 | 5 votes |
def DBSCAN_analysis(self, tfidf_data):
    dbscan = DBSCAN()  # min_samples=5 is the default
    labels = dbscan.fit_predict(tfidf_data)
    logger.debug("number of classes DBSCAN produced: %d", len(set(labels)))
    # logger.debug(labels)
    cores = dbscan.core_sample_indices_
    # logger.debug("dbscan components:%r", dbscan.components_)
    return labels, dbscan, cores

# First turn the words into tf-idf; from then on the tf-idf vocabulary must not change. Two things to note:
# - verify that the vocabulary stays fixed, so the tf-idf vectors stay stable
# - make sure that words absent from the vocabulary in an incoming token list do not raise an error
# Then start the KNN fit: feed the previously saved classes into KNN to build the KNN model,
# use that model to predict the class of each newly passed-in token list,
# and print 3 random samples from the predicted class.
# This is the core method!!!!
Example #30
Source File: cluster.py From PAST-ReID with MIT License | 5 votes |
def dbscancluster(self, dist, iteration=-1):
    # DBSCAN cluster
    tri_mat = np.triu(dist, 1)  # tri_mat.dim=2
    tri_mat = tri_mat[np.nonzero(tri_mat)]  # tri_mat.dim=1
    tri_mat = np.sort(tri_mat, axis=None)
    top_num = np.round(self.args.rho * tri_mat.size).astype(int)
    eps = tri_mat[:top_num].mean()
    print('eps in cluster: {:.3f}'.format(eps))
    clusterer = DBSCAN(eps=eps, min_samples=self.args.dbscan_minsample,
                       metric='precomputed', n_jobs=8)
    labels = clusterer.fit_predict(dist)

    # select & cluster images as training set of this epochs
    print('Clustering and labeling...')
    num_ids = len(set(labels)) - 1
    print('Epoch {} have {} training ids'.format(iteration, num_ids))

    # generate new dataset
    new_dataset = []
    new_indices = []
    for (fname, _, _), label, indice in zip(self.traindataset, labels, self.old_indices):
        if label == -1:
            continue
        # no need to change the code in trainer.py's _parsing_input function or the sampler function after adding 0
        new_dataset.append((fname, label, indice))
        new_indices.append(indice)
    print('Iteration {} have {} training images'.format(iteration, len(new_dataset)))

    return new_dataset, new_indices
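The interesting part of this last example is the data-driven eps: it is the mean of the smallest rho-fraction of all pairwise distances. That heuristic in isolation (a sketch; the rho default here is illustrative, not the project's setting):

import numpy as np

def estimate_eps(dist, rho=1.6e-3):
    """dist: square pairwise-distance matrix; rho: fraction of closest pairs to average."""
    tri = np.triu(dist, 1)                      # upper triangle, excluding the diagonal
    tri = np.sort(tri[np.nonzero(tri)], axis=None)
    top_num = np.round(rho * tri.size).astype(int)
    return tri[:top_num].mean()                 # mean of the rho smallest distances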