Python sklearn.cluster.DBSCAN Examples

The following are 29 code examples of sklearn.cluster.DBSCAN(). The original project and source file are noted above each example. You may also want to check out all available functions/classes of the module sklearn.cluster.
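
Before the project examples, here is a minimal usage sketch (not taken from any of the projects below) showing the core API these examples build on; the data and the eps/min_samples values are purely illustrative:

import numpy as np
from sklearn.cluster import DBSCAN

# two dense groups plus one far-away outlier (illustrative data)
X = np.array([[1.0, 1.1], [1.2, 0.9], [0.9, 1.0],
              [8.0, 8.1], [8.2, 7.9], [25.0, 30.0]])
db = DBSCAN(eps=0.5, min_samples=2).fit(X)   # eps: neighborhood radius; min_samples: core-point threshold
print(db.labels_)                            # [0 0 0 1 1 -1]; -1 marks noise points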
Example #1
Source File: __init__.py    From dials with BSD 3-Clause "New" or "Revised" License
def _dbscan_clustering(self):
        from sklearn.preprocessing import StandardScaler

        X = self.coords_reduced.as_numpy_array()
        X = StandardScaler().fit_transform(X)

        # Perform cluster analysis
        from sklearn.cluster import DBSCAN

        db = DBSCAN(
            eps=self.params.cluster.dbscan.eps,
            min_samples=self.params.cluster.dbscan.min_samples,
        ).fit(X)
        import numpy as np

        return flex.int(db.labels_.astype(np.int32)) 
Example #2
Source File: km.py    From kepler-mapper with MIT License
def __init__(self, cluster_algorithm=cluster.DBSCAN(eps=0.5,min_samples=3), nr_cubes=10, 
         overlap_perc=0.1, scaler=preprocessing.MinMaxScaler(), reducer=None, color_function="distance_origin", 
         link_local=False, verbose=1):
    self.clf = cluster_algorithm
    self.nr_cubes = nr_cubes
    self.overlap_perc = overlap_perc
    self.scaler = scaler
    self.color_function = color_function
    self.verbose = verbose
    self.link_local = link_local
    self.reducer = reducer
    
    self.chunk_dist = []
    self.overlap_dist = []
    self.d = []
    
    if self.verbose > 0:
      print("\nnr_cubes = %s \n\noverlap_perc = %s\n\nlink_local = %s\n\nClusterer = %s\n\nScaler = %s\n\n"%(self.nr_cubes, overlap_perc, self.link_local, str(self.clf),str(self.scaler))) 
Example #3
Source File: find_parser.py    From karonte with BSD 2-Clause "Simplified" License
def get_cluster(self):
        scores = sorted([(b, max(i['stats'])) for b, i in self.all_candidates.items()], key=lambda x: x[1], reverse=True)
        data = [s[1] for s in scores]
        X = np.matrix(ParserFinder.get_matrix(data))
        labels = list(DBSCAN(metric='precomputed').fit(X).labels_)
        clusters = []
        new_c = []
        old_l = 0
        index = 0
        for l in labels:
            b = scores[index][0]
            if old_l != l:
                clusters.append(new_c)
                new_c = []

            new_c.append(b)
            old_l = l
            index += 1
        self.clusters = list(clusters) 
Example #4
Source File: pointsClustering.py    From python-urbanPlanning with MIT License
def affinityPropagationForPoints(dataArray,epsValue):
    # print("--------------------Clustering")
    data=dataArray
    a_T = datetime.datetime.now()    
    db=cluster.DBSCAN(eps=epsValue,min_samples=3,metric='euclidean')  # meter = degree*(2*math.pi*6378137.0)/360; degree = 50/(2*math.pi*6378137.0)*360. When tuning, eps is the neighborhood distance threshold; because the data are longitude/latitude coordinates, the formulas above convert between meters and degrees. Here eps=0.0008 is roughly 90 m, so POI points within 90 m of one another fall into the same cluster. min_samples is the number of neighborhood samples a point needs in order to become a core point. Both parameters have to be tuned for the data at hand until the clustering looks reasonable.
    y_db=db.fit_predict(data)  # get the predicted cluster labels
    
    b_T= datetime.datetime.now()
    # print("time span:", b_T-a_T)
    # print("_"*50) 
    
    pred=y_db  
    # print(pred,len(np.unique(pred)))  # print the predicted labels and the number of clusters

    # print("-------------------cluster Finishing")
    return pred,np.unique(pred)  # return the DBSCAN label predictions and the unique cluster labels

#convert points .shp to raster: write the point data as a raster using raster.SetGeoTransform, following the official GDAL sample code
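
As an aside, the eps comment in the example above converts between meters and degrees so that the neighborhood radius can be reasoned about on longitude/latitude data; a minimal sketch of that conversion, reusing the Earth radius from the comment, reproduces the roughly-90 m figure for eps=0.0008:

import math

EARTH_RADIUS = 6378137.0                       # metres, as in the comment above
meters_per_degree = 2 * math.pi * EARTH_RADIUS / 360
print(0.0008 * meters_per_degree)              # about 89 m (along a meridian / at the equator)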
Example #5
Source File: rasterPTSextraction_statistic_poi.py    From python-urbanPlanning with MIT License
def affinityPropagationForPoints(dataArray,epsValue):
    print("--------------------Clustering")
    data=dataArray
    t1=time.time()     
    db=cluster.DBSCAN(eps=epsValue,min_samples=3,metric='euclidean')  # meter = degree*(2*math.pi*6378137.0)/360; degree = 50/(2*math.pi*6378137.0)*360. When tuning, eps is the neighborhood distance threshold; because the data are longitude/latitude coordinates, the formulas above convert between meters and degrees. Here eps=0.0008 is roughly 90 m, so POI points within 90 m of one another fall into the same cluster. min_samples is the number of neighborhood samples a point needs in order to become a core point. Both parameters have to be tuned for the data at hand until the clustering looks reasonable.
    y_db=db.fit_predict(data)  # get the predicted cluster labels
    t2=time.time()    
    tDiff_af=t2-t1  # time spent on clustering
    print(tDiff_af)
    
    pred=y_db  
    print(pred,len(np.unique(pred)))  # print the predicted labels and the number of clusters
    
#    t3=time.time()
#    plt.close('all')
#    plt.figure(1,figsize=(20,20))
#    plt.clf()
#    cm=plt.cm.get_cmap('nipy_spectral')  # get a built-in colormap
#    plt.scatter(data[...,0],data[...,1],s=10,alpha=0.8,c=pred,cmap=cm)  # color the points by the predicted labels via the colormap
#    plt.show()
#    t4=time.time()
#    tDiff_plt=t4-t3  # time spent rendering the plot
#    print(tDiff_plt)
    print("-------------------cluster Finishing")
    return pred,np.unique(pred)  # return the DBSCAN label predictions and the unique cluster labels 
Example #6
Source File: vegetationCluster.py    From python-urbanPlanning with MIT License
def affinityPropagationForPoints(data):
    t1=time.time()     
    db=cluster.DBSCAN(eps=16,min_samples=3,metric='euclidean')  # tune eps and min_samples to obtain a suitable clustering
    y_db=db.fit_predict(data)  # get the predicted cluster labels
    t2=time.time()    
    tDiff_af=t2-t1  # time spent on clustering
    print("model training time:",tDiff_af)
    
    pred=y_db  
    print("预测类标,与簇数:",pred,len(np.unique(pred)))  #打印查看预测类标和计算聚类簇数
    
    t3=time.time()
    plt.close('all')
    plt.figure(1,figsize=(15,15))
    plt.clf()
    cm=plt.cm.get_cmap('nipy_spectral')  # get a built-in colormap
    sc=plt.scatter(data[...,0],data[...,1],s=10,alpha=0.8,c=pred,cmap=cm)  # color the points by the predicted labels via the colormap
    plt.show()
    t4=time.time()
    tDiff_plt=t4-t3  # time spent rendering the plot
    print("plot rendering time:",tDiff_plt)
    return pred,np.unique(pred)  # return the DBSCAN label predictions and the unique cluster labels 
Example #7
Source File: DBSCAN.py    From cn-text-classifier with GNU General Public License v3.0
def plot_res(labels: list, n_cluster: int, num: int):
    colors = plt.cm.Spectral(np.linspace(0, 1, len(set(labels))))
    for k, col in zip(set(labels), colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        class_member_mask = (labels == k)
        xy = trainingData[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=10)
        xy = trainingData[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
    plt.title('DBSCAN')
    plt.savefig(settings.PLOT_DIR + 'db-%d-%d.png' % (n_cluster, num))
    plt.show() 
Example #8
Source File: DBSCAN.py    From aggregation with Apache License 2.0
def createRanges(pts):
    X_ = np.array(pts)
    db_ = DBSCAN(eps=step+0.05, min_samples=1).fit(X_)
    labels = db_.labels_

    ranges = []
    for k in set(labels):
        class_member_mask = (labels == k)
        xy = X_[class_member_mask]

        epsilon_l,minPts = zip(*list(X_[class_member_mask]))
        epsilon_min,epsilon_max = min(epsilon_l),max(epsilon_l)

        assert(min(minPts) == max(minPts))
        ranges.append((minPts[0],epsilon_min,epsilon_max))


    return ranges 
Example #9
Source File: new_jungle.py    From aggregation with Apache License 2.0
def get_user_pts(markings):
    user_pts = []
    X = np.asarray(markings)
    db = DBSCAN(eps=10, min_samples=3).fit(X)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # print n_clusters_
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            continue

        class_member_mask = (labels == k)

        xy = X[class_member_mask]
        user_pts.append(xy)
        x,y = zip(*xy)
        # plt.plot(x,y,"o")


    return user_pts 
Example #10
Source File: cluster_tools.py    From SUPPA with MIT License
def cluster_analysis(dpsi, psivec, sig_threshold, dpsi_threshold, eps, minpts, metric, indexes, clustering,
                     separation, output):

    path = os.path.dirname(os.path.realpath(dpsi))
    os.chdir(path)

    psi_matrix, eventid_lst = process_cluster_input(dpsi, psivec, sig_threshold, dpsi_threshold, indexes)

    if(clustering=="DBSCAN"):
        eventid_labels_dict, labels = DBSCAN_cluster(psi_matrix, eventid_lst, eps, minpts, metric)
        # eventid_labels_dict holds the clustering label for each event

        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)

    else:
        #OPTICS
        points_list = create_points_list(psi_matrix, eventid_lst) #Transform the points on psi_matrix to Points from optics.py
        optics = Optics(points_list, eps, minpts)  # Maximum radius to be considered, cluster size >= 2 points
        optics.run()  # run the algorithm
        clusters = optics.cluster(separation)  # minimum threshold for clustering (upper limit to separate the clusters)
        eventid_labels_dict, labels = generate_labels(clusters, eventid_lst)
        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output) 
Example #11
Source File: cluster_tools.py    From SUPPA with MIT License
def DBSCAN_cluster(psi_matrix, eventid_lst, dist, minpts, metric):

    # Setting logging preferences
    logger = logging.getLogger(__name__)

    # The "cosine" metric works only with the "brute" algorithm
    if metric == "cosine":
        alg = 'brute'
    else:
        alg = 'auto'

    try:
        db = DBSCAN(eps=dist, min_samples=minpts, metric=metric, algorithm=alg).fit(psi_matrix)
        labels = db.labels_
    except:
        logger.error("Unknown error: {}".format(sys.exc_info()))
        sys.exit(1)

    eventid_labels_dict = {k: v for k, v in zip(eventid_lst, labels)}

    return eventid_labels_dict, labels 
Example #12
Source File: gen_sklearn.py    From lmatools with BSD 2-Clause "Simplified" License
def identify_clusters(self, data):
        """ For data with shape (N, D) in D dimensions, return 
            a vector of labels of length N. 
        
            min_points is the minimum number of points required to form
            a cluster. For the DBSCAN algorithm, this is min_samples for
            a core cluster.
    
            This function adopts the convention that clusters labeled
            with an ID of -1 are singleton points not belonging to a 
            cluster, consistent with the convention of sklearn.cluster.DBSCAN
        """
        db = DBSCAN(eps=1.0, min_samples=self.min_points, metric='euclidean')
        clusters = db.fit(data)
        labels = clusters.labels_.astype(int)
        return labels 
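
As a generic illustration (not part of lmatools) of the -1 noise convention described in the docstring above, labels can be post-processed like this; the label array here is made up:

import numpy as np

labels = np.array([0, 0, 1, -1, 1, -1])                       # hypothetical output of identify_clusters
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)    # same pattern as several examples above
noise_count = int((labels == -1).sum())
print(n_clusters, noise_count)                                # 2 clusters, 2 noise/singleton points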
Example #13
Source File: shifted_delta_cepstra.py    From hunspeech with MIT License
def get_classer(self, algo_name, classer, algo_dir):
        if not os.path.exists(algo_dir):
            os.mkdir(algo_dir)
        classer_fn = '{}_classer.npy'.format(os.path.join(algo_dir, algo_name))
        trafoed_fn = '{}_trafoed.npy'.format(os.path.join(algo_dir, algo_name))
        if os.path.isfile(classer_fn):
            return pickle.load(open(classer_fn, mode='rb'))
        else:
            if algo_name == 'DBSCAN':
                self.loop_estimate_bandwidth()
            logger.info('clustering all speech with {}'.format(algo_name))
            if hasattr(classer, 'fit') and hasattr(classer, 'predict'):
                classer.fit(self.sdc_all_speech)
            elif hasattr(classer, 'fit_transform'): # TSNE
                all_speech_trafoed = classer.fit_transform(self.sdc_all_speech)
                np.save(open(trafoed_fn, mode='wb'), all_speech_trafoed)
            else: # DBSCAN
                classer.fit_predict(self.sdc_all_speech)
            logger.info(classer.get_params())
            logger.info('dumping classifier')
            pickle.dump(classer, open(classer_fn, mode='wb'))
            return classer 
Example #14
Source File: outlierdenstream.py    From outlierdenstream with MIT License
def initDBScan(self):

        """
        Init with DBSCAN
        """                     

        db=DBSCAN(eps=0.05, min_samples=2)
        db.fit(self.buffer)
        labels = pd.DataFrame(db.labels_+1) 
        for x in range(1, labels[0].max()+1):
            samples = self.buffer[labels[labels[0]==x].index]

            sample = Sample(samples[0], 0)
            sample.setTimestamp(1)

            mc = MicroCluster(1, self.lamb, self.pMicroCluster.N + 1)

            for sampleNumber in range(0, len(samples)):
                sample = Sample(samples[sampleNumber], sampleNumber)
                sample.setTimestamp(sampleNumber+1)
                mc.insertSample(sample, self.currentTimestamp)

            self.pMicroCluster.insert(mc) 
Example #15
Source File: test_keyed_models.py    From spark-sklearn with Apache License 2.0
def test_correct_estimator_type(self):
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=PCA()), "transformer")

        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y"),
                                "predictor")

        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=DBSCAN()), "clusterer")

        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=KMeans()), "clusterer")

        ke = KeyedEstimator(sklearnEstimator=KMeans(), estimatorType="transformer")
        self.checkEstimatorType(ke, "transformer")

        custom = KeyedModelTests._CustomClusterer()
        ke = KeyedEstimator(sklearnEstimator=custom)
        self.checkEstimatorType(ke, "clusterer")

        ke = KeyedEstimator(sklearnEstimator=custom, estimatorType="transformer")
        self.checkEstimatorType(ke, "transformer")

        custom = KeyedModelTests._CustomTransformer()
        self.checkEstimatorType(KeyedEstimator(sklearnEstimator=custom), "transformer") 
Example #16
Source File: lexrankr.py    From lexrankr with MIT License
def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan', tagger='twitter', useful_tags=['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number'], delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko, no_below_word_count=2, no_above_word_portion=0.85, max_dictionary_size=None, min_cluster_size=2, similarity_threshold=0.85, matrix_smoothing=False, n_clusters=None, compactify=True, **kwargs):
        self.decay_window = decay_window
        self.decay_alpha = decay_alpha
        if similarity == 'cosine':  # very, very slow :(
            self.vectorizer = DictVectorizer()
            self.uniform_sim = self._sim_cosine
        elif similarity == 'jaccard':
            self.uniform_sim = self._sim_jaccard
        elif similarity == 'normalized_cooccurrence':
            self.uniform_sim = self._sim_normalized_cooccurrence
        else:
            raise LexRankError("available similarity functions are: cosine, jaccard, normalized_cooccurrence")
        self.sim = lambda sentence1, sentence2: self.decay(sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
        self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_token_length=min_token_length, stopwords=stopwords, **kwargs)
        if clustering == 'birch':
            self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
            self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
        elif clustering == 'dbscan':
            self._dbscan = DBSCAN()
            self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
        elif clustering == 'affinity':
            self._affinity = AffinityPropagation()
            self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
        elif clustering is None:
            self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
        else:
            raise LexRankError("available clustering algorithms are: birch, markov, no-clustering(use `None`)")
        self.no_below_word_count = no_below_word_count
        self.no_above_word_portion = no_above_word_portion
        self.max_dictionary_size = max_dictionary_size
        self.similarity_threshold = similarity_threshold
        self.min_cluster_size = min_cluster_size
        self.matrix_smoothing = matrix_smoothing
        self.compactify = compactify 
Example #17
Source File: lma.py    From lmatools with BSD 2-Clause "Simplified" License
def cluster_chunk_pairs(clustered_output_target):
    db = DBSCAN(eps=1.0, min_samples=10, metric='euclidean')
    
    """Receive chunks, and process overlapping pairs"""
    chunk1 = (yield)
    try:
        while True:
            chunk2 = (yield)
            len1 = chunk1.shape[0]
            len2 = chunk2.shape[0]
            print(len1+len2)
            
            # do stuff with chunk 1 and 2
            clusters = db.fit(np.vstack((chunk1, chunk2)))
            labels = clusters.labels_
            
            clustered_output_target.send((chunk1, labels[:len1]))
            
            # pull data out of chunk2 that was clustered as part of chunk 1
            chunk1_labelset = set(labels[:len1])
            if -1 in chunk1_labelset:
                chunk1_labelset.remove(-1) # remove the singleton cluster ID - we want to retain these from chunk 2.
            clustered_in_chunk2 = np.fromiter( ( True if label in chunk1_labelset else False for i,label in enumerate(labels[len1:])) , dtype=bool)
            clustered_output_target.send((chunk2[clustered_in_chunk2], labels[len1:][clustered_in_chunk2]))  
            residuals = chunk2[clustered_in_chunk2==False]
            
            # prepare for another chunk
            if len(residuals) == 0:
                residuals = chunk1[0:0,:] # empty array that preserves the number of dimensions in the data vector - no obs.
            del chunk1
            chunk1 = np.asarray(residuals)
            del residuals
    except GeneratorExit:
        clusters = db.fit(chunk1)
        labels = clusters.labels_
        clustered_output_target.send((chunk1, labels)) 
Example #18
Source File: sklearn_cluster.py    From learn-to-cluster with MIT License
def knn_dbscan(feats, eps, min_samples, prefix, name, knn_method, knn, th_sim,
               **kwargs):
    knn_prefix = os.path.join(prefix, 'knns', name)
    knns = build_knns(knn_prefix, feats, knn_method, knn)
    sparse_affinity = fast_knns2spmat(knns, knn, th_sim, use_sim=False)
    db = cluster.DBSCAN(eps=eps,
                        min_samples=min_samples,
                        n_jobs=mp.cpu_count(),
                        metric='precomputed').fit(sparse_affinity)
    return db.labels_ 
Example #19
Source File: sklearn_cluster.py    From learn-to-cluster with MIT License
def dbscan(feat, eps, min_samples, **kwargs):
    db = cluster.DBSCAN(eps=eps,
                        min_samples=min_samples,
                        n_jobs=mp.cpu_count()).fit(feat)
    return db.labels_ 
Example #20
Source File: density.py    From trajminer with MIT License
def __init__(self, eps=0.5, min_samples=5, measure='precomputed',
                 n_jobs=1):
        self.dbscan = skDBSCAN(eps=eps, min_samples=min_samples,
                               metric='precomputed', n_jobs=n_jobs)
        self.eps = eps
        self.min_samples = min_samples
        self.measure = measure
        self.n_jobs = n_jobs 
Example #21
Source File: sklearn_sycl.py    From daal4py with Apache License 2.0
def dbscan():
    print("DBSCAN")
    X = np.array([[1., 2.], [2., 2.], [2., 3.],
                  [8., 7.], [8., 8.], [25., 80.]])
    clustering = DBSCAN(eps=3, min_samples=2).fit(X)
    print("clustering.labels_")
    print(clustering.labels_)
    print("clustering")
    print(clustering) 
Example #22
Source File: active_weather.py    From aggregation with Apache License 2.0
def __dbscan_threshold__(img):
    ink_pixels = np.where(img>0)
    X = np.asarray(zip(ink_pixels[1],ink_pixels[0]))
    print("doing dbscan: " + str(X.shape))
    db = DBSCAN(eps=1, min_samples=5).fit(X)

    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

    return_image = np.zeros(gray.shape,np.uint8)
    return_image.fill(255)

    print("going through dbscan results")
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            continue

        class_member_mask = (labels == k)
        # temp = np.zeros(X.shape)

        xy = X[class_member_mask]

        max_value = gray[xy[:, 1], xy[:, 0]].max()
        median = np.median(gray[xy[:, 1], xy[:, 0]])
        mean = np.mean(gray[xy[:, 1], xy[:, 0]])
        # print(max_value,median,mean)

        if True:#median > 120:
            x_max,y_max = np.max(xy,axis=0)
            x_min,y_min = np.min(xy,axis=0)
            if min(x_max-x_min,y_max-y_min) >= 10:
                return_image[xy[:, 1], xy[:, 0]] = gray[xy[:, 1], xy[:, 0]] 
Example #23
Source File: echoDoc0.1.py    From EchoBurst with MIT License
def newDBSCANModel(vectorFile, outputFile):
    model = Doc2Vec.load("Models\\" + vectorFile)
    vecs = []
    for doc in range(0, len(model.docvecs)):
        doc_vec = model.docvecs[doc]
        # print doc_vec
        vecs.append(doc_vec.reshape((1, 300)))

    doc_vecs = np.array(vecs, dtype='float')  # TSNE expects float type values

    # print doc_vecs
    docs = []
    for i in doc_vecs:
        docs.append(i[0])
    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)


    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in
                                                            model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")

    print('Estimated number of clusters: %d' % n_clusters_) 
Example #24
Source File: DBSCAN.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(options.get('params', {}), floats=['eps'])

        self.estimator = _DBSCAN(**out_params) 
Example #25
Source File: sliding2.py    From aggregation with Apache License 2.0
def get_window_size():
    non_white_points = np.where(img[:, :500] != 255)
    non_white_points = np.asarray(zip(non_white_points[0], non_white_points[1]))
    print(non_white_points.shape)
    db = DBSCAN(eps=1, min_samples=5).fit(non_white_points)
    labels = db.labels_

    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    print("here")
    heights = []
    widths = []
    for k, col in zip(unique_labels, colors):
        if k == -1:
            continue
        # print(k)

        class_member_mask = (labels == k)
        xy = non_white_points[class_member_mask]
        #
        min_y, min_x = np.min(xy, axis=0)
        max_y, max_x = np.max(xy, axis=0)
        if min(max_x - min_x, max_y - min_y) <= 1:
            continue

        heights.append(max_y - min_y)
        widths.append(max_x - min_x) 
Example #26
Source File: clusterer.py    From yelp with GNU Lesser General Public License v2.1
def dbscan(matrix):
        dbscan = skcluster.DBSCAN(eps=0.3, min_samples=50, metric='euclidean')
        # dbscan = skcluster.DBSCAN(eps=0.3, min_samples=50,
        #                           metric=nltk.cosine_distance)
        dbscan.fit(matrix)

        labels = dbscan.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters:', n_clusters_)

        return labels

    # OK 
Example #27
Source File: run_center_clustering.py    From pyImSegm with BSD 3-Clause "New" or "Revised" License
def cluster_center_candidates(points, max_dist=100, min_samples=1):
    """ cluster center candidates by given density clustering

    :param [[float]] points: points
    :param float max_dist: maximal distance among points
    :param int min_samples: minimal number of samples
    :return (ndarray, [int]):
    """
    points = np.array(points)
    if not list(points):
        return points, []
    dbscan = cluster.DBSCAN(eps=max_dist, min_samples=min_samples)
    dbscan.fit(points)
    labels = dbscan.labels_.copy()

    centers = []
    for i in range(max(labels) + 1):
        clust = points[labels == i]
        if len(clust) > 0:
            center = np.mean(clust, axis=0)
            centers.append(center)

    return np.array(centers), labels 
Example #28
Source File: dbscan_analysis.py    From ns4_chatbot with Apache License 2.0
def DBSCAN_analysis(self,tfidf_data):
		dbscan = DBSCAN()  # 5 is the default
		labels = dbscan.fit_predict(tfidf_data)
		logger.debug("number of classes found by DBSCAN: %d", len(set(labels)))
		# logger.debug(labels)
		cores = dbscan.core_sample_indices_
		# logger.debug("dbscan components:%r",dbscan.components_)
		return labels, dbscan,cores

	# First turn the words into TF-IDF; note that the TF-IDF vocabulary must not change afterwards
	# - verify the vocabulary stays the same, so the TF-IDF vectors do not change either
	# - also check whether words missing from the vocabulary in the incoming tokens raise an error
	# Then run the KNN step: feed the previously saved categories into KNN to build a KNN model
	# Use the KNN model to predict the category of the newly passed-in tokens, and print 3 random samples from that category
	# Core method!!! 
Example #29
Source File: cluster.py    From PAST-ReID with MIT License
def dbscancluster(self, dist, iteration=-1):
        # DBSCAN cluster
        tri_mat = np.triu(dist, 1)  # tri_mat.dim=2
        tri_mat = tri_mat[np.nonzero(tri_mat)]  # tri_mat.dim=1
        tri_mat = np.sort(tri_mat, axis=None)
        top_num = np.round(self.args.rho * tri_mat.size).astype(int)
        eps = tri_mat[:top_num].mean()
        print('eps in cluster: {:.3f}'.format(eps))

        clusterer = DBSCAN(eps=eps, min_samples=self.args.dbscan_minsample, metric='precomputed', n_jobs=8)

        labels = clusterer.fit_predict(dist)

        # select & cluster images as training set of this epochs
        print('Clustering and labeling...')
        num_ids = len(set(labels)) - 1

        print('Epoch {} have {} training ids'.format(iteration, num_ids))
        # generate new dataset
        new_dataset = []
        new_indices = []

        for (fname, _, _), label, indice in zip(self.traindataset, labels, self.old_indices):
            if label == -1:
                continue
            # dont need to change codes in trainer.py _parsing_input function and sampler function after add 0
            new_dataset.append((fname, label, indice))
            new_indices.append(indice)

        print('Iteration {} have {} training images'.format(iteration, len(new_dataset)))

        return new_dataset, new_indices