Python sklearn.metrics.pairwise.euclidean_distances() Examples

The following are code examples showing how to use sklearn.metrics.pairwise.euclidean_distances(), collected from open source Python projects.

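Before the project examples, here is a minimal, self-contained sketch of the call itself (the input arrays below are invented for illustration). euclidean_distances(X) returns the matrix of pairwise distances between the rows of X; passing a second matrix Y gives the distances between rows of X and rows of Y; and squared=True returns squared distances, a form several of the examples below rely on.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.array([[0.0, 0.0],
              [3.0, 4.0]])
Y = np.array([[0.0, 0.0]])

print(euclidean_distances(X))                   # [[0. 5.] [5. 0.]]: pairwise distances within X
print(euclidean_distances(X, Y))                # [[0.] [5.]]: distance from each row of X to the row of Y
print(euclidean_distances(X, Y, squared=True))  # [[0.] [25.]]: squared distances
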
Example 1
Project: scanorama   Author: brianhie   File: time_align.py    MIT License
def time_dist(datasets_dimred, time):
    time_dist = euclidean_distances(time, time)

    time_dists, scores = [], []
    for i in range(time_dist.shape[0]):
        for j in range(time_dist.shape[1]):
            if i >= j:
                continue
            score = np.mean(euclidean_distances(
                datasets_dimred[i], datasets_dimred[j]
            ))
            time_dists.append(time_dist[i, j])
            scores.append(score)

    print('Spearman rho = {}'.format(spearmanr(time_dists, scores)))
    print('Pearson rho = {}'.format(pearsonr(time_dists, scores))) 
Example 2
Project: scikit-learn-extra   Author: scikit-learn-contrib   File: test_k_medoids.py    BSD 3-Clause "New" or "Revised" License
def test_kmedoids_pp():
    """Initial clusters should be well-separated for k-medoids++"""
    rng = np.random.RandomState(seed)
    kmedoids = KMedoids()
    X = [
        [10, 0],
        [11, 0],
        [0, 10],
        [0, 11],
        [10, 10],
        [11, 10],
        [12, 10],
        [10, 11],
    ]
    D = euclidean_distances(X)

    centers = kmedoids._kpp_init(D, n_clusters=3, random_state_=rng)

    assert len(centers) == 3

    inter_medoid_distances = D[centers][:, centers]
    assert np.all((inter_medoid_distances > 5) | (inter_medoid_distances == 0)) 
Example 3
Project: scikit-learn-extra   Author: scikit-learn-contrib   File: test_k_medoids.py    BSD 3-Clause "New" or "Revised" License
def test_precomputed():
    """Test the 'precomputed' distance metric."""
    rng = np.random.RandomState(seed)
    X_1 = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]]
    D_1 = euclidean_distances(X_1)
    X_2 = [[1.1, 0.0], [0.0, 0.9]]
    D_2 = euclidean_distances(X_2, X_1)

    kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng)
    kmedoids.fit(D_1)

    assert_allclose(kmedoids.inertia_, 0.2)
    assert_array_equal(kmedoids.medoid_indices_, [2, 0])
    assert_array_equal(kmedoids.labels_, [1, 1, 0, 0])
    assert kmedoids.cluster_centers_ is None

    med_1, med_2 = tuple(kmedoids.medoid_indices_)
    predictions = kmedoids.predict(D_2)
    assert_array_equal(predictions, [med_1 // 2, med_2 // 2])

    transformed = kmedoids.transform(D_2)
    assert_array_equal(transformed, D_2[:, kmedoids.medoid_indices_]) 
Example 4
Project: Scuba   Author: gzampieri   File: compute_kernel.py    GNU General Public License v2.0
def get_RBF(A, s=1.):
    """ Compute radial basis function kernel.
    
    Parameters:
        A -- Feature matrix.
        s -- Scale parameter (positive float, 1.0 by default).
        
    Return:
        K -- Radial basis function kernel matrix.
    """
    
    from sklearn.metrics.pairwise import euclidean_distances, rbf_kernel
    from sklearn.preprocessing import scale
    import numpy as np
    
    A = scale(A)
    dist_matrix = euclidean_distances(A, A, None, squared=True)
    dist_vector = dist_matrix[np.nonzero(np.tril(dist_matrix))]
    dist_median = np.median(dist_vector)
    K = rbf_kernel(A, None, dist_median*s)
    
    return K 
Example 5
Project: DHGNN   Author: iMoonLab   File: layers.py    MIT License
def _cluster_select(self, feats: torch.Tensor):
        """
        compute k-means centers and cluster labels of each node
        return top #n_cluster nearest cluster transformed features
        :param feats:
        :return: top #n_cluster nearest cluster mapped features
        """
        np_feats = feats.detach().cpu().numpy()
        N = np_feats.shape[0]
        kmeans = KMeans(n_clusters=self.n_cluster, random_state=0).fit(np_feats)
        centers = kmeans.cluster_centers_
        dis = euclidean_distances(np_feats, centers)
        _, cluster_center_dict = torch.topk(torch.Tensor(dis), self.n_center, largest=False)
        cluster_center_dict = cluster_center_dict.numpy()
        point_labels = kmeans.labels_
        point_in_which_cluster = [np.where(point_labels == i)[0] for i in range(self.n_cluster)]

        cluster_feats = torch.stack([torch.stack([feats[SampledGraphConvolution.sample_ids_v2
                        (point_in_which_cluster[cluster_center_dict[point][i]], self.kc)]   # (N, n_cluster, kc, d)
                        for i in range(self.n_center)], dim=0) for point in range(N)], dim=0)
        cluster_feats = torch.stack([self.trans_c[i](cluster_feats[:, i, :, :]) for i in range(self.n_center)], dim=1)
        return cluster_feats                           # (N, n_cluster, d) 
Example 6
Project: DHGNN   Author: iMoonLab   File: layers.py    MIT License
def _cluster_select(self, feats: torch.Tensor):
        """
        compute k-means centers and cluster labels of each node
        return top #n_cluster nearest cluster transformed features
        :param feats:
        :return: top #n_cluster nearest cluster mapped features
        """
        np_feats = feats.detach().cpu().numpy()
        N = np_feats.shape[0]
        kmeans = KMeans(n_clusters=self.n_cluster, random_state=0).fit(np_feats)
        centers = kmeans.cluster_centers_
        dis = euclidean_distances(np_feats, centers)
        _, cluster_center_dict = torch.topk(torch.Tensor(dis), self.n_center, largest=False)
        cluster_center_dict = cluster_center_dict.numpy()
        point_labels = kmeans.labels_
        point_in_which_cluster = [np.where(point_labels == i)[0] for i in range(self.n_cluster)]

        cluster_feats = torch.stack([torch.stack([feats[SampledGraphConvolution.sample_ids_v2
        (point_in_which_cluster[cluster_center_dict[point][i]], self.kc)]  # (N, n_cluster, kc, d)
                                                  for i in range(self.n_center)], dim=0) for point in range(N)],
                                    dim=0)
        cluster_feats = torch.stack([cluster_feats[:, i, :, :].mean(dim=1) for i in range(self.n_center)],
                                    dim=1)
        return cluster_feats  # (N, n_cluster, d) 
Example 7
Project: DHGNN   Author: iMoonLab   File: layers.py    MIT License
def _cluster_select(self, feats: torch.Tensor):
        """
        compute k-means centers and cluster labels of each node
        return top #n_cluster nearest cluster transformed features
        :param feats:
        :return: top #n_cluster nearest cluster mapped features
        """
        np_feats = feats.detach().cpu().numpy()
        N = np_feats.shape[0]
        kmeans = KMeans(n_clusters=self.n_cluster, random_state=0).fit(np_feats)
        centers = kmeans.cluster_centers_
        dis = euclidean_distances(np_feats, centers)
        _, cluster_center_dict = torch.topk(torch.Tensor(dis), self.n_center, largest=False)
        cluster_center_dict = cluster_center_dict.numpy()
        point_labels = kmeans.labels_
        point_in_which_cluster = [np.where(point_labels == i)[0] for i in range(self.n_cluster)]

        cluster_feats = torch.stack([torch.stack([feats[SampledGraphConvolution.sample_ids_v2
                        (point_in_which_cluster[cluster_center_dict[point][i]], self.kc)]   # (N, n_cluster, kc, d)
                        for i in range(self.n_center)], dim=0) for point in range(N)], dim=0)
        cluster_feats = torch.stack([self.trans_c[i](cluster_feats[:, i, :, :]) for i in range(self.n_center)], dim=1)
        return cluster_feats                           # (N, n_cluster, d) 
Example 8
Project: fsfc   Author: danilkolikov   File: Lasso.py    MIT License
def _calc_objective_vector(x, labels):
        clusters = {}
        for i, label in enumerate(labels):
            if label not in clusters:
                clusters[label] = []
            clusters[label].append(i)
        result = np.zeros([1, x.shape[1]])
        for i in range(x.shape[1]):
            feature = 0
            samples = x[:, i].T.reshape([x.shape[0], 1])
            for label, cluster in clusters.items():
                size = len(cluster)
                cluster_samples = samples[cluster]
                distances = euclidean_distances(cluster_samples)
                feature += np.sum(distances) / size
            result[0, i] = np.sum(euclidean_distances(samples)) / x.shape[0] - feature
        return result 
Example 9
Project: optimizer   Author: ocelot-collab   File: DKLmodel.py    GNU General Public License v3.0
def eval_LL(self, X, Y):
        N = X.shape[0]
        Z = self.embed(X)
        diffs = euclidean_distances(Z, squared=True)

        alpha = np.exp(self.ogp.covar_params[1]) # kind of a hack
        rbf_K = alpha * np.exp(-diffs / 2.)
        K_full = rbf_K + (self.ogp.noise_var) * np.eye(N)

        L = np.linalg.cholesky(K_full)  # K = L * L.T
        Ly = np.linalg.solve(L, Y)  # finds inverse(L) * y
        log_lik = -0.5 * np.sum(Ly**2) # -1/2 * y.T * inverse(L * L.T) * y
        log_lik -= np.sum(np.log(np.diag(L)))  # equivalent to -1/2 * log(det(K))
        log_lik -= 0.5 * N * np.log(2 * np.pi)

        return float(log_lik)

    # allows passing custom alpha/noise
    # if compute_deriv is true, assumes that embedding is linear and returns derivative w.r.t. transform 
Example 10
Project: CIKM-AnalytiCup-2018   Author: zake7749   File: feature_engineering.py    Apache License 2.0
def _get_similarity_values(self, q1_csc, q2_csc):
        cosine_sim = []
        manhattan_dis = []
        eucledian_dis = []
        jaccard_dis = []
        minkowsk_dis = []
        
        for i,j in zip(q1_csc, q2_csc):
            sim = cs(i, j)
            cosine_sim.append(sim[0][0])
            sim = md(i, j)
            manhattan_dis.append(sim[0][0])
            sim = ed(i, j)
            eucledian_dis.append(sim[0][0])
            i_ = i.toarray()
            j_ = j.toarray()
            try:
                sim = jsc(i_, j_)
                jaccard_dis.append(sim)
            except Exception:
                jaccard_dis.append(0)
                
            sim = minkowski_dis.pairwise(i_, j_)
            minkowsk_dis.append(sim[0][0])
        return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis 
Example 11
Project: scanorama   Author: brianhie   File: time_align.py    MIT License
def time_align_correlate(alignments, time):
    time_dist = euclidean_distances(time, time)

    assert(time_dist.shape == alignments.shape)

    time_dists, scores = [], []
    for i in range(time_dist.shape[0]):
        for j in range(time_dist.shape[1]):
            if i >= j:
                continue
            time_dists.append(time_dist[i, j])
            scores.append(alignments[i, j])

    print('Spearman rho = {}'.format(spearmanr(time_dists, scores)))
    print('Pearson rho = {}'.format(pearsonr(time_dists, scores))) 
Example 12
Project: ABRW   Author: houchengbin   File: utils.py    MIT License
def pairwise_similarity(mat, type='cosine'):
    # XXX: possible to integrate pairwise_similarity with top_k to enhance performance? 
    # we'll use it elsewhere. if really needed, write a new method for this purpose
    if type == 'cosine':  # supports sparse and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
    elif type == 'jaccard':
        from sklearn.metrics import jaccard_similarity_score
        from sklearn.metrics.pairwise import pairwise_distances
        # n_jobs=-1 means using all CPU for parallel computing
        result = pairwise_distances(mat.todense(), metric=jaccard_similarity_score, n_jobs=-1)
    elif type == 'euclidean':
        from sklearn.metrics.pairwise import euclidean_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = euclidean_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    elif type == 'manhattan':
        from sklearn.metrics.pairwise import manhattan_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = manhattan_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    else:
        print('Please choose from: cosine, jaccard, euclidean or manhattan')
        return 'Not found!'
    return result


# ---------------------------------utils for preprocessing--------------------------------
Example 13
Project: pyCeterisParibus   Author: ModelOriented   File: test_select.py    Apache License 2.0
def test_select_neighbours(self):
        neighbours = select_neighbours(self.x, self.x[0], dist_fun=euclidean_distances, n=1)
        neighbours2 = select_neighbours(self.x, self.x[0], dist_fun='gower', n=1)
        self.assertSequenceEqual(list(neighbours.iloc[0]), list(self.x[0]))
        self.assertSequenceEqual(list(neighbours2.iloc[0]), list(self.x[0])) 
Example 14
Project: pyCeterisParibus   Author: ModelOriented   File: test_select.py    Apache License 2.0
def test_select_neighbours_2(self):
        (_, m) = self.x.shape
        size = 3
        neighbours = select_neighbours(self.x, np.array([4, 3, 2]), dist_fun=euclidean_distances, n=size)
        self.assertEqual(neighbours.shape, (size, m))
        neighbours2 = select_neighbours(self.x, np.array([4, 3, 2]), dist_fun='gower', n=size)
        self.assertEqual(neighbours2.shape, (size, m)) 
Example 15
Project: ICASSP-2020-clustering   Author: xavierfav   File: clustering.py    GNU General Public License v3.0
def compute_similarity_matrix(X):
    """
    Compute similarity matrix of the given features.

    """
    euclidian_distances = euclidean_distances(X)
    # similarity_matrix = 1 - euclidian_distances/euclidian_distances.max()  # linear variant, overwritten below
    similarity_matrix = np.exp(-1 * euclidian_distances / euclidian_distances.std())
    # similarity_matrix = cosine_similarity(X)
    return similarity_matrix 
Example 16
Project: scikit-learn-extra   Author: scikit-learn-contrib   File: test_k_medoids.py    BSD 3-Clause "New" or "Revised" License
def test_random_deterministic():
    """Random_state should determine 'random' init output."""
    rng = np.random.RandomState(seed)

    X = load_iris()["data"]
    D = euclidean_distances(X)

    medoids = KMedoids(init="random")._initialize_medoids(D, 4, rng)
    assert_array_equal(medoids, [47, 117, 67, 103]) 
Example 17
Project: scikit-learn-extra   Author: scikit-learn-contrib   File: test_k_medoids.py    BSD 3-Clause "New" or "Revised" License
def test_heuristic_deterministic():
    """Result of heuristic init method should not depend on rnadom state."""
    rng1 = np.random.RandomState(1)
    rng2 = np.random.RandomState(2)
    X = load_iris()["data"]
    D = euclidean_distances(X)

    medoids_1 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng1)

    medoids_2 = KMedoids(init="heuristic")._initialize_medoids(D, 10, rng2)

    assert_array_equal(medoids_1, medoids_2) 
Example 18
Project: scikit-multiflow   Author: scikit-multiflow   File: online_smote_bagging.py    BSD 3-Clause "New" or "Revised" License
def online_smote(self, k=5):
        if len(self.pos_samples) > 1:
            x = self.pos_samples[-1]
            distance_vector = euclidean_distances(self.pos_samples[:-1], [x])[0]
            neighbors = np.argsort(distance_vector)
            if k > len(neighbors):
                k = len(neighbors)
            i = self._random_state.randint(0, k)
            gamma = self._random_state.rand()
            x_smote = np.add(x, np.multiply(gamma, np.subtract(x, self.pos_samples[neighbors[i]])))
            return x_smote
        return self.pos_samples[-1] 
Example 19
Project: nlp_research   Author: zhufz   File: test_match.py    MIT License
def __call__(self, text):
        if self.tfrecords_mode == 'point':
            assert text.find('||') != -1, "input should contain two sentences separated by ||"
            text_a = text.split('||')[0]
            text_b = text.split('||')[-1]
            pred,score = self._get_label([text_a], [text_b], need_preprocess = True)
            return pred[0][0], score[0][0]

        # load user-defined questions (user-defined entries take priority)
        if self.sim_mode == 'cross':
            text_list = self.text_list
            label_list = self.label_list
            if self.zdy != {}:
                text_list = self.zdy['text_list'] + text_list
                label_list = self.zdy['label_list'] + label_list
            pred,score = self._get_label([text], text_list, need_preprocess = True)
            selected_id = np.argmax(score)
            out_score = score[selected_id]
        elif self.sim_mode == 'represent':
            text_list = self.text_list
            vec_list = self.vec_list
            label_list = self.label_list
            if self.zdy != {}:
                text_list = self.zdy['text_list'] + text_list
                vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis = 0)
                label_list = self.zdy['label_list'] + label_list
            vec = self._get_vecs([text], need_preprocess = True)
            if self.is_distance:
                scores = euclidean_distances(vec, vec_list)[0]
                selected_id = np.argmin(scores)
                out_score = 1 - scores[selected_id]
            else:
                scores = cosine_similarity(vec, vec_list)[0]
                selected_id = np.argmax(scores)
                out_score = scores[selected_id]
        else:
            raise ValueError('unknown sim mode, represent or cross?')
        ret = (label_list[selected_id], out_score, selected_id, \
               text_list[selected_id])
        return ret 
Example 20
Project: nlp_research   Author: zhufz   File: similarity.py    MIT License
def similarity(self, query, type):
        assert self.corpus is not None, "self.corpus can't be None"
        ret = []
        if type == 'cosine':
            query = self.get_vector(query)
            for item in self.corpus_vec:
                sim = cosine_similarity(item, query)
                ret.append(sim[0][0])
        elif type == 'manhattan':
            query = self.get_vector(query)
            for item in self.corpus_vec:
                sim = manhattan_distances(item, query)
                ret.append(sim[0][0])
        elif type == 'euclidean':
            query = self.get_vector(query)
            for item in self.corpus_vec:
                sim = euclidean_distances(item, query)
                ret.append(sim[0][0])
        #elif type == 'jaccard':
        #    #query = query.split()
        #    query = self.get_vector(query)
        #    for item in self.corpus_vec:
        #        pdb.set_trace()
        #        sim = jaccard_similarity_score(item, query)
        #        ret.append(sim)
        elif type == 'bm25':
            query = query.split()
            ret = self.bm25_model.get_scores(query)
        else:
            raise ValueError('similarity type error:%s'%type)
        return ret 
Example 21
Project: DHGNN   Author: iMoonLab   File: construct_hypergraph.py    MIT License
def _construct_edge_list_from_cluster(X, clusters, adjacent_clusters, k_neighbors) -> np.array:
    """
    construct edge list (numpy array) from cluster for single modality
    :param X: feature
    :param clusters: number of clusters for k-means
    :param adjacent_clusters: a node's adjacent clusters
    :param k_neighbors: number of a node's neighbors
    :return:
    """
    N = X.shape[0]
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(X)
    centers = kmeans.cluster_centers_
    dis = euclidean_distances(X, centers)
    _, cluster_center_dict = torch.topk(torch.Tensor(dis), adjacent_clusters, largest=False)
    cluster_center_dict = cluster_center_dict.numpy()
    point_labels = kmeans.labels_
    point_in_which_cluster = [np.where(point_labels == i)[0] for i in range(clusters)]

    def _list_cat(list_of_array):
        """
        example: [[0,1],[3,5,6],[-1]] -> [0,1,3,5,6,-1]
        :param list_of_array: list of np.array
        :return: list of numbers
        """
        ret = list()
        for array in list_of_array:
            ret += array.tolist()
        return ret

    cluster_neighbor_dict = [_list_cat([point_in_which_cluster[cluster_center_dict[point][i]]
                                        for i in range(adjacent_clusters)]) for point in range(N)]
    for point, entry in enumerate(cluster_neighbor_dict):
        entry.append(point)
    sampled_ids = [SampledGraphConvolution.sample_ids(cluster_neighbor_dict[point], k_neighbors) for point in range(N)]
    return np.array(sampled_ids) 
Example 22
Project: DHGNN   Author: iMoonLab   File: layers.py    MIT License
def _cluster_select(self, feats: torch.Tensor):
        """
        compute k-means centers and cluster labels of each node
        :param feats:
        :return:
        """
        np_feats = feats.detach().cpu().numpy()
        N = np_feats.shape[0]
        kmeans = KMeans(n_clusters=self.n_cluster, random_state=0).fit(np_feats)
        centers = kmeans.cluster_centers_
        dis = euclidean_distances(np_feats, centers)
        _, cluster_center_dict = torch.topk(torch.Tensor(dis), self.n_center, largest=False)
        cluster_center_dict = cluster_center_dict.numpy()
        point_labels = kmeans.labels_
        point_in_which_cluster = [np.where(point_labels == i)[0] for i in range(self.n_cluster)]

        def _list_cat(list_of_array):
            """
            example: [[0,1],[3,5,6],[-1]] -> [0,1,3,5,6,-1]
            :param list_of_array: list of np.array
            :return: list of numbers
            """
            ret = list()
            for array in list_of_array:
                ret += array.tolist()
            return ret

        cluster_neighbor_dict = [_list_cat([point_in_which_cluster[cluster_center_dict[point][i]]
                                  for i in range(self.n_center)]) for point in range(N)]
        for point, entry in enumerate(cluster_neighbor_dict):
            entry.append(point)
        sampled_ids = [SampledGraphConvolution.sample_ids(cluster_neighbor_dict[point], self.kc) for point in range(N)]
        cluster_feats = torch.stack([feats[sampled_ids[point]] for point in range(N)], dim=0)
        return cluster_feats 
Example 23
Project: heat   Author: DavidMcDonald1993   File: evaluate_reconstruction.py    MIT License
def hyperbolic_distance_poincare(X):
	norm_X = np.linalg.norm(X, keepdims=True, axis=-1)
	norm_X = np.minimum(norm_X, np.nextafter(1, 0))
	uu = euclidean_distances(X) ** 2
	dd = (1 - norm_X**2) * (1 - norm_X**2).T
	return np.arccosh(1 + 2 * uu / dd) 
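
For reference, this computes the distance in the Poincaré ball model of hyperbolic space,

d(u, v) = arccosh(1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2)))

where uu holds the squared Euclidean distances ||u - v||^2 and dd holds the denominator (1 - ||u||^2) * (1 - ||v||^2); the np.nextafter call clips norms to just below 1 so that points on the boundary of the ball do not cause a division by zero.
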
Example 24
Project: heat   Author: DavidMcDonald1993   File: evaluate_reconstruction.py    MIT License
def euclidean_distance(X):
	return euclidean_distances(X) 
Example 25
Project: heat   Author: DavidMcDonald1993   File: evaluate_lp.py    MIT License
def hyperbolic_distance_poincare(X):
	norm_X = np.linalg.norm(X, keepdims=True, axis=-1)
	norm_X = np.minimum(norm_X, np.nextafter(1, 0))
	uu = euclidean_distances(X) ** 2
	dd = (1 - norm_X**2) * (1 - norm_X**2).T
	return np.arccosh(1 + 2 * uu / dd) 
Example 26
Project: heat   Author: DavidMcDonald1993   File: evaluate_lp.py    MIT License
def euclidean_distance(X):
	return euclidean_distances(X) 
Example 27
Project: linear_neuron   Author: uglyboxer   File: test_pairwise.py    MIT License
def test_euclidean_distances():
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])


# Paired distances 
Example 28
Project: Weiss   Author: WangWenjun559   File: test_pairwise.py    Apache License 2.0
def test_euclidean_distances():
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])


# Paired distances 
Example 29
Project: optimizer   Author: ocelot-collab   File: DKLmodel.py    GNU General Public License v3.0
def custom_LL(self, X, Y, alpha, noise_variance, compute_deriv=False):
        N,dim = X.shape
        Z = self.embed(X)
        if not compute_deriv:
            diffs = euclidean_distances(Z, squared=True)

            rbf_K = alpha * np.exp(-diffs / 2.)
            K_full = rbf_K + noise_variance * np.eye(N)

            L = np.linalg.cholesky(K_full)  # K = L * L.T
            Ly = np.linalg.solve(L, Y)  # finds inverse(L) * y
            log_lik = -0.5 * np.sum(Ly**2) # -1/2 * y.T * inverse(L * L.T) * y
            log_lik -= np.sum(np.log(np.diag(L)))  # equivalent to -1/2 * log(det(K))
            log_lik -= 0.5 * N * np.log(2 * np.pi)

            return float(log_lik)

        lengths = [0. for d in range(dim)]
        params = lengths + [np.log(alpha)] + [np.log(noise_variance)]
        neglik, deriv = SPGP_likelihood_4scipy(params, Y, Z)

        deriv_noise = deriv[-1]
        deriv_coeff = deriv[-2]
        deriv_z = deriv[:self.dim_z*N].reshape((N,self.dim_z))

        deriv_transform = np.dot(X.T, deriv_z)
        mask = self.mask or np.ones((dim, self.dim_z))
        return -neglik, deriv_transform * mask, deriv_coeff, deriv_noise

    # takes an n x dim_z matrix Z and translates it to x, assuming the embedding is linear
    # currently requires that the model embedding was set via set_linear 
Example 30
Project: DynWalks   Author: houchengbin   File: utils.py    MIT License
def pairwise_similarity(mat, type='cosine'):
    ''' pairwise similarity; can be used as score function;
        vectorized computation 
    '''
    if type == 'cosine':  # supports sparse and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
    elif type == 'jaccard':
        from sklearn.metrics import jaccard_similarity_score
        from sklearn.metrics.pairwise import pairwise_distances
        # n_jobs=-1 means using all CPU for parallel computing
        result = pairwise_distances(mat.todense(), metric=jaccard_similarity_score, n_jobs=-1)
    elif type == 'euclidean':
        from sklearn.metrics.pairwise import euclidean_distances
        # note: similarity = - distance
        result = euclidean_distances(mat)
        result = -result
    elif type == 'manhattan':
        from sklearn.metrics.pairwise import manhattan_distances
        # note: similarity = - distance
        result = manhattan_distances(mat)
        result = -result
    else:
        print('Please choose from: cosine, jaccard, euclidean or manhattan')
        return 'Not found!'
    return result 
Example 31
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_pairwise.py    Apache License 2.0
def test_euclidean_distances():
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # check that we still get the right answers with {X,Y}_norm_squared
    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_array_almost_equal(D2, D1)
    assert_array_almost_equal(D3, D1)
    assert_array_almost_equal(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(wrong_D - D1)), .01) 
Example 32
Project: iww   Author: MohamedHmini   File: pairwise.py    MIT License
def euclidean_similarity(vect1, vect2, max_distance):
    
    distance = euclidean_distances([vect1],[vect2])[0][0]
    euclidean_sim = 1 - ((distance / max_distance) if max_distance != 0 else 0)
    
    return euclidean_sim
    
    pass 
Example 33
Project: iww   Author: MohamedHmini   File: pairwise.py    MIT License
def get_max_distance(expected_vect, max_val = 1):
    
    max_vect = np.full(len(expected_vect), max_val)
    max_distance = euclidean_distances([expected_vect],[max_vect])[0][0]
    
    return max_distance
        
    pass 
Example 34
Project: iww   Author: MohamedHmini   File: pairwise.py    MIT License
def simple_euclidean_similarity(vect1, vect2):
    
    d = euclidean_distances([vect1],[vect2])[0][0]
    return 1/(1+d)
    
    pass 
Example 35
Project: iww   Author: MohamedHmini   File: lists_detector.py    MIT License
def __end_adjust(self, node):
        
        if len(node['LISTS']['adjust']['expected_vect']) != 0:
            node['LISTS']['adjust']['width'] = 1 - euclidean_distances([node['LISTS']['adjust']['width']], [node['LISTS']['adjust']['expected_vect']])[0][0]
            node['LISTS']['adjust']['height'] = 1 - euclidean_distances([node['LISTS']['adjust']['height']], [node['LISTS']['adjust']['expected_vect']])[0][0]
            node['LISTS']['adjust']['area'] = 1 - euclidean_distances([node['LISTS']['adjust']['area']], [node['LISTS']['adjust']['expected_vect']])[0][0]
            node['LISTS']['adjust']['tagsCount'] = 1 - euclidean_distances([node['LISTS']['adjust']['tagsCount']], [node['LISTS']['adjust']['expected_vect']])[0][0]

        else:
            
            node['LISTS']['adjust']['width'] = 0
            node['LISTS']['adjust']['height'] = 0
            node['LISTS']['adjust']['area'] = 0
            node['LISTS']['adjust']['tagsCount'] = 0

            
            pass
        
# =============================================================================
#         features = [
#             'xpath','LISTS.adjust.width', 'LISTS.adjust.height', 'LISTS.adjust.area', 
#             'LISTS.adjust.font-size', 'LISTS.adjust.font-family', 'LISTS.adjust.background-color', 
#             'LISTS.adjust.color', 'LISTS.adjust.classes-coherence','LISTS.adjust.tagsCount']
#         
#         final_expected_vect = [1,1,1,1,1,1,1,1,1]
#                 
#         self.flatten_single_node(features)
# =============================================================================
            
        return node
        
        pass 
Example 36
Project: iww   Author: MohamedHmini   File: old_lists.py    MIT License
def __end_adjust(self, node):
        
        if node['LISTS']['adjust']['expected_vect'].shape[0] != 0:
            node['LISTS']['adjust']['width'] = 1 - euclidean_distances([node['LISTS']['adjust']['width']], [node['LISTS']['adjust']['expected_vect']])[0][0]
            node['LISTS']['adjust']['height'] = 1 - euclidean_distances([node['LISTS']['adjust']['height']], [node['LISTS']['adjust']['expected_vect']])[0][0]
            node['LISTS']['adjust']['area'] = 1 - euclidean_distances([node['LISTS']['adjust']['area']], [node['LISTS']['adjust']['expected_vect']])[0][0]
#            node['LISTS']['adjust']['font-size'] = 1- euclidean_distances([node['LISTS']['adjust']['font-size']], [node['LISTS']['adjust']['expected_vect']])[0][0]
#            node['LISTS']['adjust']['font-family-count'] = 1- euclidean_distances([node['LISTS']['adjust']['font-family-count']], [node['LISTS']['adjust']['expected_vect']])[0][0]
#            node['LISTS']['adjust']['background-color-count'] = 1- euclidean_distances([node['LISTS']['adjust']['background-color-count']], [node['LISTS']['adjust']['expected_vect']])[0][0]
#            node['LISTS']['adjust']['color-count'] = 1- euclidean_distances([node['LISTS']['adjust']['color-count']], [node['LISTS']['adjust']['expected_vect']])[0][0]
            node['LISTS']['adjust']['tagsCount'] = 1 - euclidean_distances([node['LISTS']['adjust']['tagsCount']], [node['LISTS']['adjust']['expected_vect']])[0][0]
#            node['LISTS']['adjust']['densitySum'] = cosine_similarity([node['LISTS']['adjust']['densitySum']], [node['LISTS']['adjust']['expected_vect']])[0][0]

        else:
            
            node['LISTS']['adjust']['width'] = 0
            node['LISTS']['adjust']['height'] = 0
            node['LISTS']['adjust']['area'] = 0
#            node['LISTS']['adjust']['font-size'] = 0
#            node['LISTS']['adjust']['font-family-count'] = 0
#            node['LISTS']['adjust']['background-color-count'] = 0
#            node['LISTS']['adjust']['color-count'] = 0
            node['LISTS']['adjust']['tagsCount'] = 0
#            node['LISTS']['adjust']['densitySum'] = 0

            
            pass
        
        
#        expected_val = len(node['LISTS']['adjust']['bag-of-classes'])
#        expected_val = expected_val if expected_val != 0 else 1
#        node['LISTS']['adjust']['classes-coherence'] = (sum(node['LISTS']['adjust']['bag-of-classes']) * 100)/expected_val
        
            
        return node
        
        pass 
Example 37
Project: popsom   Author: njali2001   File: popsom.py    GNU General Public License v3.0
def compute_umat(self, smoothing=None):
		""" compute_umat -- compute the unified distance matrix
		
			parameters:
			- smoothing - is either NULL, 0, or a positive floating point value controlling the
			              smoothing of the umat representation
			return:
			- a matrix with the same x-y dims as the original map containing the umat values
		"""

		d = euclidean_distances(self.neurons, self.neurons)
		umat = self.compute_heat(d, smoothing)

		return umat 
Example 38
Project: incremental-label-propagation   Author: johny-c   File: knn_graph_utils.py    MIT License
def squared_distances(X1, X2, L=None):

    if L is None:
        dist = euclidean_distances(X1, X2, squared=True)
    else:
        dist = euclidean_distances(X1.dot(L.T), X2.dot(L.T), squared=True)

    return dist 
Example 39
Project: Same-Size-K-Means   Author: ndanielsen   File: equal_groups.py    BSD 3-Clause "New" or "Revised" License
def _transform(self, X):
        """guts of transform method; no input validation"""
        return euclidean_distances(X, self.cluster_centers_) 
Example 40
Project: scRNA-Seq   Author: broadinstitute   File: diffusion_map.py    BSD 3-Clause "New" or "Revised" License
def run_pseudotime_calculation(data, roots):
	start = time.time()
	data.uns['roots'] = roots
	mask = np.isin(data.obs_names, data.uns['roots'])
	distances = np.mean(euclidean_distances(data.obsm['X_diffmap'][mask, :], data.obsm['X_diffmap']), axis = 0)
	dmin = distances.min()
	dmax = distances.max()
	data.obs['pseudotime'] = (distances - dmin) / (dmax - dmin)
	end = time.time()
	print("run_pseudotime_calculation finished. Time spent = {:.2f}s".format(end - start)) 
Example 41
Project: soft-dtw   Author: mblondel   File: distance.py    BSD 2-Clause "Simplified" License
def compute(self):
        """
        Compute distance matrix.

        Returns
        -------
        D: array, shape = [m, n]
            Distance matrix.
        """
        return euclidean_distances(self.X, self.Y, squared=True) 
Example 42
Project: alphacsc   Author: alphacsc   File: distance.py    BSD 3-Clause "New" or "Revised" License
def compute(self):
        """
        Compute distance matrix.

        Returns
        -------
        D: array, shape = [m, n]
            Distance matrix.
        """
        return euclidean_distances(self.X, self.Y, squared=True) 
Example 43
Project: tslearn   Author: rtavenar   File: metrics.py    BSD 2-Clause "Simplified" License
def compute(self):
        """Compute distance matrix.

        Returns
        -------
        D: array, shape = [m, n]
            Distance matrix.
        """
        return euclidean_distances(self.X, self.Y, squared=True) 
Example 44
Project: klcpd_code   Author: OctoberChang   File: median_heuristic.py    BSD 3-Clause "New" or "Revised" License
def median_heuristic(X, beta=0.5):
    max_n = min(30000, X.shape[0])
    D2 = euclidean_distances(X[:max_n], squared=True)
    med_sqdist = np.median(D2[np.triu_indices_from(D2, k=1)])
    beta_list = [beta**2, beta**1, 1, (1.0/beta)**1, (1.0/beta)**2]
    return [med_sqdist * b for b in beta_list] 
Example 45
Project: klcpd_code   Author: OctoberChang   File: mmd_util.py    BSD 3-Clause "New" or "Revised" License
def median_heuristic(X, beta=0.5):
    max_n = min(30000, X.shape[0])
    D2 = euclidean_distances(X[:max_n], squared=True)
    med_sqdist = np.median(D2[np.triu_indices_from(D2, k=1)])
    beta_list = [beta**2, beta**1, 1, (1.0/beta)**1, (1.0/beta)**2]
    return [med_sqdist * b for b in beta_list]


# X_p_enc: batch_size x seq_len x hid_dim
# X_f_enc: batch_size x seq_len x hid_dim
# hid_dim could be either dataspace_dim or codespace_dim
# return: MMD2(X_p_enc[i,:,:], X_f_enc[i,:,:]) for i = 1:batch_size 
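
The trailing comments refer to an MMD² routine that is cut off in this excerpt. As a rough, self-contained sketch of the quantity being described, computed here for one pair of sample sets with a plain RBF kernel whose squared bandwidth sigma2 could be one of the values returned by median_heuristic above (the name mmd2_rbf and this exact formulation are illustrative, not taken from the project):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def mmd2_rbf(X_p, X_f, sigma2):
    # Biased (V-statistic) estimate of MMD^2 between samples X_p and X_f
    # under the RBF kernel k(x, y) = exp(-||x - y||^2 / (2 * sigma2)).
    k_pp = np.exp(-euclidean_distances(X_p, X_p, squared=True) / (2 * sigma2))
    k_ff = np.exp(-euclidean_distances(X_f, X_f, squared=True) / (2 * sigma2))
    k_pf = np.exp(-euclidean_distances(X_p, X_f, squared=True) / (2 * sigma2))
    return k_pp.mean() + k_ff.mean() - 2 * k_pf.mean()
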
Example 46
Project: OpenANE   Author: houchengbin   File: utils.py    MIT License
def pairwise_similarity(mat, type='cosine'):   # for efficiency, plz given dense mat as the input
    if type == 'cosine':  # supports sparse and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
    elif type == 'jaccard':
        from sklearn.metrics import jaccard_similarity_score
        from sklearn.metrics.pairwise import pairwise_distances
        # n_jobs=-1 means using all CPU for parallel computing
        result = pairwise_distances(mat.todense(), metric=jaccard_similarity_score, n_jobs=-1)
    elif type == 'euclidean':
        from sklearn.metrics.pairwise import euclidean_distances
        # note: similarity = - distance
        result = euclidean_distances(mat)
        result = -result
    elif type == 'manhattan':
        from sklearn.metrics.pairwise import manhattan_distances
        # note: similarity = - distance
        result = manhattan_distances(mat)
        result = -result
    else:
        print('Please choose from: cosine, jaccard, euclidean or manhattan')
        return 'Not found!'
    return result


# ---------------------------------utils for downstream tasks--------------------------------
Example 47
Project: Concurrent_AP   Author: GGiecold   File: Concurrent_AP.py    MIT License
def process(self, rows_slice):
        tmp = self.array[rows_slice, ...]
        result = - euclidean_distances(tmp, self.array, squared = True)

        with Worker.hdf5_lock:            
            with tables.open_file(self.hdf5_file, 'r+') as fileh:
                hdf5_array = fileh.get_node(self.path)
                hdf5_array[rows_slice, ...] = result
                
        del tmp 
Example 48
Project: Hands-on-Supervised-Machine-Learning-with-Python   Author: PacktPublishing   File: knn.py    MIT License
def predict(self, X):
        # Compute the pairwise distances between each observation in
        # the dataset and the training data. This can be relatively expensive
        # for very large datasets!!
        train = self.X
        dists = euclidean_distances(X, train)

        # Arg sort to find the shortest distance for each row. This sorts
        # elements in each row (independent of other rows) to determine the
        # order required to sort the rows.
        # I.e:
        # >>> P = np.array([[4, 5, 1], [3, 1, 6]])
        # >>> np.argsort(P, axis=1)
        # array([[2, 0, 1],
        #        [1, 0, 2]])
        nearest = np.argsort(dists, axis=1)

        # We only care about the top K, really, so get sorted and then truncate
        # I.e:
        # array([[1, 2, 1],
        #           ...
        #        [0, 0, 0]])
        predicted_labels = self.y[nearest][:, :self.k]

        # We want the most common along the rows as the predictions
        # I.e:
        # array([1, ..., 0])
        return mode(predicted_labels, axis=1)[0].ravel() 
Example 49
Project: MultiKE   Author: nju-websoft   File: similarity.py    MIT License
def sim(embed1, embed2, metric='inner', normalize=False, csls_k=0):
    """
    Compute pairwise similarity between the two collections of embeddings.

    Parameters
    ----------
    embed1 : matrix_like
        An embedding matrix of size n1*d, where n1 is the number of embeddings and d is the dimension.
    embed2 : matrix_like
        An embedding matrix of size n2*d, where n2 is the number of embeddings and d is the dimension.
    metric : str, optional, inner default.
        The distance metric to use. It can be 'cosine', 'euclidean', 'inner'.
    normalize : bool, optional, default false.
        Whether to normalize the input embeddings.
    csls_k : int, optional, 0 by default.
        K value for csls. If k > 0, enhance the similarity by csls.

    Returns
    -------
    sim_mat : A similarity matrix of size n1*n2.
    """
    if normalize:
        embed1 = preprocessing.normalize(embed1)
        embed2 = preprocessing.normalize(embed2)
    if metric == 'inner':
        sim_mat = np.matmul(embed1, embed2.T)  # numpy.ndarray, float32
    elif metric == 'cosine' and normalize:
        sim_mat = np.matmul(embed1, embed2.T)  # numpy.ndarray, float32
    elif metric == 'euclidean':
        sim_mat = 1 - euclidean_distances(embed1, embed2)
        print(type(sim_mat), sim_mat.dtype)
        sim_mat = sim_mat.astype(np.float32)
    elif metric == 'cosine':
        sim_mat = 1 - cdist(embed1, embed2, metric='cosine')   # numpy.ndarray, float64
        sim_mat = sim_mat.astype(np.float32)
    elif metric == 'manhattan':
        sim_mat = 1 - cdist(embed1, embed2, metric='cityblock')
        sim_mat = sim_mat.astype(np.float32)
    else:
        sim_mat = 1 - cdist(embed1, embed2, metric=metric)
        sim_mat = sim_mat.astype(np.float32)
    if csls_k > 0:
        sim_mat = csls_sim(sim_mat, csls_k)
    return sim_mat 
Example 50
Project: scikit-learn-extra   Author: scikit-learn-contrib   File: _eigenpro.py    BSD 3-Clause "New" or "Revised" License
def _kernel(self, X, Y):
        """Calculate the kernel matrix

        Parameters
        ---------
        X : {float, array}, shape = [n_samples, n_features]
            Input data.

        Y : {float, array}, shape = [n_centers, n_targets]
            Kernel centers.

        Returns
        -------
        K : {float, array}, shape = [n_samples, n_centers]
            Kernel matrix.
        """
        if (
            self.kernel != "rbf"
            and self.kernel != "laplace"
            and self.kernel != "cauchy"
        ):
            if callable(self.kernel):
                params = self.kernel_params or {}
            else:
                params = {
                    "gamma": self.gamma_,
                    "degree": self.degree,
                    "coef0": self.coef0,
                }
            return pairwise_kernels(
                X, Y, metric=self.kernel, filter_params=True, **params
            )
        distance = euclidean_distances(X, Y, squared=True)
        bandwidth = np.float32(1.0 / np.sqrt(2.0 * self.gamma_))
        if self.kernel == "rbf":
            distance = -self.gamma_ * distance
            K = np.exp(distance)
        elif self.kernel == "laplace":
            d = np.maximum(distance, 0)
            K = np.exp(-np.sqrt(d) / bandwidth)
        else:  # self.kernel == "cauchy":
            K = 1 / (1 + 2.0 * self.gamma_ * distance)
        return K 
Example 51
Project: GewitterGefahr   Author: thunderhoser   File: colours.py    MIT License
def get_random_colours(num_colours, colour_to_exclude_rgb=None,
                       min_rgb_distance=DEFAULT_MIN_RGB_DISTANCE):
    """Returns list of random colours.

    N = number of colours

    :param num_colours: Number of colours desired.
    :param colour_to_exclude_rgb: Colour to exclude (length-3 numpy array with
        values in 0...1).
    :param min_rgb_distance: All colours returned will be at least this far away
        from `colour_to_exclude_rgb`.  Distance is Euclidean.
    :return: rgb_matrix: N-by-3 numpy array with values in 0...1.  Each row is
        one colour.
    """

    orig_num_colours = num_colours + 0

    if colour_to_exclude_rgb is not None:
        error_checking.assert_is_numpy_array(
            colour_to_exclude_rgb, exact_dimensions=numpy.array([3], dtype=int)
        )

        error_checking.assert_is_geq_numpy_array(colour_to_exclude_rgb, 0.)
        error_checking.assert_is_leq_numpy_array(colour_to_exclude_rgb, 1.)
        error_checking.assert_is_greater(min_rgb_distance, 0.)
        error_checking.assert_is_leq(min_rgb_distance, 1.)

        num_colours = 10 * num_colours

    rgb_matrix = numpy.random.uniform(low=0., high=1., size=(num_colours, 3))

    if colour_to_exclude_rgb is not None:
        colour_to_exclude_rgb = numpy.reshape(colour_to_exclude_rgb, (1, 3))

        squared_distances = euclidean_distances(
            X=rgb_matrix, Y=numpy.reshape(colour_to_exclude_rgb, (1, 3)),
            squared=True
        )

        good_indices = numpy.where(
            squared_distances >= min_rgb_distance ** 2
        )[0]

        rgb_matrix = rgb_matrix[good_indices, ...]

    num_colours = min([
        orig_num_colours, rgb_matrix.shape[0]
    ])

    rgb_matrix = rgb_matrix[:num_colours, ...]
    numpy.random.shuffle(rgb_matrix)
    return rgb_matrix 
Example 52
Project: ppt_controller   Author: rkat7   File: code.py    GNU General Public License v3.0
def count(thresholded, segmented):
    
    chull = cv2.convexHull(segmented)
    extreme_top    = tuple(chull[chull[:, :, 1].argmin()][0])
    extreme_bottom = tuple(chull[chull[:, :, 1].argmax()][0])
    extreme_left   = tuple(chull[chull[:, :, 0].argmin()][0])
    extreme_right  = tuple(chull[chull[:, :, 0].argmax()][0])
    
    
    if extreme_left[1] > extreme_right[1]:
        print("right")
        p.append(0)
    else:
        print("left")
        p.append(1)
    
    cX = int((extreme_left[0] + extreme_right[0]) / 2)
    cY = int((extreme_top[1] + extreme_bottom[1]) / 2)
    distance = pairwise.euclidean_distances([(cX, cY)], Y=[extreme_left, extreme_right, extreme_top, extreme_bottom])[0]
    maximum_distance = distance[distance.argmax()]
    radius = int(0.5 * maximum_distance)
    circumference = (2 * np.pi * radius)
    circular_roi = np.zeros(thresholded.shape[:2], dtype="uint8")
    cv2.circle(circular_roi, (cX, cY), radius, 255, 1)
    circular_roi = cv2.bitwise_and(thresholded, thresholded, mask=circular_roi)
    (_, cnts, _) = cv2.findContours(circular_roi.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    count = 0
    for c in cnts:
        (x, y, w, h) = cv2.boundingRect(c)
        if ((cY + (cY * 0.25)) > (y + h)) and ((circumference * 0.25) > c.shape[0]):
            count += 1
            l.append(count)

    return l

#-------------------------------------------------------------------------------
# Main function
#------------------------------------------------------------------------------- 
Example 53
Project: ml-recsys-tools   Author: DomainGroupOSS   File: similarity.py    MIT License
def _top_N_similar(source_inds, source_mat, target_mat, n,
                   exclude_mat_sp=None, source_biases=None, target_biases=None,
                   simil_mode='cosine'):
    """
    for each row in specified inds in source_mat calculates top N similar items in target_mat
    :param source_inds: indices into source mat
    :param source_mat: matrix of features for similarity calculation (left side)
    :param target_mat: matrix of features for similarity calculation (right side)
    :param n: number of top elements to retrieve
    :param exclude_mat_sp: a sparse matrix with interactions to exclude
    :param source_biases: bias terms for source_mat
    :param target_biases: bias terms for target_mat
    :param simil_mode: type of similarity calculation:
        'cosine' dot product of normalized matrices (each row sums to 1), without biases
        'dot' regular dot product, without normalization
    :return:
    """

    if not len(source_inds) or \
            0 in target_mat.shape + source_mat.shape:
        return np.array([[]]), np.array([[]])

    if simil_mode == 'cosine':
        scores = cosine_similarity(source_mat[source_inds, :], target_mat)

    elif simil_mode == 'euclidean':
        scores = 1 / (euclidean_distances(source_mat[source_inds, :], target_mat) + 0.001)

    elif simil_mode == 'dot':
        scores = np.dot(source_mat[source_inds, :], target_mat.T)

        if source_biases is not None:
            scores = (scores.T + source_biases[source_inds]).T

        if target_biases is not None:
            scores += target_biases

        if sp.issparse(scores):
            scores = scores.toarray()
        else:
            scores = np.array(scores)

    else:
        raise NotImplementedError('unknown similarity mode')

    if exclude_mat_sp is not None:
        exclude_mat_sp_coo = exclude_mat_sp[source_inds, :].tocoo()
        scores[exclude_mat_sp_coo.row, exclude_mat_sp_coo.col] = -np.inf

    best_inds, best_scores = top_N_unsorted(scores, n)

    sort_inds = _argsort_mask_descending(best_scores)

    return best_inds[sort_inds], best_scores[sort_inds] 
Example 54
Project: SPL-ADVisE   Author: vithursant   File: cluster_dataset.py    GNU General Public License v3.0
def get_cluster(X, pca_dim, num_cluster, dataset_name, save_file = True, topk = 1):

    n = X.shape[0]
    center_nn = np.array([])
    centers_ = ()

    # dimension reduction
    if issparse(X):
        print ('TruncatedSVD of sparse X', (n, X.shape[1]))
        svd = TruncatedSVD(n_components=pca_dim, algorithm='randomized', n_iter=15)
        X_pca = svd.fit_transform(X)
        print ('TruncatedSVD finished')
    elif n > 10000:
        print ('PCA of data size', n)
        pca = PCA(n_components = pca_dim, svd_solver='randomized')
        X_pca = pca.fit_transform(X)
        print ('PCA finished')
    else:
        X_pca = X
        print ('PCA not applied')

    # clustering
    print ('k-means to', num_cluster, 'clusters')
    kmeans = MiniBatchKMeans(n_clusters = num_cluster, max_iter = 100, init_size = 3*num_cluster).fit(X_pca.astype('float64'))    
    labels_ = kmeans.labels_.astype('int32')
    labels_ = np.array([np.where(labels_ == i)[0].astype('int32') for i in range(num_cluster)])
    labels_weight = np.asarray(list(map(len, labels_)))
    #print(labels_weight)
    labels_weight = np.divide(labels_weight,float(np.max(labels_weight)))
    nnz_ind = np.where(labels_weight != 0)[0]
    labels_ = labels_[nnz_ind]
    labels_weight = labels_weight[nnz_ind]
    
    for j in range(len(nnz_ind)):
        centers_ = centers_ + (np.mean(X[labels_[j], :], axis = 0),)
        center_nn = np.append(center_nn, labels_[j][np.argmin(euclidean_distances([kmeans.cluster_centers_[nnz_ind[j]]], X_pca[labels_[j]]))])
    centers_ = np.vstack(centers_)

    if save_file:
        np.savetxt(dataset_name + '_kmeans_labels.txt', kmeans.labels_)
        np.savetxt(dataset_name + '_kmeans_centers.txt', centers_)
        np.savetxt(dataset_name + '_center_nn.txt', center_nn)
        labels_, labels_weight, centers_, center_nn = [],[],[],[]
    else:
        return labels_, labels_weight, centers_, center_nn.astype('int32') 
Example 55
Project: SPL-ADVisE   Author: vithursant   File: cluster_dataset.py    GNU General Public License v3.0
def group_data(X, tsne_dim, num_cluster, dataset_name, save_file = True):

    print('clustering...')
    n = X.shape[0]
    center_nn = np.array([])
    cluster_centers = ()
    if tsne_dim <= 0 and not issparse(X) and n <= 10000:
        X_tsne = X
    elif n > 10000:
        if tsne_dim == 0:
            tsne_dim = 48
        print('TruncatedSVD of data size', (n, X.shape[1]))
        svd = TruncatedSVD(n_components=tsne_dim, algorithm='randomized', n_iter=10, random_state=42)
        X_tsne = svd.fit_transform(X)
        #print(len(X_tsne))
        print('finish TruncatedSVD.')
    else:
        print('PCA of data size', n)
        pca = PCA(n_components = tsne_dim)
        X_tsne = pca.fit_transform(X)
        print('finish PCA.')
    print('k-means to', num_cluster, 'clusters')
    #reduced_data = X_tsne.astype('float64')
    kmeans = KMeans(n_clusters = num_cluster, max_iter = 50, n_jobs=4).fit(X_tsne.astype('float64'))
    cluster_label = kmeans.labels_
    for j in range(num_cluster):
        jIndex = np.where(cluster_label==j)[0]
        centerj = kmeans.cluster_centers_[j] #np.mean(X[jIndex, :], axis = 0)
        cluster_centers = cluster_centers + (centerj,)
        center_nn = np.append(center_nn, jIndex[np.argmin(euclidean_distances([kmeans.cluster_centers_[j]], X_tsne[jIndex]))])

    cluster_centers = np.vstack(cluster_centers)

    if save_file:
        np.savetxt(dataset_name + '_kmeans_labels.txt', cluster_label)
        np.savetxt(dataset_name + '_kmeans_centers.txt', cluster_centers)
        np.savetxt(dataset_name + '_center_nn.txt', center_nn)
        cluster_label, cluster_centers, center_nn = [],[],[]
    else:
        #print("Visualize KMeans")
        #visualize_kmeans(X, cluster_label)
        #reduced_data = PCA(n_components=2).fit_transform(X[:100])
        #kmeans = KMeans(n_clusters = num_cluster, max_iter = 50, n_jobs=4).fit(reduced_data)
        #plot_kmeans_2d(kmeans, reduced_data, cluster_label)
        #print("Done visualizing kmeans")
        #exit()
        return cluster_label, cluster_centers, center_nn 
Example 56
Project: popsom   Author: njali2001   File: popsom.py    GNU General Public License v3.0
def smooth_2d(self, Y, ind=None, weight_obj=None, grid=None, nrow=64, ncol=64, surface=True, theta=None):
		""" smooth_2d -- Kernel Smoother For Irregular 2-D Data """

		def exp_cov(x1, x2, theta=2, p=2, distMat=0):
			x1 = x1*(1/theta)
			x2 = x2*(1/theta)
			distMat = euclidean_distances(x1, x2)
			distMat = distMat**p
			return np.exp(-distMat)

		NN = [[1]*ncol] * nrow
		grid = {'x': [i for i in range(nrow)], "y": [i for i in range(ncol)]}

		if weight_obj is None:
			dx = grid['x'][1] - grid['x'][0]
			dy = grid['y'][1] - grid['y'][0]
			m = len(grid['x'])
			n = len(grid['y'])
			M = 2 * m
			N = 2 * n
			xg = []

			for i in range(N):
				for j in range(M):
					xg.extend([[j, i]])

			xg = np.matrix(xg)

			center = []
			center.append([int(dx * M/2-1), int((dy * N)/2-1)])

			out = exp_cov(xg, np.matrix(center),theta=theta)
			out = np.matrix.transpose(np.reshape(out, (N, M)))
			temp = np.zeros((M, N))
			temp[int(M/2-1)][int(N/2-1)] = 1

			wght = np.fft.fft2(out)/(np.fft.fft2(temp) * M * N)
			weight_obj = {"m": m, "n": n, "N": N, "M": M, "wght": wght}

		temp = np.zeros((weight_obj['M'], weight_obj['N']))
		temp[0:weight_obj['m'], 0:weight_obj['n']] = Y
		temp2 = np.fft.ifft2(np.fft.fft2(temp) *
							 weight_obj['wght']).real[0:weight_obj['m'],
													  0:weight_obj['n']]

		temp = np.zeros((weight_obj['M'], weight_obj['N']))
		temp[0:weight_obj['m'], 0:weight_obj['n']] = NN
		temp3 = np.fft.ifft2(np.fft.fft2(temp) *
							 weight_obj['wght']).real[0:weight_obj['m'],
													  0:weight_obj['n']]

		return temp2/temp3 
Example 57
Project: dhSegment   Author: dhlab-epfl   File: line_vectorization.py    GNU General Public License v3.0
def find_lines(lines_mask: np.ndarray) -> list:
    """
    Finds the longest central line for each connected component in the given binary mask.

    :param lines_mask: Binary mask of the detected line-areas
    :return: a list of Opencv-style polygonal lines (each contour encoded as [N,1,2] elements where each tuple is (x,y) )
    """
    # Make sure one-pixel wide 8-connected mask
    lines_mask = skeletonize(lines_mask)

    class MakeLineMCP(MCP_Connect):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.connections = dict()
            self.scores = defaultdict(lambda: np.inf)

        def create_connection(self, id1, id2, pos1, pos2, cost1, cost2):
            k = (min(id1, id2), max(id1, id2))
            s = cost1 + cost2
            if self.scores[k] > s:
                self.connections[k] = (pos1, pos2, s)
                self.scores[k] = s

        def get_connections(self, subsample=5):
            results = dict()
            for k, (pos1, pos2, s) in self.connections.items():
                path = np.concatenate([self.traceback(pos1), self.traceback(pos2)[::-1]])
                results[k] = path[::subsample]
            return results

        def goal_reached(self, int_index, float_cumcost):
            if float_cumcost > 0:
                return 2
            else:
                return 0

    if np.sum(lines_mask) == 0:
        return []
    # Find extremities points
    end_points_candidates = np.stack(np.where((convolve2d(lines_mask, np.ones((3, 3)), mode='same') == 2) & lines_mask)).T
    connected_components = skimage_label(lines_mask, connectivity=2)
    # Group endpoint by connected components and keep only the two points furthest away
    d = defaultdict(list)
    for pt in end_points_candidates:
        d[connected_components[pt[0], pt[1]]].append(pt)
    end_points = []
    for pts in d.values():
        dists = euclidean_distances(np.stack(pts), np.stack(pts))
        i, j = np.unravel_index(dists.argmax(), dists.shape)
        end_points.append(pts[i])
        end_points.append(pts[j])
    end_points = np.stack(end_points)

    mcp = MakeLineMCP(~lines_mask)
    mcp.find_costs(end_points)
    connections = mcp.get_connections()
    if not np.all(np.array(sorted([i for k in connections.keys() for i in k])) == np.arange(len(end_points))):
        print('Warning : find_lines seems weird')
    return [c[:, None, ::-1] for c in connections.values()] 
Example 58
Project: Concurrent_AP   Author: GGiecold   File: Concurrent_AP.py    MIT License
def set_preference(data, chunk_size):
    """Return the median of the distribution of pairwise L2 Euclidean distances 
        between samples (the rows of 'data') as the default preference parameter
        for Affinity Propagation clustering.

    Parameters
    ----------
    data : array of shape (N_samples, N_features)
        The data-set submitted for Affinity Propagation clustering.
        
    chunk_size : int
        The size of random subsamples from the data-set whose similarity
        matrix is computed. The resulting median of the distribution of 
        pairwise distances between the data-points selected as part of a
        given subsample is stored into a list of medians. 

    Returns
    -------
    preference : float
        The preference parameter for Affinity Propagation clustering is computed
        as the median of the list of median pairwise distances between the data-points
        selected as part of each of 15 rounds of random subsampling.
    """

    N_samples, N_features = data.shape
    
    rng = np.arange(0, N_samples, dtype = int)
    medians = []
    
    for i in range(15):
        selected_samples = np.random.choice(N_samples, size = chunk_size, replace = False)
        samples = data[selected_samples, :]
                
        S = - euclidean_distances(samples, data, squared = True)
                
        n = chunk_size * N_samples - (chunk_size * (chunk_size + 1) / 2)
                
        rows = np.zeros(0, dtype = int)
        for j in range(chunk_size):
            rows = np.append(rows, np.full(N_samples - j, j, dtype = int))

        cols = np.zeros(0, dtype = int)
        for j in range(chunk_size):
            cols = np.append(cols, np.delete(rng, selected_samples[:j+1]))
                        
        triu_indices = tuple((rows, cols))
                
        preference = np.median(S, overwrite_input = True)
        medians.append(preference)
                
        del S
                
        if i % 4 == 3:
            gc.collect()       
            
    preference = np.median(medians)

    return preference 
Example 59
Project: HOTT   Author: IBM   File: data.py    MIT License
def loader(data_path,
           embeddings_path,
           p=1,
           K_lda=70,
           glove_embeddings=True,
           stemming=True):
    """ Load dataset and embeddings from data path."""
    # Load dataset from data_path
    vocab, embed_vocab, bow_data, y = load_wmd_data(data_path)
    y = y - 1

    # Use GloVe word embeddings
    if glove_embeddings:
        vocab, embed_vocab, bow_data = change_embeddings(
            vocab, bow_data, embeddings_path)
    # Reduce vocabulary by removing short words, stop words, and stemming
    if stemming:
        vocab, embed_vocab, bow_data = reduce_vocab(
            bow_data, vocab, embed_vocab, embed_aggregate='mean')

    # Get embedded documents
    embed_data = get_embedded_data(bow_data, embed_vocab, vocab)
    # Matrix of word embeddings
    embeddings = np.array([embed_vocab[w] for w in vocab])

    topics, lda_centers, topic_proportions = fit_topics(
        bow_data, embeddings, vocab, K_lda)

    cost_embeddings = euclidean_distances(embeddings, embeddings) ** p
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))

    for i in range(cost_topics.shape[0]):
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = ot.emd2(topics[i], topics[j], cost_embeddings)
    cost_topics = cost_topics + cost_topics.T

    out = {'X': bow_data, 'y': y,
           'embeddings': embeddings,
           'topics': topics, 'proportions': topic_proportions,
           'cost_E': cost_embeddings, 'cost_T': cost_topics}

    return out