Python sklearn.metrics.pairwise.cosine_distances() Examples

The following code examples show how to use sklearn.metrics.pairwise.cosine_distances(). They are taken from open-source Python projects. You can vote up the examples you like or vote down the ones you don't.

Example 1
Project: DHGNN   Author: iMoonLab   File: construct_hypergraph.py    MIT License 6 votes vote down vote up
def construct_H_with_KNN(X, K_neigs=None, is_probH=False, m_prob=1):
    """
    Init multi-scale hypergraph vertex-edge matrix from an original node feature matrix.

    :param X: numpy array, N_object x feature_number
    :param K_neigs: int or list of int, the neighborhood size(s); defaults to [10]
    :param is_probH: bool, probabilistic vertex-edge matrix instead of binary
    :param m_prob: prob parameter forwarded to the per-scale construction
    :return: N_object x N_hyperedge incidence matrix
    """
    # Avoid the mutable default argument [10]; normalize to a fresh list per call.
    if K_neigs is None:
        K_neigs = [10]

    # Collapse any leading dimensions so X is 2-D: (N_object, feature_number).
    if len(X.shape) != 2:
        X = X.reshape(-1, X.shape[-1])

    # Accept a bare int for a single scale (isinstance, not type() ==).
    if isinstance(K_neigs, int):
        K_neigs = [K_neigs]

    dis_mat = cos_dis(X)  # pairwise cosine-distance matrix over the rows of X
    H = None
    for k_neig in K_neigs:
        # One hyperedge group per neighborhood scale, concatenated column-wise.
        H_tmp = construct_H_with_KNN_from_distance(dis_mat, k_neig, is_probH, m_prob)
        H = hyperedge_concat(H, H_tmp)
    return H
Example 2
Project: deep-align   Author: prokolyvakis   File: ppdb_utils.py    Apache License 2.0 6 votes vote down vote up
def createAnts(syns, ants, neg_num, start=0):
    """For each synonym, collect the `neg_num` antonym embeddings whose
    representations are closest by cosine distance, starting at rank `start`.

    Returns a dict mapping each synonym's phrase to its list of embeddings.
    """
    syn_reprs = np.array([syn.representation for syn in syns])
    ant_reprs = np.array([ant.representation for ant in ants])

    dist = cosine_distances(syn_reprs, ant_reprs)
    # Per-synonym ranking of antonyms, nearest first.
    order = np.argsort(dist)

    result = {}
    for row, syn in enumerate(syns):
        picked = [ants[order[row, rank]].embeddings
                  for rank in range(start, start + neg_num)]
        result[syn.phrase] = picked
    return result
Example 3
Project: scattertext   Author: JasonKessler   File: pairplot.py    Apache License 2.0 5 votes vote down vote up
def produce_category_focused_pairplot(corpus,
                                      category,
                                      category_projector=CategoryProjector(projector=PCA(20)),
                                      category_projection=None,
                                      **kwargs):
    '''
    Produces a pair-plot which is focused on a single category.

    :param corpus: TermDocMatrix
    :param category: str, name of a category in the corpus
    :param category_projector: CategoryProjector, a factor analysis of the category/feature vector
    :param category_projection: CategoryProjection, None by default. If present, overrides category projector
    :param kwargs: remaining kwargs for produce_pairplot
    :return: str, HTML
    '''

    # Position of the focus category within the corpus category list.
    category_num = corpus.get_categories().index(category)

    # Use the precomputed projection if supplied, otherwise project the corpus now.
    uncorrelated_components_projection = (category_projector.project(corpus)
                                          if category_projection is None
                                          else category_projection)

    # Pairwise cosine distances between category embeddings (columns -> rows via .T).
    distances = cosine_distances(uncorrelated_components_projection.get_category_embeddings().T)

    # Map each category's distance rank to a score; nearer categories score higher.
    similarity_to_category_scores = -2 * (rankdata(distances[category_num]) - 0.5)

    uncorrelated_components = uncorrelated_components_projection.get_projection()

    # NOTE(review): each generated item is a one-element list [(abs_corr, i)];
    # min() compares those lists lexicographically, which is equivalent to
    # minimizing abs_corr, and [0][1] then extracts the dimension index i —
    # i.e. the projection dimension least correlated with the similarity scores.
    least_correlated_dimension = min([(np.abs(pearsonr(similarity_to_category_scores,
                                                       uncorrelated_components.T[i])[0]), i)]
                                     for i in range(uncorrelated_components.shape[1]))[0][1]

    # Plot axes: least-correlated component vs. similarity-to-category score.
    projection_to_plot = np.array([uncorrelated_components.T[least_correlated_dimension],
                                   similarity_to_category_scores]).T

    return produce_pairplot(corpus,
                            initial_category=category,
                            category_projection=uncorrelated_components_projection.use_alternate_projection(
                                projection_to_plot),
                            category_focused=True,
                            **kwargs)
Example 4
Project: clustering4docs   Author: lovit   File: _kmeans.py    GNU General Public License v3.0 5 votes vote down vote up
def _transform(self, X):
        """Core of the transform step (callers perform input validation):
        cosine distance from every sample in X to each cluster center."""
        centers = self.cluster_centers_
        return cosine_distances(X, centers)
Example 5
Project: tokenquery   Author: ramtinms   File: vector_opr.py    GNU General Public License v3.0 5 votes vote down vote up
def vec_cos_dist(token_input, operation_input):
    """Parse `operation_input` as "<ref_vector><op><value>" and compare the
    cosine distance between the token vector and the reference vector against
    the value using the parsed operator. Returns the comparison result, False
    on malformed input, or None if no operator could be parsed."""
    parsed_op = None
    ref_part = None
    value_part = None
    # Two-character operators must be tried before '<', '>' and '='.
    for candidate in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if candidate in operation_input:
            pieces = operation_input.split(candidate)
            ref_part = pieces[0]
            value_part = pieces[1]
            parsed_op = candidate
            break

    if not (ref_part and value_part and parsed_op):
        # TODO raise tokenregex error
        print ('Problem with the operation input')
        return None

    try:
        threshold = float(value_part)
        ref_vector = change_string_to_vector(ref_part)
        token_vector = change_string_to_vector(token_input)
        if len(ref_vector) != len(token_vector):
            print ('len of vectors does not match')
            return False
        # Single distance computation shared by every comparison branch.
        distance = cosine_distances(token_vector, ref_vector)
        if parsed_op in ("=", "=="):
            return distance == threshold
        if parsed_op == "<":
            return distance < threshold
        if parsed_op == ">":
            return distance > threshold
        if parsed_op == ">=":
            return distance >= threshold
        if parsed_op == "<=":
            return distance <= threshold
        if parsed_op in ("!=", "<>"):
            return distance != threshold
        return False
    except ValueError:
        # TODO raise tokenregex error
        return False
Example 6
Project: Firmware_Slap   Author: ChrisTheCoolHut   File: firmware_clustering.py    GNU General Public License v3.0 5 votes vote down vote up
def get_cosine_dist(all_functions):
    """Vectorize the functions, L2-normalize the sparse feature matrix, and
    return the pairwise cosine-distance matrix between all functions.

    :param all_functions: iterable of function records accepted by funcs_to_sparse
    :return: ndarray of pairwise cosine distances
    """
    # The vectorizer returned by funcs_to_sparse is not needed here; the
    # original also created an unused `return_dict`.
    _, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)
    func_sparse = transformer.transform(func_sparse)

    # cosine_distances(X) is equivalent to cosine_distances(X, X).
    return cosine_distances(func_sparse, func_sparse)
Example 7
Project: chameleon_recsys   Author: gabrielspmoreira   File: metrics.py    MIT License 5 votes vote down vote up
def cosine_distance(v1, v2):
    """Cosine distance between v1 and v2, rescaled to the interval [0.0, 1.0].

    Cosine similarity lies in [-1.0, 1.0], so the raw cosine distance lies in
    [0.0, 2.0]; halving it normalizes the result to [0.0, 1.0].
    """
    raw_distance = pairwise.cosine_distances(v1, v2)
    return raw_distance / 2.0



#For ranks index starting from 0 
Example 8
Project: DHGNN   Author: iMoonLab   File: construct_hypergraph.py    MIT License 5 votes vote down vote up
def _construct_edge_list_from_distance(X, k_neigh):
    """
    Construct an edge list (numpy array) from kNN cosine distances for a single modality.
    :param X -> numpy array: feature matrix
    :param k_neigh -> int: number of neighbors per node
    :return: N x k_neigh numpy array of neighbor indices
    """
    distances = torch.Tensor(cos_dis(X))
    # Smallest distances are the nearest neighbors, hence largest=False.
    _, neighbor_idx = distances.topk(k_neigh, dim=-1, largest=False)
    return neighbor_idx.numpy()
Example 9
Project: celeb-detection-oss   Author: Giphy   File: face_recognizer.py    Mozilla Public License 2.0 5 votes vote down vote up
def _distance(x1, x2):
        # Pairwise cosine distances between the two embedding batches.
        result = cosine_distances(x1, x2)
        return result
Example 10
Project: ml-recsys-tools   Author: DomainGroupOSS   File: factorisation_clustering.py    MIT License 5 votes vote down vote up
def cluster_factors(self, verbose=False):
        """Cluster user and item latent factors jointly, then cache the
        per-group cluster labels and each cluster center's nearest centers."""
        u_f = self.factoriser.user_factors_dataframe()
        i_f = self.factoriser.item_factors_dataframe()
        # u_f, user_centers = add_clusters(u_f, n_clusters, True)
        # i_f, item_centers = add_clusters(i_f, n_clusters, True)
        # Stack users above items so a single clustering covers both factor sets.
        df_factors = pd.concat([u_f, i_f], sort=False)
        self._calc_clusters(df_factors, verbose=verbose)

        # Rows were concatenated users-first, so the label vector splits at len(u_f).
        self.u_cluster_labels = self.cluster_labels[:len(u_f)]
        self.i_cluster_labels = self.cluster_labels[len(u_f):]

        # Negate distances so top_N_sorted ranks the closest centers first.
        self.center_neighbours, _ = top_N_sorted(
            -cosine_distances(self.centers), len(self.centers))
Example 11
Project: wine-ml-on-aws-lambda   Author: pierreant   File: test_pairwise.py    Apache License 2.0 5 votes vote down vote up
def test_cosine_distances():
    # Sanity checks for the pairwise cosine-distance computation.
    rng = np.random.RandomState(1337)
    base = np.abs(rng.rand(910))

    # Two identical rows: all pairwise distances must be (almost) zero.
    pair_same = np.vstack([base, base])
    dist_same = cosine_distances(pair_same)
    assert_array_almost_equal(dist_same, [[0., 0.], [0., 0.]])
    # Every entry lies in [0, 2].
    assert_true(np.all(dist_same >= 0.))
    assert_true(np.all(dist_same <= 2.))
    # The diagonal is zero.
    assert_array_almost_equal(dist_same[np.diag_indices_from(dist_same)], [0., 0.])

    # A row and its negation: off-diagonal distance must be 2.
    pair_opposite = np.vstack([base, -base])
    dist_opposite = cosine_distances(pair_opposite)
    assert_true(np.all(dist_opposite >= 0.))
    assert_true(np.all(dist_opposite <= 2.))
    assert_array_almost_equal(dist_opposite, [[0., 2.], [2., 0.]])

    # Large random matrix: zero diagonal, all values within [0, 2].
    big = np.abs(rng.rand(1000, 5000))
    dist_big = cosine_distances(big)
    assert_array_almost_equal(dist_big[np.diag_indices_from(dist_big)],
                              [0.] * dist_big.shape[0])
    assert_true(np.all(dist_big >= 0.))
    assert_true(np.all(dist_big <= 2.))
Example 12
Project: QA   Author: KiddoZhu   File: search.py    Apache License 2.0 5 votes vote down vote up
def embedding(self, value) :
		# Setter: stores the embedding model, then (re)builds the kNN index over
		# the embedded database texts and pickles it to disk. (Python 2 code.)
		self._embedding = value
		print "Building knn..."
		vectors = []
		self.id2title = {}
		# Embed every database entry; jieba segments the text before lookup.
		for id, (text, attrib) in enumerate(self.database) :
			self.id2title[id] = attrib["title"]
			vectors.append(self.embedding[jieba.cut(text)])
		# NOTE(review): a callable metric forces brute-force neighbor search in sklearn.
		self.knn = NearestNeighbors(n_neighbors = N_NEIGHBORS, metric = cosine_distances, n_jobs = 64)
		with warnings.catch_warnings() :
			warnings.filterwarnings("ignore", category = DeprecationWarning)
			self.knn.fit(vectors)
		# Persist the fitted index for later reuse.
		pickle.dump(self.knn, open("dump/knn_%d_w2v.dump" % N_NEIGHBORS, "w"))
Example 13
Project: GDAN   Author: stevehuanghe   File: utils.py    MIT License 5 votes vote down vote up
def kNN_classify(*, x, y):
    """
    For each row of x, find the nearest row of y by cosine distance and
    return the corresponding entries of y.

    Note: despite the variable name `idx`, this returns y[argmin] — the rows
    (values) of y at the nearest-neighbor positions — not the raw indices.
    :param x: n*d matrix
    :param y: m*d matrix
    :return: n entries taken from y, one per row of x
    """
    ds = cosine_distances(x, y)  # n*m distance matrix
    idx = y[np.argmin(ds, axis=1)]  # nearest y entry per x row
    return idx
Example 14
Project: tg2019task   Author: umanlp   File: baseline_tfidf.py    MIT License 5 votes vote down vote up
def main():
    """Rank, for every question, the `--nearest` explanation sentences by
    TF-IDF cosine distance and print (questionID, explanation uid) pairs."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nearest', type=int, default=10)
    parser.add_argument('tables')
    parser.add_argument('questions', type=argparse.FileType('r', encoding='UTF-8'))
    args = parser.parse_args()

    explanations = []

    # Gather every explanation row from all table files under args.tables.
    for path, _, files in os.walk(args.tables):
        for file in files:
            explanations += read_explanations(os.path.join(path, file))

    if not explanations:
        warnings.warn('Empty explanations')

    df_q = pd.read_csv(args.questions, sep='\t', dtype=str)
    df_e = pd.DataFrame(explanations, columns=('uid', 'text'))

    # BUG FIX: chaining .fit(questions).fit(texts) discarded the first fit —
    # the second call refits from scratch, losing the question vocabulary.
    # Fit once on both corpora instead.
    vectorizer = TfidfVectorizer().fit(pd.concat([df_q['Question'], df_e['text']]))
    X_q = vectorizer.transform(df_q['Question'])
    X_e = vectorizer.transform(df_e['text'])
    X_dist = cosine_distances(X_q, X_e)

    # For each question, print its N nearest explanations, closest first.
    for i_question, distances in enumerate(X_dist):
        for i_explanation in np.argsort(distances)[:args.nearest]:
            print('{}\t{}'.format(df_q.loc[i_question]['questionID'], df_e.loc[i_explanation]['uid']))
Example 15
Project: clustering4docs   Author: lovit   File: _kmeans.py    GNU General Public License v3.0 4 votes vote down vote up
def _k_init(X, n_clusters, random_state):
    """Init n_clusters seeds according to k-means++,
    modified for spherical k-means (cosine distance instead of Euclidean).

    Parameters
    -----------
    X : sparse matrix, shape (n_samples, n_features)
    n_clusters : integer
        The number of seeds to choose
    random_state : numpy.RandomState
        The generator used to initialize the centers.

    Notes
    -----
    Selects initial cluster centers for k-mean clustering in a smart way
    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
    on Discrete algorithms. 2007
    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
    which is the implementation used in the aforementioned paper.
    """

    n_samples, n_features = X.shape

    centers = np.empty((n_clusters, n_features), dtype=X.dtype)

    # Set the number of local seeding trials if none is given
    # This is what Arthur/Vassilvitskii tried, but did not report
    # specific results for other than mentioning in the conclusion
    # that it helped.
    # NOTE(review): n_local_trials is computed but never used below — each
    # round samples a single candidate rather than the best of several trials.
    n_local_trials = 2 + int(np.log(n_clusters))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id].toarray()

    # Initialize list of closest distances and calculate current potential
    # (squared cosine distance from every sample to its nearest chosen center).
    closest_dist_sq = cosine_distances(centers[0, np.newaxis], X)[0] ** 2
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        # (inverse-CDF sampling over the cumulative potential).
        rand_vals = random_state.random_sample() * current_pot
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                        rand_vals)

        centers[c] = X[candidate_ids].toarray()

        # Compute distances to center candidates and fold the new center into
        # the per-sample closest-distance bookkeeping.
        new_dist_sq = cosine_distances(X[candidate_ids,:], X)[0] ** 2
        closest_dist_sq = np.minimum(new_dist_sq, closest_dist_sq)
        current_pot = closest_dist_sq.sum()

    return centers
Example 16
Project: deep-align   Author: prokolyvakis   File: marriage.py    Apache License 2.0 4 votes vote down vote up
def ontology_alignment(model, ontoTerms_a, ontoTerms_b, words, ceil = 0.5):
    """Align the terms of two ontologies.

    Exact (whitespace-normalized) string matches are paired with distance 0.0
    and removed from further consideration; the remaining terms are embedded
    with `model` and matched with a stable-marriage procedure over their
    pairwise cosine distances.

    :param model: embedding model exposing feedforward_function
    :param ontoTerms_a: path to a file with one term of ontology A per line
    :param ontoTerms_b: path to a file with one term of ontology B per line
    :param words: vocabulary used by getSeq to index the terms
    :param ceil: maximum cosine distance for a matched pair to be kept
    :return: list of [term_a, term_b, distance] triples
    """
    with open(ontoTerms_a) as f:
        ontoText_a = f.readlines()
    with open(ontoTerms_b) as f:
        ontoText_b = f.readlines()
    # Remove whitespace characters like `\n` at the end of each line.
    ontoText_a = [x.strip() for x in ontoText_a]
    ontoText_b = [x.strip() for x in ontoText_b]

    whole = []
    # BUG FIX: the original iterated ontoText_a / ontoText_b directly while
    # calling .remove() on them, which silently skips elements during
    # iteration; iterate over snapshots and mutate the real lists.
    for text_a in list(ontoText_a):
        for text_b in list(ontoText_b):
            txt_a = re.sub(' +', ' ', text_a)
            txt_b = re.sub(' +', ' ', text_b)
            if txt_a == txt_b:
                whole.append([text_a, text_b, 0.0])
                try:
                    ontoText_a.remove(text_a)
                except ValueError:
                    pass
                    #print(text_a)
                try:
                    ontoText_b.remove(text_b)
                except ValueError:
                    pass
                    #print(text_b)
    # Transform to word & mask vectors to apply "feedforward_function".
    ontoData_a = [getSeq(sentence, words) for sentence in ontoText_a]
    ontoData_b = [getSeq(sentence, words) for sentence in ontoText_b]
    x1, m1 = utils.prepare_data(ontoData_a)
    x2, m2 = utils.prepare_data(ontoData_b)
    OntoEmbg_a = model.feedforward_function(x1, m1)
    OntoEmbg_b = model.feedforward_function(x2, m2)
    # Compute the cosine distances (transpose gives the preferences of side B).
    dist = cosine_distances(OntoEmbg_a, OntoEmbg_b)
    disT = np.transpose(dist)

    males = preferances(dist)
    females = preferances(disT)
    del disT
    match = Matcher(males, females)
    marriage = match()
    del males
    del females

    # Keep only matched pairs whose distance is below the ceiling.
    for key, value in marriage.items():
        man = ontoText_a[value]
        woman = ontoText_b[key]
        value = dist[value][key]
        if value < ceil:
            whole.append([man, woman, value])
    return whole