# Python sklearn.metrics.pairwise.cosine_similarity() Examples

The following are 30 code examples of sklearn.metrics.pairwise.cosine_similarity(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.metrics.pairwise, or try the search function.
Example #1
```def cos_sim(ind1,ind2=1999):
#val = []
MAP=0
for i,j in enumerate(view1):
val=[]
AP=0
for x in view2:
val.append(cosine_similarity(j,x)[0].tolist())
#val=val[0].tolist()
#print val[0].tolist()
val=[(q,p)for p,q in enumerate(val)]
#print val
val.sort()
val.reverse()
t = [w[1]for w in val[0:7]]
for x,y in enumerate(t):
if y in range(i,i+5):
AP+=1/(x+1)
print(t)
print(AP)
MAP+=AP
print('MAP is : ',MAP/ind1) ```
Example #2
```def test_cosine_similarity():
# Test the cosine_similarity.

rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))
Y = rng.random_sample((3, 4))
Xcsr = csr_matrix(X)
Ycsr = csr_matrix(Y)

for X_, Y_ in ((X, None), (X, Y),
(Xcsr, None), (Xcsr, Ycsr)):
# Test that the cosine is kernel is equal to a linear kernel when data
# has been previously normalized by L2-norm.
K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
X_ = normalize(X_)
if Y_ is not None:
Y_ = normalize(Y_)
K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
assert_array_almost_equal(K1, K2) ```
Example #3
```def transform(self, X: dt.Frame):
X.replace([None, math.inf, -math.inf], self._repl_val)
from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
if self.embedding_name in ["glove", "en"]:
self.embedding = WordEmbeddings(self.embedding_name)
elif self.embedding_name in ["bert"]:
self.embedding = BertEmbeddings()
self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
output = []
X = X.to_pandas()
text1_arr = X.iloc[:, 0].values
text2_arr = X.iloc[:, 1].values
for ind, text1 in enumerate(text1_arr):
try:
text1 = Sentence(str(text1).lower())
self.doc_embedding.embed(text1)
text2 = text2_arr[ind]
text2 = Sentence(str(text2).lower())
self.doc_embedding.embed(text2)
score = cosine_similarity(text1.get_embedding().reshape(1, -1),
text2.get_embedding().reshape(1, -1))[0, 0]
output.append(score)
except:
output.append(-99)
return np.array(output) ```
Example #4
```def compared(request):
if request.method == 'POST':
if len(request.FILES) != 2:
return HttpResponse('{"status":false,"data":"","msg":"图片参数错误！"}')
starttime = time.time()
name1 = str(random.randint(10000, 99999)) + str(time.time())  # 随机名字
name2 = str(random.randint(10000, 99999)) + str(time.time())

tz1 = get_feature(root + "RestServer/upload/" + str(name1))

tz2 = get_feature(root + "RestServer/upload/" + str(name2))

comparedValue = pw.cosine_similarity(tz1, tz2)[0][0]

endtime = time.time()
Runtime=endtime-starttime
return HttpResponse('{"status":true,"data":"' + str(comparedValue) + '","msg":"成功","runtime": ' + str(Runtime) + '  }')
else:
return HttpResponse('{"status":false,"data":"","msg":"请求不合法"}')
return HttpResponse('{"status":false,"data":"","msg":"未知错误"}') ```
Example #5
```def get_closest_docs(uri):
#user_doc = requests.get(uri).text
r = requests.get(uri)
if r.status_code == 200:
user_doc = r.text
print("URI content length",len(user_doc))
normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)
model.random.seed(0)
user_vector = model.infer_vector(normalized_code)
print("finding similar...")
sys.stdout.flush()
stored_urls = list()
stored_vectors = list()
for url in vectors:
stored_urls.append(url)
stored_vectors.append(vectors[url])
pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
indices = (-pair_sims[0]).argsort()[:5]
return [(stored_urls[index],round(float(pair_sims[0][index]),2)) for index in indices]
else:
print("URL returned status code", r.status_code)
raise ValueError('URL error') ```
Example #6
```python
def clustering(self, threshold):
    """Cluster the words of each category separately (by part of speech).

    :return: partition: dict {word_id: cluster_id}
    """
    print("Louvain clustering")
    partition = {}
    # Cluster ids of each sub-problem are shifted by this offset so that ids
    # coming from different entity types never collide.
    part_offset = 0
    for etype, ners in self.type_entity_dict.items():
        # Row indices into self.emb_mat for the entities that have a word id.
        sub_id_mapping = [self.word2id[ner0] for ner0 in ners if ner0 in self.word2id]
        if len(sub_id_mapping) == 0:
            continue
        emb_mat_sub = self.emb_mat[sub_id_mapping, :]
        cos_sims = cosine_similarity(emb_mat_sub)
        # Subtract the identity matrix from the diagonal (self-similarity).
        cos_sims -= np.eye(len(emb_mat_sub))
        # NOTE(review): `G` is not defined anywhere in the visible code, and
        # neither `threshold` nor `cos_sims` is used after this point --
        # presumably a graph-construction step is missing; confirm against
        # the original project.
        partition_sub = community.best_partition(G)
        for sub_id, main_id in enumerate(sub_id_mapping):
            sub_part_id = partition_sub[sub_id]
            partition[main_id] = sub_part_id + part_offset
        part_offset += max(partition_sub.values()) + 1
    return partition
```
Example #7
```def cosine_sim(x, y):
try:
if type(x) is np.ndarray: x = x.reshape(1, -1) # get rid of the warning
if type(y) is np.ndarray: y = y.reshape(1, -1)
d = cosine_similarity(x, y)
d = d[0][0]
except:
print x
print y
d = 0.
return d

#   Copyright 2017 Cisco Systems, Inc.
#
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#
#   Unless required by applicable law or agreed to in writing, software
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License. ```
Example #8
```def _get_similarity_values(self, q1_csc, q2_csc):
cosine_sim = []
manhattan_dis = []
eucledian_dis = []
jaccard_dis = []
minkowsk_dis = []

for i,j in zip(q1_csc, q2_csc):
sim = cs(i, j)
cosine_sim.append(sim[0][0])
sim = md(i, j)
manhattan_dis.append(sim[0][0])
sim = ed(i, j)
eucledian_dis.append(sim[0][0])
i_ = i.toarray()
j_ = j.toarray()
try:
sim = jsc(i_, j_)
jaccard_dis.append(sim)
except:
jaccard_dis.append(0)

sim = minkowski_dis.pairwise(i_, j_)
minkowsk_dis.append(sim[0][0])
return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis ```
Example #9
```def _compute_sim(self, R, k):
# compute the similarity between all the items. This calculates the
# similarity between each ITEM
sim = cosine_similarity(R.T)

# Only keep the similarities of the top K, setting all others to zero
# (negative since we want descending)
not_top_k = np.argsort(-sim, axis=1)[:, k:]  # shape=(n_items, k)

if not_top_k.shape[1]:  # only if there are cols (k < n_items)
# now we have to set these to zero in the similarity matrix
row_indices = np.repeat(range(not_top_k.shape[0]),
not_top_k.shape[1])
sim[row_indices, not_top_k.ravel()] = 0.

return sim ```
Example #10
```def decision_function(self, X):
"""Evaluate the cosine similarity between document-term matrix and X.

Parameters
----------
X : array-like, shape (n_samples, n_timestamps)
Test samples.

Returns
-------
X : array-like, shape (n_samples, n_classes)
osine similarity between the document-term matrix and X.

"""
check_is_fitted(self, ['vocabulary_', 'tfidf_', 'idf_',
'_tfidf', 'classes_'])
X = check_array(X)
X_bow = self._bow.transform(X)
vectorizer = CountVectorizer(vocabulary=self._tfidf.vocabulary_)
X_transformed = vectorizer.transform(X_bow).toarray()
return cosine_similarity(X_transformed, self.tfidf_) ```
Example #11
```def test_init():
default = Spanning_Forest()
assert default.metric == skm.manhattan_distances
assert default.center == np.mean
assert default.reduction == np.sum
change = Spanning_Forest(dissimilarity=skm.euclidean_distances,
center=np.median, reduction=np.max)
assert change.metric == skm.euclidean_distances
assert change.center == np.median
assert change.reduction == np.max

sym = Spanning_Forest(affinity=skm.cosine_similarity)
assert isinstance(sym.metric, types.LambdaType)
test_distance = -np.log(skm.cosine_similarity(data[:2,]))
comparator = sym.metric(data[:2,])
np.testing.assert_allclose(test_distance, comparator) ```
Example #12
```def save_model(model: Model, tokenizer: Tokenizer):
"""
Saves the important parts of the model
:param model: Keras model to save
:param tokenizer: Keras Tokenizer to save
"""
for layer in model.layers:
if '_biases' in layer.name or '_embeddings' in layer.name:
np.save(file=f'{OUTPUT_FOLDER}{layer.name}', arr=layer.get_weights()[0])

# save tokenizer
pickle.dump(obj=tokenizer.index_word, file=open(f'{OUTPUT_FOLDER}{INDEX2WORD}', 'wb'))
pickle.dump(obj=tokenizer.word_index, file=open(f'{OUTPUT_FOLDER}{WORD2INDEX}', 'wb'))

# save combined embeddings & correlation matrix

np.save(file=f'{OUTPUT_FOLDER}{AGGREGATED_EMBEDDINGS}', arr=agg_embeddings)
np.save(file=f'{OUTPUT_FOLDER}{CORRELATION_MATRIX}', arr=cosine_similarity(cosine_similarity(agg_embeddings))) ```
Example #13
```def test_cosine_similarity():
# Test the cosine_similarity.

rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))
Y = rng.random_sample((3, 4))
Xcsr = csr_matrix(X)
Ycsr = csr_matrix(Y)

for X_, Y_ in ((X, None), (X, Y),
(Xcsr, None), (Xcsr, Ycsr)):
# Test that the cosine is kernel is equal to a linear kernel when data
# has been previously normalized by L2-norm.
K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
X_ = normalize(X_)
if Y_ is not None:
Y_ = normalize(Y_)
K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
assert_array_almost_equal(K1, K2) ```
Example #14
```python
def evaluate(self, category_projection):
    # Accumulates, over every topic returned by get_nearest_terms(), a
    # similarity figure derived from the pairwise cosine similarities of
    # the topic's term vectors.
    assert issubclass(type(category_projection), CategoryProjectionBase)
    topics = category_projection.get_nearest_terms()
    total_similarity = 0
    for topic in topics.values():
        topic_vectors = np.array([self.get_vector(term) for term in topic])
        sim_matrix = cosine_similarity(topic_vectors)
        # np.tril keeps the lower triangle *including* the main diagonal.
        tril_sim_matrix = np.tril(sim_matrix)
        # NOTE(review): this evaluates as sum / (n**2 - n) / 2, i.e.
        # sum / (2 * (n**2 - n)), and the divisor is zero for a one-term
        # topic.  It looks like a mean over distinct pairs was intended --
        # confirm before relying on the absolute value.
        mean_similarity = tril_sim_matrix.sum()/(tril_sim_matrix.shape[0] ** 2 - tril_sim_matrix.shape[0]) / 2
        total_similarity += mean_similarity
    # NOTE(review): the excerpt ends here; whatever the original function
    # does with total_similarity (e.g. a return) is not shown.
```
Example #15
```def process(self,data):
claim_bow = self.bow_vectorizer.transform(self.claims(data))
claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

body_texts = self.texts(data)
body_bow = self.bow_vectorizer.transform(body_texts)
body_tfs = self.tfreq_vectorizer.transform(body_bow)
body_tfidf = self.tfidf_vectorizer.transform(body_texts)

cosines = np.array([cosine_similarity(c, b)[0] for c,b in zip(claim_tfidf,body_tfidf)])

return hstack([body_tfs,claim_tfs,cosines]) ```
Example #16
```def process(self, data):
claim_bow = self.bow_vectorizer.transform(self.claims(data))
claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

body_texts = self.texts(data)
body_bow = self.bow_vectorizer.transform(body_texts)
body_tfs = self.tfreq_vectorizer.transform(body_bow)
body_tfidf = self.tfidf_vectorizer.transform(body_texts)

cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])

return cosines ```
Example #17
```def process(self, data):
claim_bow = self.bow_vectorizer.transform(self.claims(data))
claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

body_texts = self.texts(data)
body_bow = self.bow_vectorizer.transform(body_texts)
body_tfs = self.tfreq_vectorizer.transform(body_bow)
body_tfidf = self.tfidf_vectorizer.transform(body_texts)

cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])

return cosines ```
Example #18
```def closest_distances(query_vector,all_feat_vecs,norm='euclidian',num=3):
if norm=='euclidian':
dist = np.linalg.norm(query_vector-all_feat_vecs,axis=1)
if norm =='cosine':
dist = 1 - cosine_similarity(query_vector.reshape(1, -1),all_feat_vecs)[0]
idxs = np.arange(0,dist.shape[0])
return idxs[dist.argsort()][:num] ```
Example #19
```python
def read(self, P, Q):
    # Embeds every element of `A` and the query Q, then returns the element
    # of `A` whose embedding has the highest cosine similarity to Q.
    A_embed = []
    # NOTE(review): `A` is neither a parameter nor assigned in the visible
    # code, and `P` is never used -- presumably `A` is derived from `P` in
    # a line that is missing here; confirm against the original project.
    for a in A:
        A_embed.append(self.embedder.embed(a))
    Q_embed = self.embedder.embed(Q)
    # One similarity row per element of A, each holding a single value.
    similarities_raw = cosine_similarity(A_embed, Q_embed.reshape(1, -1))
    similarities = [s[0] for s in similarities_raw]
    indices_sorted = np.argsort(similarities)[::-1]  # reverse order
    return A[indices_sorted[0]]
```
Example #20
```def read(self, P , Q):
Q = self.stem_string(Q)
query_tfidf = self.vectorizer.transform([Q])
similarities_raw = cosine_similarity(self.tfidf_matrix, query_tfidf)
similarities = []
for s in similarities_raw:
similarities.append(s[0])
max_index = np.argmax(similarities)
return self.docs[max_index] ```
Example #21
```python
def get_topk_docs(self, query):
    """
    :param query: a string
    :return: top documents according to cosine similarity of embeddings
    """
    emb_query = self.embed_string(query)
    # One similarity row per row of self.emb_matrix, each with one value.
    similarities_raw = cosine_similarity(self.emb_matrix, emb_query.reshape(1,-1))
    similarities = [s[0] for s in similarities_raw]
    indices_sorted = np.argsort(similarities)[::-1]  # reverse order
    # Collect the self.k documents with the highest similarity.
    topk_docs = []
    for i in range(0, self.k):
        topk_docs.append(self.docs[indices_sorted[i]])
    # NOTE(review): the excerpt ends here; the statement that hands
    # topk_docs back to the caller is not shown.
```
Example #22
```def _cosine_sim(vec1, vec2):
try:
s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
except:
try:
s = cosine_similarity(vec1, vec2)[0][0]
except:
s = config.MISSING_VALUE_NUMERIC
return s ```
Example #23
```python
def vec_cos_sim(token_input, operation_input):
    # Splits operation_input into a reference vector, a comparison operator
    # and a threshold, then compares the cosine similarity of the token
    # vector and the reference vector with that threshold.
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    # NOTE(review): `opr_sign` is never bound and `break` has no enclosing
    # loop in the visible code -- presumably a loop over the candidate
    # operator strings (with a membership test) is missing here; confirm
    # against the original project.
    ref_vector_string = operation_input.split(opr_sign)[0]
    operation_string = opr_sign
    cond_value_string = operation_input.split(opr_sign)[1]
    break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print ('len of vectors does not match')
                return False
            # Dispatch on the operator that was found.
            if operation_string == "=" or operation_string == "==":
                return cosine_similarity(token_vector, ref_vector) == cond_value
            elif operation_string == "<":
                return cosine_similarity(token_vector, ref_vector) < cond_value
            elif operation_string == ">":
                return cosine_similarity(token_vector, ref_vector) > cond_value
            elif operation_string == ">=":
                return cosine_similarity(token_vector, ref_vector) >= cond_value
            elif operation_string == "<=":
                return cosine_similarity(token_vector, ref_vector) <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return cosine_similarity(token_vector, ref_vector) != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False

    else:
        # TODO raise tokenregex error
        # This branch only prints, so the function returns None here.
        print ('Problem with the operation input')
```
Example #24
```def cosineSimilarity(self):
vec = TfidfVectorizer()
matrix = vec.fit_transform(self.statements)
for j in range(1, 5):
i = j - 1
print("\tsimilarity of document {} with others".format(i))
similarity = cosine_similarity(matrix[i:j], matrix)
print(similarity) ```
Example #25
```def cosine(x1,x2):
#find common ratings
new_x1, new_x2 = common(x1,x2)
#compute the cosine similarity between two vectors
sum = new_x1.dot(new_x2)
denom = sqrt(new_x1.dot(new_x1)*new_x2.dot(new_x2))
try:
return float(sum)/denom
except ZeroDivisionError:
return 0

#return cosine_similarity(x1,x2)[0][0] ```
Example #26
```def s_norm(self, test, enroll):
""" Run speaker normalization (S-Norm) on cached embeddings.

Args:
test (np.array): test embedding
enroll (np.array): enroll embedding

Returns:
float: hypothesis
"""
if self.plda:
a = self.plda.score(test, self.embeddings).T
b = self.plda.score(enroll, self.embeddings).T
c = self.plda.score(enroll, test).T
else:
a = cosine_similarity(test, self.embeddings).T
b = cosine_similarity(enroll, self.embeddings).T
c = cosine_similarity(enroll, test).T
scores = []
for ii in range(test.shape[0]):
test_scores = []
for jj in range(enroll.shape[0]):
test_mean, test_std = np.mean(a.T[ii]), np.std(a.T[ii])
enroll_mean, enroll_std = np.mean(b.T[jj]), np.std(b.T[jj])
s = c[ii][jj]
test_scores.append((((s - test_mean) / test_std + (s - enroll_mean) / enroll_std) / 2))
scores.append(test_scores)
return np.array(scores) ```
Example #27
```def predict(self, users_ids, sessions_items, topk=5, valid_items=None):
acr_embeddings = self.eval_benchmark_params['content_article_embeddings_matrix']

recent_items_buffer = self.clicked_items_state.get_recent_clicks_buffer()
if valid_items is None:
recent_unique_item_ids = np.unique([recent_items_buffer[np.nonzero(recent_items_buffer)]])
else:
recent_unique_item_ids = np.unique(valid_items)

acr_embeddings_recent_items = acr_embeddings[recent_unique_item_ids]

session_predictions = np.zeros(dtype=np.int64,
shape=[sessions_items.shape[0],
sessions_items.shape[1],
topk])

for row_idx, session_items in enumerate(sessions_items):

for col_idx, item in enumerate(session_items):
if item != 0:

#Computing cosine similarity between this item and all recent items (from buffer)
#P.s. Do not need to ignore the current item (whose similarity is always, because this item will not be among the valid items (next click + negative samples not present in the session))
similarities = cosine_similarity(acr_embeddings[item].reshape(1, -1),
acr_embeddings_recent_items)[0]
similar_items_sorted_idx = np.argsort(similarities, axis=0)[::-1]
similar_items_ids = recent_unique_item_ids[similar_items_sorted_idx]

session_predictions[row_idx, col_idx] = list(self._get_top_n_valid_items(similar_items_ids, topk, valid_items[row_idx, col_idx]))

return session_predictions ```
Example #28
```def visualize_between_sentences(sentences, vec_list, palette="Viridis256",
filename="/notebooks/embedding/between-sentences.png",
use_notebook=False):
df_list, score_list = [], []
for sent1_idx, sentence1 in enumerate(sentences):
for sent2_idx, sentence2 in enumerate(sentences):
vec1, vec2 = vec_list[sent1_idx], vec_list[sent2_idx]
if np.any(vec1) and np.any(vec2):
score = cosine_similarity(X=[vec1], Y=[vec2])
df_list.append({'x': sentence1, 'y': sentence2, 'similarity': score[0][0]})
score_list.append(score[0][0])
df = pd.DataFrame(df_list)
color_mapper = LinearColorMapper(palette=palette, low=np.max(score_list), high=np.min(score_list))
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(x_range=sentences, y_range=list(reversed(sentences)),
x_axis_location="above", plot_width=900, plot_height=900,
toolbar_location='below', tools=TOOLS,
tooltips=[('sentences', '@x @y'), ('similarity', '@similarity')])
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = 3.14 / 3
p.rect(x="x", y="y", width=1, height=1,
source=df,
fill_color={'field': 'similarity', 'transform': color_mapper},
line_color=None)
color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
color_mapper=color_mapper, major_label_text_font_size="7pt",
label_standoff=6, border_line_color=None, location=(0, 0))
if use_notebook:
output_notebook()
show(p)
else:
export_png(p, filename)
print("save @ " + filename) ```
Example #29
```def visualize_between_words(words, vecs, palette="Viridis256", filename="/notebooks/embedding/between-words.png",
use_notebook=False):
df_list = []
for word1_idx, word1 in enumerate(words):
for word2_idx, word2 in enumerate(words):
vec1 = vecs[word1_idx]
vec2 = vecs[word2_idx]
if np.any(vec1) and np.any(vec2):
score = cosine_similarity(X=[vec1], Y=[vec2])
df_list.append({'x': word1, 'y': word2, 'similarity': score[0][0]})
df = pd.DataFrame(df_list)
color_mapper = LinearColorMapper(palette=palette, low=1, high=0)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(x_range=list(words), y_range=list(reversed(list(words))),
x_axis_location="above", plot_width=900, plot_height=900,
toolbar_location='below', tools=TOOLS,
tooltips=[('words', '@x @y'), ('similarity', '@similarity')])
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = 3.14 / 3
p.rect(x="x", y="y", width=1, height=1,
source=df,
fill_color={'field': 'similarity', 'transform': color_mapper},
line_color=None)
color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
color_mapper=color_mapper, major_label_text_font_size="7pt",
label_standoff=6, border_line_color=None, location=(0, 0))
if use_notebook:
output_notebook()
show(p)
else:
export_png(p, filename)
print("save @ " + filename) ```
Example #30
```def __call__(self, text):
if self.tfrecords_mode == 'point':
assert text.find('||') != -1,"input should cotain two sentences seperated by ||"
text_a = text.split('||')[0]
text_b = text.split('||')[-1]
pred,score = self._get_label([text_a], [text_b], need_preprocess = True)
return pred[0][0], score[0][0]

#加载自定义问句(自定义优先)
if self.sim_mode == 'cross':
text_list = self.text_list
label_list = self.label_list
if self.zdy != {}:
text_list = self.zdy['text_list'] + text_list
label_list = self.zdy['label_list'] + label_list
pred,score = self._get_label([text], self.text_list, need_preprocess = True)
selected_id = np.argmax(score)
out_score = score[selected_id]
elif self.sim_mode == 'represent':
text_list = self.text_list
vec_list = self.vec_list
label_list = self.label_list
if self.zdy != {}:
text_list = self.zdy['text_list'] + text_list
vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis = 0)
label_list = self.zdy['label_list'] + label_list
vec = self._get_vecs([text], need_preprocess = True)
if self.is_distance:
scores = euclidean_distances(vec, vec_list)[0]
selected_id = np.argmin(scores)
out_score = 1 - scores[selected_id]
else:
scores = cosine_similarity(vec, vec_list)[0]
selected_id = np.argmax(scores)
out_score = scores[selected_id]
else:
raise ValueError('unknown sim mode, represent or cross?')
ret = (label_list[selected_id], out_score, selected_id, \
self.text_list[selected_id])
return ret ```