# Python sklearn.metrics.pairwise.cosine_similarity() Examples

Example #1
def cos_sim(ind1, ind2=1999):
    """Print a mean-average-precision style score for ``view1`` against ``view2``.

    Every row of the module-level ``view1`` is compared with every row of the
    module-level ``view2`` via ``cosine_similarity``.  For each query row the
    positions of the seven most similar ``view2`` rows are kept; every kept
    position that lies in ``[i, i + 5)`` (``i`` being the query's own position)
    adds ``1 / rank`` to that query's score.  The ranking and score of each
    query are printed, followed by the accumulated score divided by ``ind1``.

    :param ind1: divisor applied to the accumulated score before printing.
    :param ind2: not referenced by this function.
    """
    total_score = 0
    for query_idx, query in enumerate(view1):
        similarities = [cosine_similarity(query, candidate)[0].tolist()
                        for candidate in view2]
        # Pair every similarity with its position so that the position
        # survives the descending sort.
        ranked = sorted(((sim, pos) for pos, sim in enumerate(similarities)),
                        reverse=True)
        top_positions = [pos for _, pos in ranked[:7]]

        query_score = 0
        for rank, pos in enumerate(top_positions, start=1):
            if query_idx <= pos < query_idx + 5:
                query_score += 1 / rank
        print(top_positions)
        print(query_score)
        total_score += query_score
    print('MAP is : ', total_score / ind1)
Example #2
def test_cosine_similarity():
    """Check the cosine kernel against a linear kernel on L2-normalised data."""
    rng = np.random.RandomState(0)
    dense_x = rng.random_sample((5, 4))
    dense_y = rng.random_sample((3, 4))
    sparse_x = csr_matrix(dense_x)
    sparse_y = csr_matrix(dense_y)

    # Dense and sparse inputs, each with and without an explicit Y.
    input_pairs = [
        (dense_x, None),
        (dense_x, dense_y),
        (sparse_x, None),
        (sparse_x, sparse_y),
    ]
    for left, right in input_pairs:
        # The cosine kernel must equal the linear kernel computed on data
        # that has previously been normalised with the L2 norm.
        cosine_kernel = pairwise_kernels(left, Y=right, metric="cosine")
        left = normalize(left)
        if right is not None:
            right = normalize(right)
        linear_kernel = pairwise_kernels(left, Y=right, metric="linear")
        assert_array_almost_equal(cosine_kernel, linear_kernel)
Example #3
def transform(self, X: dt.Frame):
    """Score every row by the cosine similarity of its first two text columns.

    ``X.replace`` is first called to substitute ``None`` and +/-infinity with
    ``self._repl_val``.  A word embedding is then selected from
    ``self.embedding_name`` ("glove"/"en" -> ``WordEmbeddings``, "bert" ->
    ``BertEmbeddings``) and wrapped in a ``DocumentPoolEmbeddings``.  Both
    texts of a row are lower-cased, embedded, and compared.

    :param X: frame whose first two columns hold the texts to compare.
    :return: ``np.array`` with one score per row; ``-99`` marks rows for
        which embedding or scoring raised an exception.
    """
    X.replace([None, math.inf, -math.inf], self._repl_val)
    from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
    if self.embedding_name in ["glove", "en"]:
        self.embedding = WordEmbeddings(self.embedding_name)
    elif self.embedding_name in ["bert"]:
        self.embedding = BertEmbeddings()
    self.doc_embedding = DocumentPoolEmbeddings([self.embedding])

    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values
    for raw_text1, raw_text2 in zip(text1_arr, text2_arr):
        try:
            text1 = Sentence(str(raw_text1).lower())
            self.doc_embedding.embed(text1)
            text2 = Sentence(str(raw_text2).lower())
            self.doc_embedding.embed(text2)
            score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                      text2.get_embedding().reshape(1, -1))[0, 0]
            output.append(score)
        except Exception:
            # Best effort: a row that cannot be scored gets the sentinel
            # value.  ``Exception`` (rather than a bare ``except``) keeps
            # KeyboardInterrupt and SystemExit propagating.
            output.append(-99)
    return np.array(output)
Example #4
def compared(request):
    """Compare two uploaded images and answer with a JSON-formatted string.

    A POST request carrying exactly two entries in ``request.FILES`` yields a
    success payload holding the cosine similarity of the two feature vectors
    and the elapsed time; any other request yields a failure payload.

    NOTE(review): nothing in this block writes the uploaded files to the
    generated paths before ``get_feature`` reads them - confirm that the save
    step exists elsewhere.
    """
    if request.method == 'POST':
        if len(request.FILES) != 2:
            return HttpResponse('{"status":false,"data":"","msg":"图片参数错误！"}')
        started_at = time.time()
        # Random name: five-digit random number followed by a timestamp.
        first_name = str(random.randint(10000, 99999)) + str(time.time())
        second_name = str(random.randint(10000, 99999)) + str(time.time())

        first_features = get_feature(root + "RestServer/upload/" + str(first_name))
        second_features = get_feature(root + "RestServer/upload/" + str(second_name))

        similarity = pw.cosine_similarity(first_features, second_features)[0][0]

        elapsed = time.time() - started_at
        payload = ('{"status":true,"data":"' + str(similarity)
                   + '","msg":"成功","runtime": ' + str(elapsed) + '  }')
        return HttpResponse(payload)
    else:
        return HttpResponse('{"status":false,"data":"","msg":"请求不合法"}')
    # Unreachable: both branches above return.
    return HttpResponse('{"status":false,"data":"","msg":"未知错误"}')
Example #5
def get_closest_docs(uri):
    """Return the five stored documents most similar to the document at ``uri``.

    The document is fetched with ``requests.get``, normalised, turned into a
    vector with ``model.infer_vector`` and compared by cosine similarity with
    every vector in the module-level ``vectors`` mapping.

    :param uri: address of the document to fetch.
    :return: list of ``(url, similarity)`` tuples, most similar first, with
        the similarity rounded to two decimals.
    :raises ValueError: if the response status code is not 200.
    """
    r = requests.get(uri)
    if r.status_code != 200:
        print("URL returned status code", r.status_code)
        raise ValueError('URL error')

    user_doc = r.text
    print("URI content length", len(user_doc))
    # Fix: normalise the document that was just fetched.  The original passed
    # ``code``, a name that is never bound in this function.
    normalized_code = normalize_text(user_doc, remove_stop_words=False, only_letters=False, return_list=True)
    # Re-seed before inference - presumably so that repeated calls infer the
    # same vector for the same text.
    model.random.seed(0)
    user_vector = model.infer_vector(normalized_code)
    print("finding similar...")
    sys.stdout.flush()

    stored_urls = list(vectors)
    stored_vectors = [vectors[url] for url in stored_urls]
    pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
    # Negating the similarities makes argsort rank from most to least similar.
    indices = (-pair_sims[0]).argsort()[:5]
    return [(stored_urls[index], round(float(pair_sims[0][index]), 2)) for index in indices]
Example #6
def clustering(self, threshold):
    """Cluster words separately for each word type.

    For every entry of ``self.type_entity_dict`` the known words are mapped
    to their ids, their pairwise cosine similarities are computed (with the
    identity matrix subtracted), and ``community.best_partition`` supplies
    the clusters.  Cluster ids are offset so that ids coming from different
    word types never collide.

    :param threshold: not referenced anywhere in this block.
    :return: partition: dict {word_id: cluster_id}
    """
    print("Louvain clustering")
    partition = {}
    part_offset = 0
    for etype, ners in self.type_entity_dict.items():
        sub_id_mapping = [self.word2id[ner0] for ner0 in ners if ner0 in self.word2id]
        if not sub_id_mapping:
            continue
        emb_mat_sub = self.emb_mat[sub_id_mapping, :]
        cos_sims = cosine_similarity(emb_mat_sub)
        cos_sims -= np.eye(len(emb_mat_sub))
        # NOTE(review): ``G`` is not defined in this block and ``cos_sims`` /
        # ``threshold`` are never used afterwards - the statement that builds
        # the graph appears to be missing; confirm against the full source.
        partition_sub = community.best_partition(G)
        partition.update({
            main_id: partition_sub[sub_id] + part_offset
            for sub_id, main_id in enumerate(sub_id_mapping)
        })
        part_offset += max(partition_sub.values()) + 1
    return partition
Example #7
def cosine_sim(x, y):
    """Return the cosine similarity of ``x`` and ``y`` as a scalar.

    Plain ``np.ndarray`` inputs are reshaped to a single row first.  If
    anything goes wrong, both inputs are printed and ``0.`` is returned.

    :param x: first vector.
    :param y: second vector.
    :return: entry ``[0][0]`` of the similarity matrix, or ``0.`` on failure.
    """
    try:
        # Exact-type check kept on purpose: ndarray subclasses are passed
        # through without being reshaped, as before.
        if type(x) is np.ndarray:
            x = x.reshape(1, -1)  # get rid of the warning
        if type(y) is np.ndarray:
            y = y.reshape(1, -1)
        d = cosine_similarity(x, y)[0][0]
    except Exception:
        # Function-call form of print works on Python 2 and 3; catching
        # Exception lets KeyboardInterrupt and SystemExit through.
        print(x)
        print(y)
        d = 0.
    return d


#   Copyright 2017 Cisco Systems, Inc.
#
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#
#   Unless required by applicable law or agreed to in writing, software
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
Example #8
def _get_similarity_values(self, q1_csc, q2_csc):
    """Compute five pairwise measures for each aligned pair of rows.

    The two inputs are walked in lockstep.  ``cs``, ``md`` and ``ed`` receive
    the rows as given; ``jsc`` and ``minkowski_dis.pairwise`` receive their
    dense ``toarray()`` form.

    :param q1_csc: iterable of rows that support ``toarray()``.
    :param q2_csc: iterable of rows aligned with ``q1_csc``.
    :return: tuple of five lists ``(cosine_sim, manhattan_dis,
        eucledian_dis, jaccard_dis, minkowsk_dis)`` with one entry per pair.
    """
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []

    for row1, row2 in zip(q1_csc, q2_csc):
        cosine_sim.append(cs(row1, row2)[0][0])
        manhattan_dis.append(md(row1, row2)[0][0])
        eucledian_dis.append(ed(row1, row2)[0][0])

        dense1 = row1.toarray()
        dense2 = row2.toarray()
        try:
            jaccard_dis.append(jsc(dense1, dense2))
        except Exception:
            # Best effort: a pair ``jsc`` cannot score counts as 0.
            # ``Exception`` keeps KeyboardInterrupt/SystemExit propagating.
            jaccard_dis.append(0)

        minkowsk_dis.append(minkowski_dis.pairwise(dense1, dense2)[0][0])
    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
Example #9
def _compute_sim(self, R, k):
    """Return the cosine similarities of the rows of ``R.T``, top ``k`` per row.

    Every entry that is not among the ``k`` largest of its row is set to
    zero.

    :param R: matrix whose transpose is passed to ``cosine_similarity``.
    :param k: number of similarities to keep in each row.
    :return: the similarity matrix with all other entries zeroed.
    """
    sim = cosine_similarity(R.T)

    # Sorting the negated matrix ranks each row from most to least similar,
    # so the columns from rank k onwards are the ones to discard.
    # shape=(n_rows, n_cols - k), or zero columns when k >= n_cols
    discarded_cols = np.argsort(-sim, axis=1)[:, k:]
    n_rows, n_discarded = discarded_cols.shape

    if n_discarded:  # nothing to clear when every column is kept
        # Broadcasting a column of row numbers against the discarded column
        # indices addresses every (row, column) cell to clear.
        sim[np.arange(n_rows)[:, None], discarded_cols] = 0.

    return sim
Example #10
def decision_function(self, X):
    """Evaluate the cosine similarity between document-term matrix and X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_timestamps)
        Test samples.

    Returns
    -------
    X : array-like, shape (n_samples, n_classes)
        Cosine similarity between the document-term matrix and X.

    """
    fitted_attributes = ['vocabulary_', 'tfidf_', 'idf_', '_tfidf', 'classes_']
    check_is_fitted(self, fitted_attributes)
    X = check_array(X)
    bag_of_words = self._bow.transform(X)
    # Count terms with the vocabulary of the fitted tf-idf transformer.
    counter = CountVectorizer(vocabulary=self._tfidf.vocabulary_)
    term_counts = counter.transform(bag_of_words).toarray()
    return cosine_similarity(term_counts, self.tfidf_)
Example #11
def test_init():
    """Check Spanning_Forest defaults, explicit overrides and affinity input."""
    # Defaults.
    default = Spanning_Forest()
    assert default.metric == skm.manhattan_distances
    assert default.center == np.mean
    assert default.reduction == np.sum

    # Explicit overrides are stored unchanged.
    custom = Spanning_Forest(dissimilarity=skm.euclidean_distances,
                             center=np.median, reduction=np.max)
    assert custom.metric == skm.euclidean_distances
    assert custom.center == np.median
    assert custom.reduction == np.max

    # An affinity is wrapped in a lambda whose output matches -log(affinity).
    from_affinity = Spanning_Forest(affinity=skm.cosine_similarity)
    assert isinstance(from_affinity.metric, types.LambdaType)
    subset = data[:2,]
    expected = -np.log(skm.cosine_similarity(subset))
    actual = from_affinity.metric(subset)
    np.testing.assert_allclose(expected, actual)
Example #12
def save_model(model: Model, tokenizer: Tokenizer):
    """
    Saves the important parts of the model

    The first weight array of every layer whose name contains ``_biases`` or
    ``_embeddings`` is saved with ``np.save``; both tokenizer lookup tables
    are pickled; finally the aggregated embeddings and a matrix derived from
    them are saved.

    :param model: Keras model to save
    :param tokenizer: Keras Tokenizer to save
    """
    for layer in model.layers:
        if '_biases' in layer.name or '_embeddings' in layer.name:
            np.save(file=f'{OUTPUT_FOLDER}{layer.name}', arr=layer.get_weights()[0])

    # save tokenizer - the context managers make sure each file is flushed
    # and closed (the original left both handles open)
    with open(f'{OUTPUT_FOLDER}{INDEX2WORD}', 'wb') as index2word_file:
        pickle.dump(obj=tokenizer.index_word, file=index2word_file)
    with open(f'{OUTPUT_FOLDER}{WORD2INDEX}', 'wb') as word2index_file:
        pickle.dump(obj=tokenizer.word_index, file=word2index_file)

    # save combined embeddings & correlation matrix
    # NOTE(review): ``agg_embeddings`` is not assigned anywhere in this
    # block - confirm where it comes from.
    np.save(file=f'{OUTPUT_FOLDER}{AGGREGATED_EMBEDDINGS}', arr=agg_embeddings)
    # NOTE(review): cosine_similarity is applied twice here - confirm that
    # a similarity of similarities is really intended.
    np.save(file=f'{OUTPUT_FOLDER}{CORRELATION_MATRIX}', arr=cosine_similarity(cosine_similarity(agg_embeddings)))
Example #13
def test_cosine_similarity():
    """Check the cosine kernel against a linear kernel on L2-normalised data."""
    rng = np.random.RandomState(0)
    dense_x = rng.random_sample((5, 4))
    dense_y = rng.random_sample((3, 4))
    sparse_x = csr_matrix(dense_x)
    sparse_y = csr_matrix(dense_y)

    # Dense and sparse inputs, each with and without an explicit Y.
    input_pairs = [
        (dense_x, None),
        (dense_x, dense_y),
        (sparse_x, None),
        (sparse_x, sparse_y),
    ]
    for left, right in input_pairs:
        # The cosine kernel must equal the linear kernel computed on data
        # that has previously been normalised with the L2 norm.
        cosine_kernel = pairwise_kernels(left, Y=right, metric="cosine")
        left = normalize(left)
        if right is not None:
            right = normalize(right)
        linear_kernel = pairwise_kernels(left, Y=right, metric="linear")
        assert_array_almost_equal(cosine_kernel, linear_kernel)
Example #14
def evaluate(self, category_projection):
    """Accumulate a term-similarity score over the topics of a projection.

    For each topic returned by ``category_projection.get_nearest_terms()``
    the vectors of its terms are compared pairwise by cosine similarity, and
    a score derived from the lower triangle of that matrix is added to a
    running total.

    NOTE(review): the visible block ends without returning the total - the
    final statement(s) appear to be missing from this excerpt.

    :param category_projection: instance of a ``CategoryProjectionBase``
        subclass.
    """
    assert issubclass(type(category_projection), CategoryProjectionBase)
    topics = category_projection.get_nearest_terms()
    total_similarity = 0
    for topic in topics.values():
        topic_vectors = np.array([self.get_vector(term) for term in topic])
        # np.tril keeps the main diagonal as well as the entries below it.
        lower_triangle = np.tril(cosine_similarity(topic_vectors))
        n_terms = lower_triangle.shape[0]
        # Evaluated left to right: sum / (n^2 - n), and that quotient / 2.
        mean_similarity = lower_triangle.sum() / (n_terms ** 2 - n_terms) / 2
        total_similarity += mean_similarity
Example #15
def process(self,data):
    """Build term-frequency and claim/body cosine features for ``data``.

    :param data: passed to ``self.claims`` and ``self.texts`` to obtain the
        claim and body texts.
    :return: ``hstack`` of the body term frequencies, the claim term
        frequencies and the per-pair tf-idf cosine similarities.
    """
    # Claim side.
    claim_counts = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_counts)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    # Body side.
    bodies = self.texts(data)
    body_counts = self.bow_vectorizer.transform(bodies)
    body_tfs = self.tfreq_vectorizer.transform(body_counts)
    body_tfidf = self.tfidf_vectorizer.transform(bodies)

    # One similarity row per aligned (claim, body) pair.
    pair_similarities = []
    for claim_row, body_row in zip(claim_tfidf, body_tfidf):
        pair_similarities.append(cosine_similarity(claim_row, body_row)[0])
    cosines = np.array(pair_similarities)

    return hstack([body_tfs, claim_tfs, cosines])
Example #16
def process(self, data):
    """Return the tf-idf cosine similarity of every claim/body pair.

    :param data: passed to ``self.claims`` and ``self.texts`` to obtain the
        claim and body texts.
    :return: ``np.array`` holding one similarity row per aligned pair.
    """
    # Claim side.  The term-frequency transform is still executed although
    # its result does not contribute to the returned value.
    claim_counts = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_counts)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    # Body side, same remark for the term-frequency transform.
    bodies = self.texts(data)
    body_counts = self.bow_vectorizer.transform(bodies)
    body_tfs = self.tfreq_vectorizer.transform(body_counts)
    body_tfidf = self.tfidf_vectorizer.transform(bodies)

    pair_similarities = []
    for claim_row, body_row in zip(claim_tfidf, body_tfidf):
        pair_similarities.append(cosine_similarity(claim_row, body_row)[0])
    return np.array(pair_similarities)
Example #17
def process(self, data):
    """Return the tf-idf cosine similarity of every claim/body pair.

    :param data: passed to ``self.claims`` and ``self.texts`` to obtain the
        claim and body texts.
    :return: ``np.array`` holding one similarity row per aligned pair.
    """
    # Claim side.  The term-frequency transform is still executed although
    # its result does not contribute to the returned value.
    claim_counts = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_counts)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    # Body side, same remark for the term-frequency transform.
    bodies = self.texts(data)
    body_counts = self.bow_vectorizer.transform(bodies)
    body_tfs = self.tfreq_vectorizer.transform(body_counts)
    body_tfidf = self.tfidf_vectorizer.transform(bodies)

    pair_similarities = []
    for claim_row, body_row in zip(claim_tfidf, body_tfidf):
        pair_similarities.append(cosine_similarity(claim_row, body_row)[0])
    return np.array(pair_similarities)
Example #18
def closest_distances(query_vector,all_feat_vecs,norm='euclidian',num=3):
    """Return the indices of the ``num`` feature vectors nearest to the query.

    :param query_vector: 1-D query vector.
    :param all_feat_vecs: 2-D array with one candidate vector per row.
    :param norm: ``'euclidian'`` (the correctly spelled ``'euclidean'`` is
        accepted as well) or ``'cosine'``.
    :param num: number of indices to return.
    :return: array of row indices into ``all_feat_vecs``, nearest first.
    :raises ValueError: if ``norm`` names no supported distance (previously
        this surfaced as an UnboundLocalError on ``dist``).
    """
    if norm in ('euclidian', 'euclidean'):
        dist = np.linalg.norm(query_vector - all_feat_vecs, axis=1)
    elif norm == 'cosine':
        # Cosine distance is one minus cosine similarity.
        dist = 1 - cosine_similarity(query_vector.reshape(1, -1), all_feat_vecs)[0]
    else:
        raise ValueError(
            "unknown norm %r, expected 'euclidian' or 'cosine'" % (norm,))
    # argsort already yields row indices ordered by increasing distance.
    return np.argsort(dist)[:num]
Example #19
def read(self, P, Q):
    """Return the candidate whose embedding is most similar to that of ``Q``.

    NOTE(review): the candidates are read from ``A``, which is neither a
    parameter nor assigned in this block, while ``P`` is never used - the
    statement deriving ``A`` appears to be missing; confirm against the full
    source.

    :param P: not referenced in this block.
    :param Q: question passed to ``self.embedder.embed``.
    :return: the element of ``A`` with the highest cosine similarity to ``Q``.
    """
    candidate_embeddings = [self.embedder.embed(a) for a in A]
    question_embedding = self.embedder.embed(Q)
    similarities_raw = cosine_similarity(candidate_embeddings,
                                         question_embedding.reshape(1, -1))
    # Each row holds exactly one value: the similarity to the question.
    similarities = [row[0] for row in similarities_raw]
    best_first = np.argsort(similarities)[::-1]  # descending order
    return A[best_first[0]]
Example #20
def read(self, P , Q):
    """Return the stored document most similar to the question ``Q``.

    The question is stemmed, vectorised with ``self.vectorizer`` and compared
    by cosine similarity with every row of ``self.tfidf_matrix``.

    :param P: not referenced in this block.
    :param Q: question string.
    :return: the entry of ``self.docs`` with the highest similarity.
    """
    stemmed_question = self.stem_string(Q)
    query_tfidf = self.vectorizer.transform([stemmed_question])
    similarities_raw = cosine_similarity(self.tfidf_matrix, query_tfidf)
    # Each row holds exactly one value: the similarity to the query.
    similarities = [row[0] for row in similarities_raw]
    return self.docs[np.argmax(similarities)]
Example #21
def get_topk_docs(self, query):
    """
    Rank the stored documents against a query and return the best ``self.k``.

    :param query: a string
    :return: top documents according to cosine similarity of embeddings
        (a list of at most ``self.k`` entries of ``self.docs``, best first)
    """
    emb_query = self.embed_string(query)
    similarities_raw = cosine_similarity(self.emb_matrix, emb_query.reshape(1,-1))
    # Each row holds exactly one value: the similarity to the query.
    similarities = [s[0] for s in similarities_raw]
    indices_sorted = np.argsort(similarities)[::-1]  # reverse order
    # Slicing (rather than indexing 0..k-1) also copes with fewer than k docs.
    topk_docs = [self.docs[index] for index in indices_sorted[:self.k]]
    # Fix: the documented result was built but never returned.
    return topk_docs
Example #22
def _cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors, or a missing-value marker.

    The vectors are first reshaped to single rows; if that attempt raises,
    they are passed to ``cosine_similarity`` as given; if that raises too,
    ``config.MISSING_VALUE_NUMERIC`` is returned.

    :param vec1: first vector.
    :param vec2: second vector.
    :return: entry ``[0][0]`` of the similarity matrix, or
        ``config.MISSING_VALUE_NUMERIC``.
    """
    try:
        s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    except Exception:
        # ``Exception`` instead of a bare ``except`` keeps KeyboardInterrupt
        # and SystemExit propagating while preserving the best-effort chain.
        try:
            s = cosine_similarity(vec1, vec2)[0][0]
        except Exception:
            s = config.MISSING_VALUE_NUMERIC
    return s
Example #23
def vec_cos_sim(token_input, operation_input):
    """Compare the cosine similarity of two vectors against a condition.

    ``operation_input`` has the form ``<reference vector><operator><value>``.
    It is split at the operator; the reference vector and ``token_input`` are
    converted with ``change_string_to_vector`` and their cosine similarity is
    compared with the numeric value using that operator.

    :param token_input: string form of the token vector.
    :param operation_input: reference vector, comparison operator and
        threshold in a single string.
    :return: result of the comparison; ``False`` when the threshold is not a
        number or the vector lengths differ; ``None`` (after printing a
        message) when ``operation_input`` cannot be split into three
        non-empty parts.
    """
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    # NOTE(review): the loop header binding ``opr_sign`` was missing from this
    # block (only its body and a stray ``break`` remained).  It is
    # reconstructed from the operators handled below; two-character operators
    # come first so that e.g. "<=" is not split at "<".  Confirm against the
    # upstream source.
    for opr_sign in ("==", "!=", "<>", ">=", "<=", "=", "<", ">"):
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print ('len of vectors does not match')
                return False
            similarity = cosine_similarity(token_vector, ref_vector)
            if operation_string == "=" or operation_string == "==":
                return similarity == cond_value
            elif operation_string == "<":
                return similarity < cond_value
            elif operation_string == ">":
                return similarity > cond_value
            elif operation_string == ">=":
                return similarity >= cond_value
            elif operation_string == "<=":
                return similarity <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return similarity != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False

    else:
        # TODO raise tokenregex error
        print ('Problem with the operation input')
Example #24
def cosineSimilarity(self):
    """Print the tf-idf cosine similarity of the first four statements.

    ``self.statements`` is vectorised with a fresh ``TfidfVectorizer``; for
    each of the documents 0 to 3 the similarity of that document to every
    document is printed.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(self.statements)
    for doc_idx in range(4):
        print("\tsimilarity of document {} with others".format(doc_idx))
        # A one-row slice (instead of plain indexing) is what gets compared
        # against the full matrix.
        row_similarities = cosine_similarity(tfidf_matrix[doc_idx:doc_idx + 1],
                                             tfidf_matrix)
        print(row_similarities)
Example #25
def cosine(x1,x2):
    """Return the cosine similarity of two rating vectors.

    Both vectors are first passed through ``common`` (find common ratings);
    the similarity is the dot product divided by the product of the norms.

    :param x1: first rating vector.
    :param x2: second rating vector.
    :return: the similarity, or ``0`` when the denominator is zero.
    """
    # find common ratings
    new_x1, new_x2 = common(x1, x2)
    # compute the cosine similarity between two vectors
    dot_product = new_x1.dot(new_x2)  # renamed: no longer shadows builtin sum
    denom = sqrt(new_x1.dot(new_x1) * new_x2.dot(new_x2))
    # An explicit check covers both Python floats (which would raise
    # ZeroDivisionError) and NumPy scalars (which would yield nan/inf with a
    # warning instead of raising).
    if denom == 0:
        return 0
    return float(dot_product) / denom

    # return cosine_similarity(x1,x2)[0][0]
Example #26
def s_norm(self, test, enroll):
    """ Run speaker normalization (S-Norm) on cached embeddings.

    Each raw test/enroll score is normalised twice - with the mean and
    standard deviation of the test embedding's scores against
    ``self.embeddings`` and with those of the enroll embedding - and the two
    normalised values are averaged.  Scores come from ``self.plda.score``
    when ``self.plda`` is truthy and from ``cosine_similarity`` otherwise.

    Args:
        test (np.array): test embeddings, one per row
        enroll (np.array): enroll embeddings, one per row

    Returns:
        np.array: normalised scores, indexed ``[test row][enroll row]``
    """
    if self.plda:
        a = self.plda.score(test, self.embeddings).T
        b = self.plda.score(enroll, self.embeddings).T
        c = self.plda.score(enroll, test).T
    else:
        a = cosine_similarity(test, self.embeddings).T
        b = cosine_similarity(enroll, self.embeddings).T
        c = cosine_similarity(enroll, test).T

    # The statistics of one embedding do not depend on what it is paired
    # with, so compute them once instead of inside the double loop.
    test_stats = [(np.mean(a.T[ii]), np.std(a.T[ii]))
                  for ii in range(test.shape[0])]
    enroll_stats = [(np.mean(b.T[jj]), np.std(b.T[jj]))
                    for jj in range(enroll.shape[0])]

    scores = []
    for ii, (test_mean, test_std) in enumerate(test_stats):
        test_scores = []
        for jj, (enroll_mean, enroll_std) in enumerate(enroll_stats):
            s = c[ii][jj]
            test_scores.append((((s - test_mean) / test_std + (s - enroll_mean) / enroll_std) / 2))
        scores.append(test_scores)
    return np.array(scores)
Example #27
def predict(self, users_ids, sessions_items, topk=5, valid_items=None):
    """Recommend, for every clicked item, the ``topk`` most similar items.

    Similarity is the cosine similarity between the content embedding of the
    clicked item and the embeddings of the candidate items.  Candidates are
    the unique values of ``valid_items`` when it is given, otherwise the
    unique non-zero entries of the recent-clicks buffer.

    NOTE(review): ``valid_items`` defaults to ``None`` but is indexed
    unconditionally inside the loop - confirm that callers always pass it.

    :param users_ids: not referenced in this block.
    :param sessions_items: 2-D array of item ids; ``0`` entries are skipped.
    :param topk: number of predictions stored per position.
    :param valid_items: candidate item ids, indexed ``[row, column]``.
    :return: int64 array shaped ``(sessions_items.shape[0],
        sessions_items.shape[1], topk)``; skipped positions stay zero.
    """
    acr_embeddings = self.eval_benchmark_params['content_article_embeddings_matrix']

    recent_items_buffer = self.clicked_items_state.get_recent_clicks_buffer()
    if valid_items is not None:
        recent_unique_item_ids = np.unique(valid_items)
    else:
        recent_unique_item_ids = np.unique([recent_items_buffer[np.nonzero(recent_items_buffer)]])

    acr_embeddings_recent_items = acr_embeddings[recent_unique_item_ids]

    n_sessions, session_length = sessions_items.shape[0], sessions_items.shape[1]
    session_predictions = np.zeros(dtype=np.int64,
                                   shape=[n_sessions, session_length, topk])

    for row_idx, session_items in enumerate(sessions_items):
        for col_idx, item in enumerate(session_items):
            if item == 0:
                continue

            # Cosine similarity between this item and all candidate items.
            # The current item does not need to be excluded explicitly: per
            # the original author it is never among the valid items (next
            # click + negative samples not present in the session).
            item_embedding = acr_embeddings[item].reshape(1, -1)
            similarities = cosine_similarity(item_embedding,
                                             acr_embeddings_recent_items)[0]
            most_similar_first = np.argsort(similarities, axis=0)[::-1]
            similar_items_ids = recent_unique_item_ids[most_similar_first]

            session_predictions[row_idx, col_idx] = list(
                self._get_top_n_valid_items(similar_items_ids, topk,
                                            valid_items[row_idx, col_idx]))

    return session_predictions
Example #28
def visualize_between_sentences(sentences, vec_list, palette="Viridis256",
                                filename="/notebooks/embedding/between-sentences.png",
                                use_notebook=False):
    """Draw a heat map of the pairwise cosine similarity between sentences.

    Pairs in which either vector is all zeros are left out.  The colour
    mapper's ``low`` is set to the largest observed score and its ``high``
    to the smallest.

    NOTE(review): ``color_bar`` is constructed but never attached to the
    figure in this block - confirm whether an ``add_layout`` call is missing.

    :param sentences: sentences used as axis labels.
    :param vec_list: vectors, indexed like ``sentences``.
    :param palette: palette name for the colour mapper.
    :param filename: PNG path used when ``use_notebook`` is false.
    :param use_notebook: show inline instead of exporting a PNG.
    """
    records, score_list = [], []
    for first_idx, first_sentence in enumerate(sentences):
        for second_idx, second_sentence in enumerate(sentences):
            first_vec, second_vec = vec_list[first_idx], vec_list[second_idx]
            if not (np.any(first_vec) and np.any(second_vec)):
                continue
            similarity = cosine_similarity(X=[first_vec], Y=[second_vec])[0][0]
            records.append({'x': first_sentence, 'y': second_sentence, 'similarity': similarity})
            score_list.append(similarity)
    df = pd.DataFrame(records)

    color_mapper = LinearColorMapper(palette=palette, low=np.max(score_list), high=np.min(score_list))
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    hover_fields = [('sentences', '@x @y'), ('similarity', '@similarity')]
    p = figure(x_range=sentences, y_range=list(reversed(sentences)),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=hover_fields)

    # Strip grid lines, axis lines and ticks so only the coloured cells show.
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3

    # One unit square per (x, y) pair, coloured by its similarity.
    cell_fill = {'field': 'similarity', 'transform': color_mapper}
    p.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color=cell_fill,
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))

    if not use_notebook:
        export_png(p, filename)
        print("save @ " + filename)
    else:
        output_notebook()
        show(p)
Example #29
def visualize_between_words(words, vecs, palette="Viridis256", filename="/notebooks/embedding/between-words.png",
                            use_notebook=False):
    """Draw a heat map of the pairwise cosine similarity between words.

    Pairs in which either vector is all zeros are left out.  The colour
    mapper uses a fixed range with ``low=1`` and ``high=0``.

    NOTE(review): ``color_bar`` is constructed but never attached to the
    figure in this block - confirm whether an ``add_layout`` call is missing.

    :param words: words used as axis labels.
    :param vecs: vectors, indexed like ``words``.
    :param palette: palette name for the colour mapper.
    :param filename: PNG path used when ``use_notebook`` is false.
    :param use_notebook: show inline instead of exporting a PNG.
    """
    records = []
    for first_idx, first_word in enumerate(words):
        for second_idx, second_word in enumerate(words):
            first_vec = vecs[first_idx]
            second_vec = vecs[second_idx]
            if not (np.any(first_vec) and np.any(second_vec)):
                continue
            similarity = cosine_similarity(X=[first_vec], Y=[second_vec])[0][0]
            records.append({'x': first_word, 'y': second_word, 'similarity': similarity})
    df = pd.DataFrame(records)

    color_mapper = LinearColorMapper(palette=palette, low=1, high=0)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    hover_fields = [('words', '@x @y'), ('similarity', '@similarity')]
    p = figure(x_range=list(words), y_range=list(reversed(list(words))),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=hover_fields)

    # Strip grid lines, axis lines and ticks so only the coloured cells show.
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3

    # One unit square per (x, y) pair, coloured by its similarity.
    cell_fill = {'field': 'similarity', 'transform': color_mapper}
    p.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color=cell_fill,
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))

    if not use_notebook:
        export_png(p, filename)
        print("save @ " + filename)
    else:
        output_notebook()
        show(p)
Example #30
def __call__(self, text):
    """Match ``text`` against the stored questions or score a sentence pair.

    * ``self.tfrecords_mode == 'point'``: ``text`` must hold two sentences
      separated by ``||``; the first prediction and score for that pair are
      returned as ``(pred, score)``.
    * otherwise ``self.sim_mode`` selects the matching strategy.  ``'cross'``
      scores ``text`` against every candidate text with ``self._get_label``;
      ``'represent'`` embeds ``text`` and compares it with the candidate
      vectors (euclidean distance when ``self.is_distance`` is set, cosine
      similarity otherwise).

    Custom entries in ``self.zdy`` are placed in front of the stored ones.

    :param text: input sentence, or ``"a||b"`` in point mode.
    :return: ``(pred, score)`` in point mode, otherwise
        ``(label, score, selected_id, matched_text)``.
    :raises ValueError: if ``self.sim_mode`` is neither ``'cross'`` nor
        ``'represent'``.
    """
    if self.tfrecords_mode == 'point':
        assert text.find('||') != -1,"input should cotain two sentences seperated by ||"
        parts = text.split('||')
        text_a = parts[0]
        text_b = parts[-1]
        pred,score = self._get_label([text_a], [text_b], need_preprocess = True)
        return pred[0][0], score[0][0]

    # Load custom questions (custom entries take priority)
    if self.sim_mode == 'cross':
        text_list = self.text_list
        label_list = self.label_list
        if self.zdy != {}:
            text_list = self.zdy['text_list'] + text_list
            label_list = self.zdy['label_list'] + label_list
        # Fix: score against the merged list so that ``selected_id`` indexes
        # the same sequence as ``label_list`` (the original scored against
        # ``self.text_list`` only).
        pred,score = self._get_label([text], text_list, need_preprocess = True)
        selected_id = np.argmax(score)
        out_score = score[selected_id]
    elif self.sim_mode == 'represent':
        text_list = self.text_list
        vec_list = self.vec_list
        label_list = self.label_list
        if self.zdy != {}:
            text_list = self.zdy['text_list'] + text_list
            vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis = 0)
            label_list = self.zdy['label_list'] + label_list
        vec = self._get_vecs([text], need_preprocess = True)
        if self.is_distance:
            scores = euclidean_distances(vec, vec_list)[0]
            selected_id = np.argmin(scores)
            out_score = 1 - scores[selected_id]
        else:
            scores = cosine_similarity(vec, vec_list)[0]
            selected_id = np.argmax(scores)
            out_score = scores[selected_id]
    else:
        raise ValueError('unknown sim mode, represent or cross?')
    # Fix: look the matched text up in the merged list; ``self.text_list`` is
    # shifted by the number of custom entries whenever ``self.zdy`` is set.
    ret = (label_list[selected_id], out_score, selected_id,
           text_list[selected_id])
    return ret