Python sklearn.metrics.pairwise.cosine_similarity() Examples
The following are 30 code examples of sklearn.metrics.pairwise.cosine_similarity().
You may also want to check out all available functions/classes of the module sklearn.metrics.pairwise.
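
Before diving into the examples, here is a minimal, self-contained sketch of the basic API (toy data, not taken from any of the projects below): cosine_similarity expects 2-D arrays and returns one score per row pair.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Two "documents" and one "query", all as rows of 2-D arrays.
X = np.array([[1.0, 0.0, 1.0],
              [0.0, 1.0, 1.0]])
Y = np.array([[1.0, 0.0, 0.0]])

# Result has shape (n_samples_X, n_samples_Y): one score per row pair.
print(cosine_similarity(X, Y))   # [[0.70710678], [0.0]]

# With a single argument, similarities are computed among the rows of X.
print(cosine_similarity(X))      # 2x2 matrix with ones on the diagonal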

Example #1
Source Project: DeepLearn Author: GauravBh1010tt File: utility.py License: MIT License
def cos_sim(ind1, ind2=1999):
    view1 = np.load("test_v1.npy")[0:ind1]
    view2 = np.load("test_v2.npy")[0:ind2]
    MAP = 0
    for i, j in enumerate(view1):
        val = []
        AP = 0
        for x in view2:
            val.append(cosine_similarity(j, x)[0].tolist())
        val = [(q, p) for p, q in enumerate(val)]
        val.sort()
        val.reverse()
        t = [w[1] for w in val[0:7]]
        for x, y in enumerate(t):
            if y in range(i, i + 5):
                AP += 1 / (x + 1)
        print(t)
        print(AP)
        MAP += AP
    print('MAP is : ', MAP / ind1)
Example #2
Source Project: Mastering-Elasticsearch-7.0 Author: PacktPublishing File: test_pairwise.py License: MIT License
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
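
This test leans on the identity that, on L2-normalized rows, the cosine kernel reduces to a plain dot product. A minimal standalone sketch of the same identity using cosine_similarity directly (toy data, assuming only scikit-learn and NumPy):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))

# cosine_similarity(X) equals linear_kernel(normalize(X)) up to rounding.
np.testing.assert_allclose(cosine_similarity(X),
                           linear_kernel(normalize(X)))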
Example #3
Source Project: driverlessai-recipes Author: h2oai File: text_embedding_similarity_transformers.py License: Apache License 2.0
def transform(self, X: dt.Frame):
    X.replace([None, math.inf, -math.inf], self._repl_val)
    from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
    if self.embedding_name in ["glove", "en"]:
        self.embedding = WordEmbeddings(self.embedding_name)
    elif self.embedding_name in ["bert"]:
        self.embedding = BertEmbeddings()
    self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values
    for ind, text1 in enumerate(text1_arr):
        try:
            text1 = Sentence(str(text1).lower())
            self.doc_embedding.embed(text1)
            text2 = text2_arr[ind]
            text2 = Sentence(str(text2).lower())
            self.doc_embedding.embed(text2)
            score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                      text2.get_embedding().reshape(1, -1))[0, 0]
            output.append(score)
        except:
            output.append(-99)
    return np.array(output)
Example #4
Source Project: FaceRecognition-RestApi Author: Jinnrry File: faceApi.py License: MIT License
def compared(request):
    if request.method == 'POST':
        if len(request.FILES) != 2:
            return HttpResponse('{"status":false,"data":"","msg":"invalid image parameters!"}')
        starttime = time.time()
        name1 = str(random.randint(10000, 99999)) + str(time.time())  # random file name
        name2 = str(random.randint(10000, 99999)) + str(time.time())
        handle_uploaded_file(request.FILES['face1'], str(name1))
        handle_uploaded_file(request.FILES['face2'], str(name2))
        tz1 = get_feature(root + "RestServer/upload/" + str(name1))
        tz2 = get_feature(root + "RestServer/upload/" + str(name2))
        comparedValue = pw.cosine_similarity(tz1, tz2)[0][0]
        os.remove(root + "RestServer/upload/" + str(name1))
        os.remove(root + "RestServer/upload/" + str(name2))
        endtime = time.time()
        Runtime = endtime - starttime
        return HttpResponse('{"status":true,"data":"' + str(comparedValue) + '","msg":"success","runtime": ' + str(Runtime) + ' }')
    else:
        return HttpResponse('{"status":false,"data":"","msg":"invalid request"}')
    return HttpResponse('{"status":false,"data":"","msg":"unknown error"}')
Example #5
Source Project: altair Author: Lab41 File: app.py License: Apache License 2.0
def get_closest_docs(uri):
    r = requests.get(uri)
    if r.status_code == 200:
        user_doc = r.text
        print("URI content length", len(user_doc))
        code, _ = separate_code_and_comments(user_doc, "user doc")
        normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)
        model.random.seed(0)
        user_vector = model.infer_vector(normalized_code)
        print("finding similar...")
        sys.stdout.flush()
        stored_urls = list()
        stored_vectors = list()
        for url in vectors:
            stored_urls.append(url)
            stored_vectors.append(vectors[url])
        pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
        indices = (-pair_sims[0]).argsort()[:5]
        return [(stored_urls[index], round(float(pair_sims[0][index]), 2)) for index in indices]
    else:
        print("URL returned status code", r.status_code)
        raise ValueError('URL error')
Example #6
Source Project: HarvestText Author: blmoistawinde File: entity_discoverer.py License: MIT License
def clustering(self, threshold):
    """Cluster entities separately for each part-of-speech/entity type.

    :return: partition: dict {word_id: cluster_id}
    """
    print("Louvain clustering")
    partition = {}
    part_offset = 0
    for etype, ners in self.type_entity_dict.items():
        sub_id_mapping = [self.word2id[ner0] for ner0 in ners if ner0 in self.word2id]
        if len(sub_id_mapping) == 0:
            continue
        emb_mat_sub = self.emb_mat[sub_id_mapping, :]
        cos_sims = cosine_similarity(emb_mat_sub)
        cos_sims -= np.eye(len(emb_mat_sub))
        adj_mat = (cos_sims > threshold).astype(int)
        G = nx.from_numpy_array(adj_mat)
        partition_sub = community.best_partition(G)
        for sub_id, main_id in enumerate(sub_id_mapping):
            sub_part_id = partition_sub[sub_id]
            partition[main_id] = sub_part_id + part_offset
        part_offset += max(partition_sub.values()) + 1
    return partition
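
The core trick above, thresholding a cosine-similarity matrix into an unweighted graph adjacency, can be seen in isolation with toy embeddings; a minimal sketch (the networkx/community clustering steps are omitted):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

emb = np.array([[1.0, 0.0],
                [0.9, 0.1],
                [0.0, 1.0]])
threshold = 0.8

cos_sims = cosine_similarity(emb)
cos_sims -= np.eye(len(emb))           # zero out self-similarity
adj_mat = (cos_sims > threshold).astype(int)
print(adj_mat)  # rows 0 and 1 are connected; row 2 is isolated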
Example #7
Source Project: fnc-1 Author: Cisco-Talos File: helpers.py License: Apache License 2.0
def cosine_sim(x, y):
    try:
        if type(x) is np.ndarray:
            x = x.reshape(1, -1)  # get rid of the warning
        if type(y) is np.ndarray:
            y = y.reshape(1, -1)
        d = cosine_similarity(x, y)
        d = d[0][0]
    except:
        print(x)
        print(y)
        d = 0.
    return d
Example #8
Source Project: CIKM-AnalytiCup-2018 Author: zake7749 File: feature_engineering.py License: Apache License 2.0
def _get_similarity_values(self, q1_csc, q2_csc):
    # cs, md, ed, jsc and minkowski_dis are presumably module-level aliases for the
    # cosine, Manhattan, Euclidean, Jaccard and Minkowski metrics, imported
    # elsewhere in the project.
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []

    for i, j in zip(q1_csc, q2_csc):
        sim = cs(i, j)
        cosine_sim.append(sim[0][0])
        sim = md(i, j)
        manhattan_dis.append(sim[0][0])
        sim = ed(i, j)
        eucledian_dis.append(sim[0][0])
        i_ = i.toarray()
        j_ = j.toarray()
        try:
            sim = jsc(i_, j_)
            jaccard_dis.append(sim)
        except:
            jaccard_dis.append(0)
        sim = minkowski_dis.pairwise(i_, j_)
        minkowsk_dis.append(sim[0][0])
    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
Example #9
Source Project: Hands-on-Supervised-Machine-Learning-with-Python Author: PacktPublishing File: itemitem.py License: MIT License
def _compute_sim(self, R, k):
    # compute the similarity between all the items. This calculates the
    # similarity between each ITEM
    sim = cosine_similarity(R.T)

    # Only keep the similarities of the top K, setting all others to zero
    # (negative since we want descending)
    not_top_k = np.argsort(-sim, axis=1)[:, k:]  # shape=(n_items, k)

    if not_top_k.shape[1]:  # only if there are cols (k < n_items)
        # now we have to set these to zero in the similarity matrix
        row_indices = np.repeat(range(not_top_k.shape[0]), not_top_k.shape[1])
        sim[row_indices, not_top_k.ravel()] = 0.
    return sim
Example #10
Source Project: pyts Author: johannfaouzi File: saxvsm.py License: BSD 3-Clause "New" or "Revised" License
def decision_function(self, X):
    """Evaluate the cosine similarity between document-term matrix and X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_timestamps)
        Test samples.

    Returns
    -------
    X : array-like, shape (n_samples, n_classes)
        Cosine similarity between the document-term matrix and X.

    """
    check_is_fitted(self, ['vocabulary_', 'tfidf_', 'idf_', '_tfidf', 'classes_'])
    X = check_array(X)

    X_bow = self._bow.transform(X)
    vectorizer = CountVectorizer(vocabulary=self._tfidf.vocabulary_)
    X_transformed = vectorizer.transform(X_bow).toarray()
    return cosine_similarity(X_transformed, self.tfidf_)
Example #11
Source Project: region Author: pysal File: test_skater.py License: BSD 3-Clause "New" or "Revised" License
def test_init():
    default = Spanning_Forest()
    assert default.metric == skm.manhattan_distances
    assert default.center == np.mean
    assert default.reduction == np.sum

    change = Spanning_Forest(dissimilarity=skm.euclidean_distances,
                             center=np.median,
                             reduction=np.max)
    assert change.metric == skm.euclidean_distances
    assert change.center == np.median
    assert change.reduction == np.max

    sym = Spanning_Forest(affinity=skm.cosine_similarity)
    assert isinstance(sym.metric, types.LambdaType)
    test_distance = -np.log(skm.cosine_similarity(data[:2,]))
    comparator = sym.metric(data[:2,])
    np.testing.assert_allclose(test_distance, comparator)
Example #12
Source Project: keras-glove Author: erwtokritos File: save_utils.py License: MIT License
def save_model(model: Model, tokenizer: Tokenizer):
    """
    Saves the important parts of the model
    :param model: Keras model to save
    :param tokenizer: Keras Tokenizer to save
    """
    for layer in model.layers:
        if '_biases' in layer.name or '_embeddings' in layer.name:
            np.save(file=f'{OUTPUT_FOLDER}{layer.name}', arr=layer.get_weights()[0])

    # save tokenizer
    pickle.dump(obj=tokenizer.index_word, file=open(f'{OUTPUT_FOLDER}{INDEX2WORD}', 'wb'))
    pickle.dump(obj=tokenizer.word_index, file=open(f'{OUTPUT_FOLDER}{WORD2INDEX}', 'wb'))

    # save combined embeddings & correlation matrix
    agg_embeddings = np.load(f'{OUTPUT_FOLDER}{CENTRAL_EMBEDDINGS}.npy') + \
                     np.load(f'{OUTPUT_FOLDER}{CONTEXT_EMBEDDINGS}.npy')
    np.save(file=f'{OUTPUT_FOLDER}{AGGREGATED_EMBEDDINGS}', arr=agg_embeddings)
    np.save(file=f'{OUTPUT_FOLDER}{CORRELATION_MATRIX}',
            arr=cosine_similarity(cosine_similarity(agg_embeddings)))
Example #13
Source Project: twitter-stock-recommendation Author: alvarobartt File: test_pairwise.py License: MIT License
def test_cosine_similarity():
    # Test the cosine_similarity.
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2)
Example #14
Source Project: scattertext Author: JasonKessler File: CategoryProjectorEvaluator.py License: Apache License 2.0
def evaluate(self, category_projection):
    assert issubclass(type(category_projection), CategoryProjectionBase)
    topics = category_projection.get_nearest_terms()
    total_similarity = 0
    for topic in topics.values():
        topic_vectors = np.array([self.get_vector(term) for term in topic])
        sim_matrix = cosine_similarity(topic_vectors)
        tril_sim_matrix = np.tril(sim_matrix)
        mean_similarity = tril_sim_matrix.sum() / (tril_sim_matrix.shape[0] ** 2 - tril_sim_matrix.shape[0]) / 2
        total_similarity += mean_similarity
    return total_similarity / len(topics)
Example #15
Source Project: fever-naacl-2018 Author: sheffieldnlp File: fever_features.py License: Apache License 2.0
def process(self, data):
    claim_bow = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    body_texts = self.texts(data)
    body_bow = self.bow_vectorizer.transform(body_texts)
    body_tfs = self.tfreq_vectorizer.transform(body_bow)
    body_tfidf = self.tfidf_vectorizer.transform(body_texts)

    cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])
    return hstack([body_tfs, claim_tfs, cosines])
Example #16
Source Project: fever-naacl-2018 Author: sheffieldnlp File: process_tfidf_grid.py License: Apache License 2.0
def process(self, data):
    claim_bow = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    body_texts = self.texts(data)
    body_bow = self.bow_vectorizer.transform(body_texts)
    body_tfs = self.tfreq_vectorizer.transform(body_bow)
    body_tfidf = self.tfidf_vectorizer.transform(body_texts)

    cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])
    return cosines
Example #17
Source Project: fever-naacl-2018 Author: sheffieldnlp File: process_tfidf.py License: Apache License 2.0
def process(self, data):
    claim_bow = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    body_texts = self.texts(data)
    body_bow = self.bow_vectorizer.transform(body_texts)
    body_tfs = self.tfreq_vectorizer.transform(body_bow)
    body_tfidf = self.tfidf_vectorizer.transform(body_texts)

    cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])
    return cosines
Example #18
Source Project: Clothing-Detection Author: simaiden File: utils.py License: GNU General Public License v3.0
def closest_distances(query_vector, all_feat_vecs, norm='euclidian', num=3):
    if norm == 'euclidian':
        dist = np.linalg.norm(query_vector - all_feat_vecs, axis=1)
    if norm == 'cosine':
        dist = 1 - cosine_similarity(query_vector.reshape(1, -1), all_feat_vecs)[0]
    idxs = np.arange(0, dist.shape[0])
    return idxs[dist.argsort()][:num]
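
The 1 - cosine_similarity(...) expression above turns a similarity into a distance; scikit-learn also exposes this directly as sklearn.metrics.pairwise.cosine_distances. A small sketch of the equivalence (illustrative vectors):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

q = np.array([[0.2, 0.8, 0.1]])
feats = np.array([[0.2, 0.8, 0.1],
                  [0.9, 0.1, 0.0]])

# cosine_distances is exactly 1 - cosine_similarity.
np.testing.assert_allclose(cosine_distances(q, feats),
                           1 - cosine_similarity(q, feats))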
Example #19
Source Project: SOQAL Author: husseinmozannar File: embedding_match.py License: MIT License
def read(self, P, Q):
    A = self.get_answer_canditates(P)
    A_embed = []
    for a in A:
        A_embed.append(self.embedder.embed(a))
    Q_embed = self.embedder.embed(Q)
    similarities_raw = cosine_similarity(A_embed, Q_embed.reshape(1, -1))
    similarities = [s[0] for s in similarities_raw]
    indices_sorted = np.argsort(similarities)[::-1]  # reverse order
    return A[indices_sorted[0]]
Example #20
Source Project: SOQAL Author: husseinmozannar File: tfidf_reader.py License: MIT License
def read(self, P, Q):
    Q = self.stem_string(Q)
    query_tfidf = self.vectorizer.transform([Q])
    similarities_raw = cosine_similarity(self.tfidf_matrix, query_tfidf)
    similarities = []
    for s in similarities_raw:
        similarities.append(s[0])
    max_index = np.argmax(similarities)
    return self.docs[max_index]
Example #21
Source Project: SOQAL Author: husseinmozannar File: EmbeddingRetriever.py License: MIT License
def get_topk_docs(self, query):
    """
    :param query: a string
    :return: top documents according to cosine similarity of embeddings
    """
    emb_query = self.embed_string(query)
    similarities_raw = cosine_similarity(self.emb_matrix, emb_query.reshape(1, -1))
    similarities = [s[0] for s in similarities_raw]
    indices_sorted = np.argsort(similarities)[::-1]  # reverse order
    topk_docs = []
    for i in range(0, self.k):
        topk_docs.append(self.docs[indices_sorted[i]])
    return topk_docs
Example #22
Source Project: kaggle-HomeDepot Author: ChenglongChen File: dist_utils.py License: MIT License
def _cosine_sim(vec1, vec2):
    try:
        s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    except:
        try:
            s = cosine_similarity(vec1, vec2)[0][0]
        except:
            s = config.MISSING_VALUE_NUMERIC
    return s
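
The reshape(1, -1) calls above exist because cosine_similarity expects 2-D arrays of shape (n_samples, n_features), so a bare 1-D vector must first be promoted to a single-row matrix; a minimal illustration (made-up vectors):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

vec1 = np.array([1.0, 2.0, 3.0])  # 1-D: shape (3,)
vec2 = np.array([3.0, 2.0, 1.0])

# Promote each vector to a single-row matrix and take the lone
# entry of the 1x1 result.
s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
print(s)  # 0.714...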
Example #23
Source Project: tokenquery Author: ramtinms File: vector_opr.py License: GNU General Public License v3.0
def vec_cos_sim(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print('len of vectors does not match')
                return False
            if operation_string == "=" or operation_string == "==":
                return cosine_similarity(token_vector, ref_vector) == cond_value
            elif operation_string == "<":
                return cosine_similarity(token_vector, ref_vector) < cond_value
            elif operation_string == ">":
                return cosine_similarity(token_vector, ref_vector) > cond_value
            elif operation_string == ">=":
                return cosine_similarity(token_vector, ref_vector) >= cond_value
            elif operation_string == "<=":
                return cosine_similarity(token_vector, ref_vector) <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return cosine_similarity(token_vector, ref_vector) != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False
    else:
        # TODO raise tokenregex error
        print('Problem with the operation input')
Example #24
Source Project: Natural-Language-Processing-with-Python-Cookbook Author: PacktPublishing File: Similarity.py License: MIT License
def cosineSimilarity(self):
    vec = TfidfVectorizer()
    matrix = vec.fit_transform(self.statements)
    for j in range(1, 5):
        i = j - 1
        print("\tsimilarity of document {} with others".format(i))
        similarity = cosine_similarity(matrix[i:j], matrix)
        print(similarity)
Example #25
Source Project: SDLib Author: Coder-Yu File: qmath.py License: GNU General Public License v3.0
def cosine(x1, x2):
    # find common ratings
    new_x1, new_x2 = common(x1, x2)
    # compute the cosine similarity between two vectors
    sum = new_x1.dot(new_x2)
    denom = sqrt(new_x1.dot(new_x1) * new_x2.dot(new_x2))
    try:
        return float(sum) / denom
    except ZeroDivisionError:
        return 0
    # return cosine_similarity(x1, x2)[0][0]
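
For reference, the hand-rolled formula above matches what cosine_similarity computes on dense vectors, cos(x, y) = x.y / (||x|| ||y||); a quick check with made-up ratings:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

x1 = np.array([4.0, 5.0, 1.0])
x2 = np.array([5.0, 3.0, 2.0])

manual = x1.dot(x2) / np.sqrt(x1.dot(x1) * x2.dot(x2))
sklearn_val = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0][0]
np.testing.assert_allclose(manual, sklearn_val)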
Example #26
Source Project: VBDiarization Author: Jamiroquai88 File: normalization.py License: Apache License 2.0
def s_norm(self, test, enroll):
    """ Run speaker normalization (S-Norm) on cached embeddings.

    Args:
        test (np.array): test embedding
        enroll (np.array): enroll embedding

    Returns:
        float: hypothesis
    """
    if self.plda:
        a = self.plda.score(test, self.embeddings).T
        b = self.plda.score(enroll, self.embeddings).T
        c = self.plda.score(enroll, test).T
    else:
        a = cosine_similarity(test, self.embeddings).T
        b = cosine_similarity(enroll, self.embeddings).T
        c = cosine_similarity(enroll, test).T
    scores = []
    for ii in range(test.shape[0]):
        test_scores = []
        for jj in range(enroll.shape[0]):
            test_mean, test_std = np.mean(a.T[ii]), np.std(a.T[ii])
            enroll_mean, enroll_std = np.mean(b.T[jj]), np.std(b.T[jj])
            s = c[ii][jj]
            test_scores.append(((s - test_mean) / test_std + (s - enroll_mean) / enroll_std) / 2)
        scores.append(test_scores)
    return np.array(scores)
Example #27
Source Project: chameleon_recsys Author: gabrielspmoreira File: content_based.py License: MIT License
def predict(self, users_ids, sessions_items, topk=5, valid_items=None):
    acr_embeddings = self.eval_benchmark_params['content_article_embeddings_matrix']
    recent_items_buffer = self.clicked_items_state.get_recent_clicks_buffer()

    if valid_items is None:
        recent_unique_item_ids = np.unique([recent_items_buffer[np.nonzero(recent_items_buffer)]])
    else:
        recent_unique_item_ids = np.unique(valid_items)

    acr_embeddings_recent_items = acr_embeddings[recent_unique_item_ids]

    session_predictions = np.zeros(dtype=np.int64,
                                   shape=[sessions_items.shape[0], sessions_items.shape[1], topk])
    for row_idx, session_items in enumerate(sessions_items):
        for col_idx, item in enumerate(session_items):
            if item != 0:
                # Computing cosine similarity between this item and all recent items (from buffer).
                # P.s. No need to ignore the current item (whose similarity is always 1), because
                # this item will not be among the valid items (next click + negative samples
                # not present in the session)
                similarities = cosine_similarity(acr_embeddings[item].reshape(1, -1),
                                                 acr_embeddings_recent_items)[0]
                similar_items_sorted_idx = np.argsort(similarities, axis=0)[::-1]
                similar_items_ids = recent_unique_item_ids[similar_items_sorted_idx]
                session_predictions[row_idx, col_idx] = list(
                    self._get_top_n_valid_items(similar_items_ids, topk, valid_items[row_idx, col_idx]))
    return session_predictions
Example #28
Source Project: embedding Author: ratsgo File: visualize_utils.py License: MIT License
def visualize_between_sentences(sentences, vec_list, palette="Viridis256",
                                filename="/notebooks/embedding/between-sentences.png",
                                use_notebook=False):
    df_list, score_list = [], []
    for sent1_idx, sentence1 in enumerate(sentences):
        for sent2_idx, sentence2 in enumerate(sentences):
            vec1, vec2 = vec_list[sent1_idx], vec_list[sent2_idx]
            if np.any(vec1) and np.any(vec2):
                score = cosine_similarity(X=[vec1], Y=[vec2])
                df_list.append({'x': sentence1, 'y': sentence2, 'similarity': score[0][0]})
                score_list.append(score[0][0])
    df = pd.DataFrame(df_list)
    color_mapper = LinearColorMapper(palette=palette, low=np.max(score_list), high=np.min(score_list))
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(x_range=sentences, y_range=list(reversed(sentences)),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=[('sentences', '@x @y'), ('similarity', '@similarity')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3
    p.rect(x="x", y="y", width=1, height=1, source=df,
           fill_color={'field': 'similarity', 'transform': color_mapper},
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper,
                         major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    if use_notebook:
        output_notebook()
        show(p)
    else:
        export_png(p, filename)
        print("save @ " + filename)
Example #29
Source Project: embedding Author: ratsgo File: visualize_utils.py License: MIT License
def visualize_between_words(words, vecs, palette="Viridis256",
                            filename="/notebooks/embedding/between-words.png",
                            use_notebook=False):
    df_list = []
    for word1_idx, word1 in enumerate(words):
        for word2_idx, word2 in enumerate(words):
            vec1 = vecs[word1_idx]
            vec2 = vecs[word2_idx]
            if np.any(vec1) and np.any(vec2):
                score = cosine_similarity(X=[vec1], Y=[vec2])
                df_list.append({'x': word1, 'y': word2, 'similarity': score[0][0]})
    df = pd.DataFrame(df_list)
    color_mapper = LinearColorMapper(palette=palette, low=1, high=0)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(x_range=list(words), y_range=list(reversed(list(words))),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=[('words', '@x @y'), ('similarity', '@similarity')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3
    p.rect(x="x", y="y", width=1, height=1, source=df,
           fill_color={'field': 'similarity', 'transform': color_mapper},
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper,
                         major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    if use_notebook:
        output_notebook()
        show(p)
    else:
        export_png(p, filename)
        print("save @ " + filename)
Example #30
Source Project: nlp_research Author: zhufz File: test_match.py License: MIT License
def __call__(self, text):
    if self.tfrecords_mode == 'point':
        assert text.find('||') != -1, "input should contain two sentences separated by ||"
        text_a = text.split('||')[0]
        text_b = text.split('||')[-1]
        pred, score = self._get_label([text_a], [text_b], need_preprocess=True)
        return pred[0][0], score[0][0]
    # load custom questions (custom ones take priority)
    if self.sim_mode == 'cross':
        text_list = self.text_list
        label_list = self.label_list
        if self.zdy != {}:
            text_list = self.zdy['text_list'] + text_list
            label_list = self.zdy['label_list'] + label_list
        pred, score = self._get_label([text], self.text_list, need_preprocess=True)
        selected_id = np.argmax(score)
        out_score = score[selected_id]
    elif self.sim_mode == 'represent':
        text_list = self.text_list
        vec_list = self.vec_list
        label_list = self.label_list
        if self.zdy != {}:
            text_list = self.zdy['text_list'] + text_list
            vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis=0)
            label_list = self.zdy['label_list'] + label_list
        vec = self._get_vecs([text], need_preprocess=True)
        if self.is_distance:
            scores = euclidean_distances(vec, vec_list)[0]
            selected_id = np.argmin(scores)
            out_score = 1 - scores[selected_id]
        else:
            scores = cosine_similarity(vec, vec_list)[0]
            selected_id = np.argmax(scores)
            out_score = scores[selected_id]
    else:
        raise ValueError('unknown sim mode, represent or cross?')
    ret = (label_list[selected_id], out_score, selected_id,
           self.text_list[selected_id])
    return ret