Python sklearn.metrics.pairwise.cosine_similarity() Examples

The following are 30 code examples of sklearn.metrics.pairwise.cosine_similarity(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out the other functions and classes available in the sklearn.metrics.pairwise module.
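Before the examples, a minimal self-contained sketch of the call itself (the data here is illustrative): cosine_similarity expects 2-D inputs and returns an (n_samples_X, n_samples_Y) matrix of similarities.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

X = np.array([[1.0, 0.0, 1.0],
              [0.0, 1.0, 1.0]])
Y = np.array([[1.0, 0.0, 0.0]])

print(cosine_similarity(X, Y))  # shape (2, 1): each row of X vs. each row of Y
print(cosine_similarity(X))     # shape (2, 2): X against itself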
Example #1
Source Project: DeepLearn   Author: GauravBh1010tt   File: utility.py    License: MIT License
def cos_sim(ind1, ind2=1999):
    view1 = np.load("test_v1.npy")[0:ind1]
    view2 = np.load("test_v2.npy")[0:ind2]
    MAP = 0
    for i, j in enumerate(view1):
        val = []
        AP = 0
        for x in view2:
            # single vectors must be 2-D for cosine_similarity
            val.append(cosine_similarity(j.reshape(1, -1), x.reshape(1, -1))[0].tolist())
        # attach each score's index, then sort scores in descending order
        val = [(q, p) for p, q in enumerate(val)]
        val.sort()
        val.reverse()
        t = [w[1] for w in val[0:7]]  # indices of the 7 closest view2 vectors
        for x, y in enumerate(t):
            if y in range(i, i + 5):  # a hit if it falls in the relevant window
                AP += 1 / (x + 1)
        print(t)
        print(AP)
        MAP += AP
    print('MAP is : ', MAP / ind1)
Example #2
Source Project: Mastering-Elasticsearch-7.0   Author: PacktPublishing   File: test_pairwise.py    License: MIT License
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2) 
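The identity this test checks can also be sketched standalone: after L2 normalization, the plain dot-product (linear) kernel coincides with the cosine kernel. A minimal sketch with illustrative data:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X, Y = rng.random_sample((5, 4)), rng.random_sample((3, 4))
print(np.allclose(cosine_similarity(X, Y),
                  linear_kernel(normalize(X), normalize(Y))))  # True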
Example #3
Source Project: driverlessai-recipes   Author: h2oai   File: text_embedding_similarity_transformers.py    License: Apache License 2.0
def transform(self, X: dt.Frame):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                          text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                output.append(-99)  # sentinel score for rows that fail to embed
        return np.array(output) 
Example #4
Source Project: FaceRecognition-RestApi   Author: Jinnrry   File: faceApi.py    License: MIT License
def compared(request):
    if request.method == 'POST':
        if len(request.FILES) != 2:
            return HttpResponse('{"status":false,"data":"","msg":"图片参数错误!"}')
        starttime = time.time()
        name1 = str(random.randint(10000, 99999)) + str(time.time())  # random file name
        name2 = str(random.randint(10000, 99999)) + str(time.time())

        handle_uploaded_file(request.FILES['face1'], str(name1))
        handle_uploaded_file(request.FILES['face2'], str(name2))

        tz1 = get_feature(root + "RestServer/upload/" + str(name1))

        tz2 = get_feature(root + "RestServer/upload/" + str(name2))

        comparedValue = pw.cosine_similarity(tz1, tz2)[0][0]

        os.remove(root + "RestServer/upload/" + str(name1))
        os.remove(root + "RestServer/upload/" + str(name2))
        endtime = time.time()
        Runtime = endtime - starttime
        return HttpResponse('{"status":true,"data":"' + str(comparedValue) + '","msg":"success","runtime": ' + str(Runtime) + '  }')
    else:
        return HttpResponse('{"status":false,"data":"","msg":"invalid request"}')
    return HttpResponse('{"status":false,"data":"","msg":"unknown error"}')  # unreachable fallback
Example #5
Source Project: altair   Author: Lab41   File: app.py    License: Apache License 2.0
def get_closest_docs(uri):
    r = requests.get(uri)
    if r.status_code == 200:
        user_doc = r.text
        print("URI content length",len(user_doc))
        code, _ = separate_code_and_comments(user_doc,"user doc")
        normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)
        model.random.seed(0)
        user_vector = model.infer_vector(normalized_code)
        print("finding similar...")
        sys.stdout.flush()
        stored_urls = list()
        stored_vectors = list()
        for url in vectors:
            stored_urls.append(url)
            stored_vectors.append(vectors[url])
        pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
        indices = (-pair_sims[0]).argsort()[:5]
        return [(stored_urls[index],round(float(pair_sims[0][index]),2)) for index in indices]
    else:
        print("URL returned status code", r.status_code)
        raise ValueError('URL error') 
Example #6
Source Project: HarvestText   Author: blmoistawinde   File: entity_discoverer.py    License: MIT License
def clustering(self, threshold):
        """分不同词性的聚类

        :return: partition: dict {word_id: cluster_id}
        """
        print("Louvain clustering")
        partition = {}
        part_offset = 0
        for etype, ners in self.type_entity_dict.items():
            sub_id_mapping = [self.word2id[ner0] for ner0 in ners if ner0 in self.word2id]
            if len(sub_id_mapping) == 0:
                continue
            emb_mat_sub = self.emb_mat[sub_id_mapping, :]
            cos_sims = cosine_similarity(emb_mat_sub)
            cos_sims -= np.eye(len(emb_mat_sub))
            adj_mat = (cos_sims > threshold).astype(int)
            G = nx.from_numpy_array(adj_mat)
            partition_sub = community.best_partition(G)
            for sub_id, main_id in enumerate(sub_id_mapping):
                sub_part_id = partition_sub[sub_id]
                partition[main_id] = sub_part_id + part_offset
            part_offset += max(partition_sub.values()) + 1
        return partition 
Example #7
Source Project: fnc-1   Author: Cisco-Talos   File: helpers.py    License: Apache License 2.0
def cosine_sim(x, y):
    try:
        # single vectors must be 2-D; reshaping also silences the sklearn warning
        if type(x) is np.ndarray: x = x.reshape(1, -1)
        if type(y) is np.ndarray: y = y.reshape(1, -1)
        d = cosine_similarity(x, y)
        d = d[0][0]
    except Exception:
        print(x)
        print(y)
        d = 0.
    return d

Example #8
Source Project: CIKM-AnalytiCup-2018   Author: zake7749   File: feature_engineering.py    License: Apache License 2.0
def _get_similarity_values(self, q1_csc, q2_csc):
        # cs/md/ed/jsc are presumably short aliases for cosine_similarity,
        # manhattan_distances, euclidean_distances and the Jaccard score
        # imported in the source file; minkowski_dis is a DistanceMetric object.
        cosine_sim = []
        manhattan_dis = []
        euclidean_dis = []
        jaccard_dis = []
        minkowsk_dis = []

        for i, j in zip(q1_csc, q2_csc):
            sim = cs(i, j)
            cosine_sim.append(sim[0][0])
            sim = md(i, j)
            manhattan_dis.append(sim[0][0])
            sim = ed(i, j)
            euclidean_dis.append(sim[0][0])
            i_ = i.toarray()
            j_ = j.toarray()
            try:
                sim = jsc(i_, j_)
                jaccard_dis.append(sim)
            except Exception:
                jaccard_dis.append(0)

            sim = minkowski_dis.pairwise(i_, j_)
            minkowsk_dis.append(sim[0][0])
        return cosine_sim, manhattan_dis, euclidean_dis, jaccard_dis, minkowsk_dis
Example #9
Source Project: Hands-on-Supervised-Machine-Learning-with-Python   Author: PacktPublishing   File: itemitem.py    License: MIT License
def _compute_sim(self, R, k):
        # compute the similarity between all the items. This calculates the
        # similarity between each ITEM
        sim = cosine_similarity(R.T)

        # Only keep the similarities of the top K, setting all others to zero
        # (negative since we want descending)
        not_top_k = np.argsort(-sim, axis=1)[:, k:]  # shape=(n_items, k)

        if not_top_k.shape[1]:  # only if there are cols (k < n_items)
            # now we have to set these to zero in the similarity matrix
            row_indices = np.repeat(range(not_top_k.shape[0]),
                                    not_top_k.shape[1])
            sim[row_indices, not_top_k.ravel()] = 0.

        return sim 
Example #10
Source Project: pyts   Author: johannfaouzi   File: saxvsm.py    License: BSD 3-Clause "New" or "Revised" License
def decision_function(self, X):
        """Evaluate the cosine similarity between document-term matrix and X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        X : array-like, shape (n_samples, n_classes)
            Cosine similarity between the document-term matrix and X.

        """
        check_is_fitted(self, ['vocabulary_', 'tfidf_', 'idf_',
                               '_tfidf', 'classes_'])
        X = check_array(X)
        X_bow = self._bow.transform(X)
        vectorizer = CountVectorizer(vocabulary=self._tfidf.vocabulary_)
        X_transformed = vectorizer.transform(X_bow).toarray()
        return cosine_similarity(X_transformed, self.tfidf_) 
Example #11
Source Project: region   Author: pysal   File: test_skater.py    License: BSD 3-Clause "New" or "Revised" License
def test_init():
    default = Spanning_Forest()
    assert default.metric == skm.manhattan_distances
    assert default.center == np.mean
    assert default.reduction == np.sum
    change = Spanning_Forest(dissimilarity=skm.euclidean_distances,
                             center=np.median, reduction=np.max)
    assert change.metric == skm.euclidean_distances
    assert change.center == np.median
    assert change.reduction == np.max
    
    sym = Spanning_Forest(affinity=skm.cosine_similarity)
    assert isinstance(sym.metric, types.LambdaType)
    test_distance = -np.log(skm.cosine_similarity(data[:2,]))
    comparator = sym.metric(data[:2,])
    np.testing.assert_allclose(test_distance, comparator) 
Example #12
Source Project: keras-glove   Author: erwtokritos   File: save_utils.py    License: MIT License
def save_model(model: Model, tokenizer: Tokenizer):
    """
    Saves the important parts of the model
    :param model: Keras model to save
    :param tokenizer: Keras Tokenizer to save
    """
    for layer in model.layers:
        if '_biases' in layer.name or '_embeddings' in layer.name:
            np.save(file=f'{OUTPUT_FOLDER}{layer.name}', arr=layer.get_weights()[0])

    # save tokenizer
    pickle.dump(obj=tokenizer.index_word, file=open(f'{OUTPUT_FOLDER}{INDEX2WORD}', 'wb'))
    pickle.dump(obj=tokenizer.word_index, file=open(f'{OUTPUT_FOLDER}{WORD2INDEX}', 'wb'))

    # save combined embeddings & correlation matrix (the nested call below
    # computes similarities between rows of the first similarity matrix)
    agg_embeddings = np.load(f'{OUTPUT_FOLDER}{CENTRAL_EMBEDDINGS}.npy') + \
                     np.load(f'{OUTPUT_FOLDER}{CONTEXT_EMBEDDINGS}.npy')

    np.save(file=f'{OUTPUT_FOLDER}{AGGREGATED_EMBEDDINGS}', arr=agg_embeddings)
    np.save(file=f'{OUTPUT_FOLDER}{CORRELATION_MATRIX}', arr=cosine_similarity(cosine_similarity(agg_embeddings))) 
Example #13
Source Project: twitter-stock-recommendation   Author: alvarobartt   File: test_pairwise.py    License: MIT License
def test_cosine_similarity():
    # Test the cosine_similarity.

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    for X_, Y_ in ((X, None), (X, Y),
                   (Xcsr, None), (Xcsr, Ycsr)):
        # Test that the cosine kernel is equal to a linear kernel when data
        # has been previously normalized by L2-norm.
        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
        X_ = normalize(X_)
        if Y_ is not None:
            Y_ = normalize(Y_)
        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
        assert_array_almost_equal(K1, K2) 
Example #14
Source Project: scattertext   Author: JasonKessler   File: CategoryProjectorEvaluator.py    License: Apache License 2.0
def evaluate(self, category_projection):
        assert issubclass(type(category_projection), CategoryProjectionBase)
        topics = category_projection.get_nearest_terms()
        total_similarity = 0
        for topic in topics.values():
            topic_vectors = np.array([self.get_vector(term) for term in topic])
            sim_matrix = cosine_similarity(topic_vectors)
            tril_sim_matrix = np.tril(sim_matrix)
            mean_similarity = tril_sim_matrix.sum()/(tril_sim_matrix.shape[0] ** 2 - tril_sim_matrix.shape[0]) / 2
            total_similarity += mean_similarity
        return total_similarity/len(topics) 
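For reference, the mean pairwise cosine similarity over distinct vector pairs (diagonal excluded) can be computed directly; a minimal sketch with illustrative data:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

vecs = np.random.RandomState(0).random_sample((4, 8))
sim = cosine_similarity(vecs)
n = sim.shape[0]
# the strictly-lower triangle holds each distinct pair once: n*(n-1)/2 entries
mean_pair_sim = np.tril(sim, k=-1).sum() / (n * (n - 1) / 2)
print(mean_pair_sim)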
Example #15
Source Project: fever-naacl-2018   Author: sheffieldnlp   File: fever_features.py    License: Apache License 2.0
def process(self,data):
        claim_bow = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_bow = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_bow)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        cosines = np.array([cosine_similarity(c, b)[0] for c,b in zip(claim_tfidf,body_tfidf)])

        return hstack([body_tfs,claim_tfs,cosines]) 
Example #16
Source Project: fever-naacl-2018   Author: sheffieldnlp   File: process_tfidf_grid.py    License: Apache License 2.0
def process(self, data):
        claim_bow = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_bow = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_bow)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])

        return cosines 
Example #17
Source Project: fever-naacl-2018   Author: sheffieldnlp   File: process_tfidf.py    License: Apache License 2.0
def process(self, data):
        claim_bow = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_bow = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_bow)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])

        return cosines 
Example #18
Source Project: Clothing-Detection   Author: simaiden   File: utils.py    License: GNU General Public License v3.0
def closest_distances(query_vector, all_feat_vecs, norm='euclidian', num=3):
    if norm == 'euclidian':
        dist = np.linalg.norm(query_vector - all_feat_vecs, axis=1)
    elif norm == 'cosine':
        dist = 1 - cosine_similarity(query_vector.reshape(1, -1), all_feat_vecs)[0]
    idxs = np.arange(0, dist.shape[0])
    return idxs[dist.argsort()][:num]
Example #19
Source Project: SOQAL   Author: husseinmozannar   File: embedding_match.py    License: MIT License
def read(self, P, Q):
        A = self.get_answer_canditates(P)
        A_embed = []
        for a in A:
            A_embed.append(self.embedder.embed(a))
        Q_embed = self.embedder.embed(Q)
        similarities_raw = cosine_similarity(A_embed, Q_embed.reshape(1, -1))
        similarities = [s[0] for s in similarities_raw]
        indices_sorted = np.argsort(similarities)[::-1]  # reverse order
        return A[indices_sorted[0]] 
Example #20
Source Project: SOQAL   Author: husseinmozannar   File: tfidf_reader.py    License: MIT License
def read(self, P, Q):
        Q = self.stem_string(Q)
        query_tfidf = self.vectorizer.transform([Q])
        similarities_raw = cosine_similarity(self.tfidf_matrix, query_tfidf)
        similarities = []
        for s in similarities_raw:
            similarities.append(s[0])
        max_index = np.argmax(similarities)
        return self.docs[max_index] 
Example #21
Source Project: SOQAL   Author: husseinmozannar   File: EmbeddingRetriever.py    License: MIT License
def get_topk_docs(self, query):
        """
        :param query: a string
        :return: top documents according to cosine similarity of embeddings
        """
        emb_query = self.embed_string(query)
        similarities_raw = cosine_similarity(self.emb_matrix, emb_query.reshape(1,-1))
        similarities = [s[0] for s in similarities_raw]
        indices_sorted = np.argsort(similarities)[::-1]  # reverse order
        topk_docs = []
        for i in range(0, self.k):
            topk_docs.append(self.docs[indices_sorted[i]])
        return topk_docs 
Example #22
Source Project: kaggle-HomeDepot   Author: ChenglongChen   File: dist_utils.py    License: MIT License
def _cosine_sim(vec1, vec2):
    try:
        # single vectors must be reshaped into 2-D, one-row arrays
        s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    except Exception:
        try:
            # fall back to passing the inputs through unchanged (e.g. matrices)
            s = cosine_similarity(vec1, vec2)[0][0]
        except Exception:
            s = config.MISSING_VALUE_NUMERIC
    return s
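The reshape(1, -1) in the first branch is needed because cosine_similarity treats its inputs as 2-D sample matrices; a minimal illustration with made-up vectors:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([3.0, 2.0, 1.0])

# 1-D vectors must be passed as single-row 2-D arrays;
# the result is a (1, 1) matrix, unwrapped with [0][0]
print(cosine_similarity(v1.reshape(1, -1), v2.reshape(1, -1))[0][0])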
Example #23
Source Project: tokenquery   Author: ramtinms   File: vector_opr.py    License: GNU General Public License v3.0
def vec_cos_sim(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print('vector lengths do not match')
                return False
            # compute the similarity once: wrapping the single vectors in lists
            # makes them 2-D inputs, and [0][0] unwraps the 1x1 result matrix
            sim = cosine_similarity([token_vector], [ref_vector])[0][0]
            if operation_string == "=" or operation_string == "==":
                return sim == cond_value
            elif operation_string == "<":
                return sim < cond_value
            elif operation_string == ">":
                return sim > cond_value
            elif operation_string == ">=":
                return sim >= cond_value
            elif operation_string == "<=":
                return sim <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return sim != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False

    else:
        # TODO raise tokenregex error
        print('Problem with the operation input')
Example #24
Source Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: Similarity.py    License: MIT License
def cosineSimilarity(self):
        vec = TfidfVectorizer()
        matrix = vec.fit_transform(self.statements)
        for j in range(1, 5):
            i = j - 1  # matrix[i:j] selects the single row for document i
            print("\tsimilarity of document {} with others".format(i))
            similarity = cosine_similarity(matrix[i:j], matrix)
            print(similarity) 
Example #25
Source Project: SDLib   Author: Coder-Yu   File: qmath.py    License: GNU General Public License v3.0
def cosine(x1, x2):
    # find common ratings
    new_x1, new_x2 = common(x1, x2)
    # compute the cosine similarity between the two vectors
    total = new_x1.dot(new_x2)
    denom = sqrt(new_x1.dot(new_x1) * new_x2.dot(new_x2))
    try:
        return float(total) / denom
    except ZeroDivisionError:
        return 0

    # alternative via sklearn (on the raw vectors): cosine_similarity(x1, x2)[0][0]
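The hand-rolled dot/norm formula above should agree with cosine_similarity on the same dense vectors; a quick sanity check with illustrative data:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

x1 = np.array([1.0, 2.0, 0.0, 4.0])
x2 = np.array([2.0, 1.0, 3.0, 0.0])

manual = x1.dot(x2) / np.sqrt(x1.dot(x1) * x2.dot(x2))
sk = cosine_similarity(x1.reshape(1, -1), x2.reshape(1, -1))[0][0]
print(np.isclose(manual, sk))  # True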
Example #26
Source Project: VBDiarization   Author: Jamiroquai88   File: normalization.py    License: Apache License 2.0
def s_norm(self, test, enroll):
        """ Run speaker normalization (S-Norm) on cached embeddings.

        Args:
            test (np.array): test embedding
            enroll (np.array): enroll embedding

        Returns:
            float: hypothesis
        """
        if self.plda:
            a = self.plda.score(test, self.embeddings).T
            b = self.plda.score(enroll, self.embeddings).T
            c = self.plda.score(enroll, test).T
        else:
            a = cosine_similarity(test, self.embeddings).T
            b = cosine_similarity(enroll, self.embeddings).T
            c = cosine_similarity(enroll, test).T
        scores = []
        for ii in range(test.shape[0]):
            test_scores = []
            for jj in range(enroll.shape[0]):
                test_mean, test_std = np.mean(a.T[ii]), np.std(a.T[ii])
                enroll_mean, enroll_std = np.mean(b.T[jj]), np.std(b.T[jj])
                s = c[ii][jj]
                test_scores.append((((s - test_mean) / test_std + (s - enroll_mean) / enroll_std) / 2))
            scores.append(test_scores)
        return np.array(scores) 
Example #27
Source Project: chameleon_recsys   Author: gabrielspmoreira   File: content_based.py    License: MIT License
def predict(self, users_ids, sessions_items, topk=5, valid_items=None):         
        acr_embeddings = self.eval_benchmark_params['content_article_embeddings_matrix']

        recent_items_buffer = self.clicked_items_state.get_recent_clicks_buffer()
        if valid_items is None:
            recent_unique_item_ids = np.unique([recent_items_buffer[np.nonzero(recent_items_buffer)]])            
        else:
            recent_unique_item_ids = np.unique(valid_items)
            
        acr_embeddings_recent_items = acr_embeddings[recent_unique_item_ids]


        session_predictions = np.zeros(dtype=np.int64,
                                       shape=[sessions_items.shape[0],
                                              sessions_items.shape[1],
                                              topk])

        for row_idx, session_items in enumerate(sessions_items):    

            for col_idx, item in enumerate(session_items):
                if item != 0:

                    # Computing cosine similarity between this item and all recent items (from buffer).
                    # P.S. No need to ignore the current item (whose self-similarity is always 1.0), because this item
                    # will not be among the valid items (next click + negative samples not present in the session)
                    similarities = cosine_similarity(acr_embeddings[item].reshape(1, -1), 
                                                     acr_embeddings_recent_items)[0]
                    similar_items_sorted_idx = np.argsort(similarities, axis=0)[::-1]
                    similar_items_ids = recent_unique_item_ids[similar_items_sorted_idx]

                    session_predictions[row_idx, col_idx] = list(self._get_top_n_valid_items(similar_items_ids, topk, valid_items[row_idx, col_idx]))
                    

        return session_predictions 
Example #28
Source Project: embedding   Author: ratsgo   File: visualize_utils.py    License: MIT License
def visualize_between_sentences(sentences, vec_list, palette="Viridis256",
                                filename="/notebooks/embedding/between-sentences.png",
                                use_notebook=False):
    df_list, score_list = [], []
    for sent1_idx, sentence1 in enumerate(sentences):
        for sent2_idx, sentence2 in enumerate(sentences):
            vec1, vec2 = vec_list[sent1_idx], vec_list[sent2_idx]
            if np.any(vec1) and np.any(vec2):
                score = cosine_similarity(X=[vec1], Y=[vec2])
                df_list.append({'x': sentence1, 'y': sentence2, 'similarity': score[0][0]})
                score_list.append(score[0][0])
    df = pd.DataFrame(df_list)
    color_mapper = LinearColorMapper(palette=palette, low=np.max(score_list), high=np.min(score_list))
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(x_range=sentences, y_range=list(reversed(sentences)),
                x_axis_location="above", plot_width=900, plot_height=900,
                toolbar_location='below', tools=TOOLS,
                tooltips=[('sentences', '@x @y'), ('similarity', '@similarity')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3
    p.rect(x="x", y="y", width=1, height=1,
            source=df,
            fill_color={'field': 'similarity', 'transform': color_mapper},
            line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                        color_mapper=color_mapper, major_label_text_font_size="7pt",
                        label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    if use_notebook:
        output_notebook()
        show(p)
    else:
        export_png(p, filename)
        print("save @ " + filename) 
Example #29
Source Project: embedding   Author: ratsgo   File: visualize_utils.py    License: MIT License
def visualize_between_words(words, vecs, palette="Viridis256", filename="/notebooks/embedding/between-words.png",
                            use_notebook=False):
    df_list = []
    for word1_idx, word1 in enumerate(words):
        for word2_idx, word2 in enumerate(words):
            vec1 = vecs[word1_idx]
            vec2 = vecs[word2_idx]
            if np.any(vec1) and np.any(vec2):
                score = cosine_similarity(X=[vec1], Y=[vec2])
                df_list.append({'x': word1, 'y': word2, 'similarity': score[0][0]})
    df = pd.DataFrame(df_list)
    color_mapper = LinearColorMapper(palette=palette, low=1, high=0)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    p = figure(x_range=list(words), y_range=list(reversed(list(words))),
               x_axis_location="above", plot_width=900, plot_height=900,
               toolbar_location='below', tools=TOOLS,
               tooltips=[('words', '@x @y'), ('similarity', '@similarity')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = 3.14 / 3
    p.rect(x="x", y="y", width=1, height=1,
           source=df,
           fill_color={'field': 'similarity', 'transform': color_mapper},
           line_color=None)
    color_bar = ColorBar(ticker=BasicTicker(desired_num_ticks=5),
                         color_mapper=color_mapper, major_label_text_font_size="7pt",
                         label_standoff=6, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')
    if use_notebook:
        output_notebook()
        show(p)
    else:
        export_png(p, filename)
        print("save @ " + filename) 
Example #30
Source Project: nlp_research   Author: zhufz   File: test_match.py    License: MIT License
def __call__(self, text):
        if self.tfrecords_mode == 'point':
            assert text.find('||') != -1, "input should contain two sentences separated by ||"
            text_a = text.split('||')[0]
            text_b = text.split('||')[-1]
            pred,score = self._get_label([text_a], [text_b], need_preprocess = True)
            return pred[0][0], score[0][0]

        # load user-defined questions (user-defined entries take priority)
        if self.sim_mode == 'cross':
            text_list = self.text_list
            label_list = self.label_list
            if self.zdy != {}:
                text_list = self.zdy['text_list'] + text_list
                label_list = self.zdy['label_list'] + label_list
            pred, score = self._get_label([text], text_list, need_preprocess=True)  # use the merged list so custom entries are scored too
            selected_id = np.argmax(score)
            out_score = score[selected_id]
        elif self.sim_mode == 'represent':
            text_list = self.text_list
            vec_list = self.vec_list
            label_list = self.label_list
            if self.zdy != {}:
                text_list = self.zdy['text_list'] + text_list
                vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis = 0)
                label_list = self.zdy['label_list'] + label_list
            vec = self._get_vecs([text], need_preprocess = True)
            if self.is_distance:
                scores = euclidean_distances(vec, vec_list)[0]
                selected_id = np.argmin(scores)
                out_score = 1 - scores[selected_id]
            else:
                scores = cosine_similarity(vec, vec_list)[0]
                selected_id = np.argmax(scores)
                out_score = scores[selected_id]
        else:
            raise ValueError('unknown sim mode, represent or cross?')
        ret = (label_list[selected_id], out_score, selected_id, \
               text_list[selected_id])
        return ret