Python sklearn.metrics.pairwise.cosine_similarity() Examples

The following are 30 code examples of sklearn.metrics.pairwise.cosine_similarity(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.metrics.pairwise, or try the search function.
Example #1
Source File: test_pairwise.py    From twitter-stock-recommendation with MIT License 6 votes vote down vote up
def test_cosine_similarity():
    """Check the cosine kernel against a linear kernel on normalized data."""
    rng = np.random.RandomState(0)
    dense_x = rng.random_sample((5, 4))
    dense_y = rng.random_sample((3, 4))
    sparse_x = csr_matrix(dense_x)
    sparse_y = csr_matrix(dense_y)

    # Dense and sparse inputs, each with and without an explicit Y.
    cases = [
        (dense_x, None),
        (dense_x, dense_y),
        (sparse_x, None),
        (sparse_x, sparse_y),
    ]
    for left, right in cases:
        # The cosine kernel must equal a linear kernel once the data has
        # been normalized by its L2 norm.
        cosine_gram = pairwise_kernels(left, Y=right, metric="cosine")
        left_unit = normalize(left)
        right_unit = None if right is None else normalize(right)
        linear_gram = pairwise_kernels(left_unit, Y=right_unit,
                                       metric="linear")
        assert_array_almost_equal(cosine_gram, linear_gram)
Example #2
Source File: utility.py    From DeepLearn with MIT License 6 votes vote down vote up
def cos_sim(ind1,ind2=1999):
    """Print a retrieval score between the two views stored on disk.

    Loads ``test_v1.npy`` and ``test_v2.npy`` from the working directory,
    keeps the first ``ind1`` / ``ind2`` entries, and for every entry of the
    first view ranks all entries of the second view by cosine similarity.
    A hit is counted (weighted by ``1 / rank``) when one of the 7 best
    ranked indices falls in ``[i, i + 5)`` for query index ``i``.

    :param ind1: number of leading entries of view 1 to use as queries;
                 also the divisor of the final printed score
    :param ind2: number of leading entries of view 2 to rank
    :return: None; the per-query ranking, per-query score and the final
             score are printed
    """
    view1 = np.load("test_v1.npy")[0:ind1]
    view2 = np.load("test_v2.npy")[0:ind2]
    MAP = 0
    for i, query in enumerate(view1):
        # cosine_similarity requires 2-D input; atleast_2d promotes a 1-D
        # row to shape (1, n_features) and leaves 2-D entries untouched.
        query_2d = np.atleast_2d(query)
        scored = [
            (cosine_similarity(query_2d, np.atleast_2d(candidate))[0].tolist(), idx)
            for idx, candidate in enumerate(view2)
        ]
        # Highest similarity first; the index breaks ties (larger first).
        scored.sort(reverse=True)
        t = [idx for _, idx in scored[0:7]]
        AP = 0
        for rank, idx in enumerate(t):
            if i <= idx < i + 5:
                # Float literal keeps this a true division on Python 2 too.
                AP += 1.0 / (rank + 1)
        print(t)
        print(AP)
        MAP += AP
    print('MAP is : ',MAP/ind1)
Example #3
Source File: entity_discoverer.py    From HarvestText with MIT License 6 votes vote down vote up
def clustering(self, threshold):
        """Cluster entities separately for each entity type.

        For every entry of ``self.type_entity_dict`` the embeddings of the
        known words are compared pairwise; pairs whose cosine similarity
        exceeds ``threshold`` become edges of a graph that is partitioned
        with ``community.best_partition``.

        :param threshold: similarity above which two words are linked
        :return: partition: dict {word_id: cluster_id}
        """
        print("Louvain clustering")
        partition = {}
        part_offset = 0
        for etype, ners in self.type_entity_dict.items():
            member_ids = [self.word2id[name] for name in ners
                          if name in self.word2id]
            if not member_ids:
                continue
            sub_embeddings = self.emb_mat[member_ids, :]
            # Subtract the identity so a word is never linked to itself.
            similarity = cosine_similarity(sub_embeddings)
            similarity = similarity - np.eye(len(sub_embeddings))
            adjacency = (similarity > threshold).astype(int)
            graph = nx.from_numpy_array(adjacency)
            local_partition = community.best_partition(graph)
            # Shift local cluster ids so they stay unique across types.
            for local_id, word_id in enumerate(member_ids):
                partition[word_id] = local_partition[local_id] + part_offset
            part_offset += max(local_partition.values()) + 1
        return partition
Example #4
Source File: test_pairwise.py    From Mastering-Elasticsearch-7.0 with MIT License 6 votes vote down vote up
def test_cosine_similarity():
    # Test cosine_similarity: the cosine kernel has to agree with the linear
    # kernel when the rows were previously scaled to unit L2 norm.
    random_state = np.random.RandomState(0)
    X = random_state.random_sample((5, 4))
    Y = random_state.random_sample((3, 4))

    # First pass keeps the dense arrays, second pass uses CSR matrices.
    for as_format in (np.asarray, csr_matrix):
        first = as_format(X)
        for second in (None, as_format(Y)):
            expected = pairwise_kernels(first, Y=second, metric="cosine")
            first_normed = normalize(first)
            second_normed = second
            if second_normed is not None:
                second_normed = normalize(second_normed)
            actual = pairwise_kernels(first_normed, Y=second_normed,
                                      metric="linear")
            assert_array_almost_equal(expected, actual)
Example #5
Source File: text_embedding_similarity_transformers.py    From driverlessai-recipes with Apache License 2.0 6 votes vote down vote up
def transform(self, X: dt.Frame):
        """Score the similarity of the texts in the first two columns of X.

        Each row's two texts are lower-cased, embedded with a pooled flair
        document embedding and compared with cosine similarity.

        :param X: frame whose first and second columns hold the text pairs
        :return: numpy array with one score per row; ``-99`` marks rows for
                 which embedding or scoring failed
        """
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        # NOTE(review): for any other embedding_name, self.embedding keeps
        # whatever value it had before this call - confirm that is intended.
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                          text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                # Best effort per row: keep the sentinel for failed rows, but
                # let KeyboardInterrupt / SystemExit propagate.
                output.append(-99)
        return np.array(output)
Example #6
Source File: faceApi.py    From FaceRecognition-RestApi with MIT License 6 votes vote down vote up
def compared(request):
    """Compare two uploaded face images and return their cosine similarity.

    Expects a POST request with exactly two uploaded files named ``face1``
    and ``face2``. Both files are stored under a random temporary name,
    passed to ``get_feature`` and the two feature results are compared with
    cosine similarity.

    :param request: the incoming HTTP request
    :return: HttpResponse whose body is a JSON-formatted string with the
             keys ``status``, ``data``, ``msg`` (and ``runtime`` on success)
    """
    if request.method != 'POST':
        return HttpResponse('{"status":false,"data":"","msg":"请求不合法"}')

    files = request.FILES
    # Reject requests that lack either expected field instead of failing
    # with a key lookup error further down.
    if len(files) != 2 or 'face1' not in files or 'face2' not in files:
        return HttpResponse('{"status":false,"data":"","msg":"图片参数错误!"}')

    starttime = time.time()
    upload_dir = root + "RestServer/upload/"
    # Random temporary file names.
    name1 = str(random.randint(10000, 99999)) + str(time.time())
    name2 = str(random.randint(10000, 99999)) + str(time.time())
    try:
        # presumably handle_uploaded_file writes into upload_dir under the
        # given name - verify against its definition
        handle_uploaded_file(files['face1'], name1)
        handle_uploaded_file(files['face2'], name2)

        tz1 = get_feature(upload_dir + name1)
        tz2 = get_feature(upload_dir + name2)

        comparedValue = pw.cosine_similarity(tz1, tz2)[0][0]
    finally:
        # Always remove the temporary uploads, even when saving or feature
        # extraction raised part-way through.
        for name in (name1, name2):
            path = upload_dir + name
            if os.path.exists(path):
                os.remove(path)
    Runtime = time.time() - starttime
    return HttpResponse('{"status":true,"data":"' + str(comparedValue) + '","msg":"成功","runtime": ' + str(Runtime) + '  }')
Example #7
Source File: app.py    From altair with Apache License 2.0 6 votes vote down vote up
def get_closest_docs(uri):
    """Return the five stored documents most similar to the one at ``uri``.

    The document is fetched, reduced to its code part, normalized and turned
    into a vector with ``model.infer_vector``; that vector is compared with
    every entry of ``vectors`` using cosine similarity.

    :param uri: address of the document to fetch
    :return: list of ``(url, similarity rounded to 2 decimals)`` tuples,
             most similar first
    :raises ValueError: if the request does not answer with status 200
    """
    r = requests.get(uri)
    if r.status_code != 200:
        print("URL returned status code", r.status_code)
        raise ValueError('URL error')

    user_doc = r.text
    print("URI content length",len(user_doc))
    code, _ = separate_code_and_comments(user_doc,"user doc")
    normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)
    # Re-seed before every inference so the inferred vector is reproducible.
    model.random.seed(0)
    user_vector = model.infer_vector(normalized_code)
    print("finding similar...")
    sys.stdout.flush()

    # Keep urls and their vectors in two parallel, index-aligned lists.
    stored_urls = list(vectors)
    stored_vectors = [vectors[url] for url in stored_urls]
    similarities = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)[0]
    # Negate so argsort yields the largest similarities first.
    best = (-similarities).argsort()[:5]
    return [(stored_urls[index], round(float(similarities[index]), 2)) for index in best]
Example #8
Source File: helpers.py    From fnc-1 with Apache License 2.0 6 votes vote down vote up
def cosine_sim(x, y):
    """Return the cosine similarity of two vectors, or 0. on failure.

    NumPy arrays are reshaped to a single row before the comparison, which
    is the 2-D layout ``cosine_similarity`` expects.

    :param x: first vector
    :param y: second vector
    :return: the scalar similarity; ``0.`` (after printing both inputs) when
             the computation raises
    """
    try:
        if isinstance(x, np.ndarray):
            x = x.reshape(1, -1)  # get rid of the warning
        if isinstance(y, np.ndarray):
            y = y.reshape(1, -1)
        d = cosine_similarity(x, y)[0][0]
    except Exception:
        # Best effort: show the offending inputs and fall back to 0.
        # print() with one argument behaves the same on Python 2 and 3.
        print(x)
        print(y)
        d = 0.
    return d

 #   Copyright 2017 Cisco Systems, Inc.
 #  
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use this file except in compliance with the License.
 #   You may obtain a copy of the License at
 #  
 #     http://www.apache.org/licenses/LICENSE-2.0
 #  
 #   Unless required by applicable law or agreed to in writing, software
 #   distributed under the License is distributed on an "AS IS" BASIS,
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License. 
Example #9
Source File: feature_engineering.py    From CIKM-AnalytiCup-2018 with Apache License 2.0 6 votes vote down vote up
def _get_similarity_values(self, q1_csc, q2_csc):
        """Compute five pairwise measures for aligned rows of two matrices.

        The helpers ``cs``, ``md``, ``ed``, ``jsc`` and ``minkowski_dis`` are
        defined elsewhere; presumably cosine similarity, Manhattan distance,
        Euclidean distance, a Jaccard score and a Minkowski metric - verify
        against the module imports.

        :param q1_csc: iterable of rows that support ``toarray()``
        :param q2_csc: iterable of rows aligned with ``q1_csc``
        :return: tuple of five lists (cosine, manhattan, euclidean, jaccard,
                 minkowski), one value per row pair
        """
        cosine_sim = []
        manhattan_dis = []
        eucledian_dis = []
        jaccard_dis = []
        minkowsk_dis = []

        for i, j in zip(q1_csc, q2_csc):
            cosine_sim.append(cs(i, j)[0][0])
            manhattan_dis.append(md(i, j)[0][0])
            eucledian_dis.append(ed(i, j)[0][0])
            # The remaining two measures work on the dense form of the rows.
            i_ = i.toarray()
            j_ = j.toarray()
            try:
                jaccard_dis.append(jsc(i_, j_))
            except Exception:
                # Best effort: record 0 when the score cannot be computed,
                # without swallowing KeyboardInterrupt / SystemExit.
                jaccard_dis.append(0)

            minkowsk_dis.append(minkowski_dis.pairwise(i_, j_)[0][0])
        return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
Example #10
Source File: itemitem.py    From Hands-on-Supervised-Machine-Learning-with-Python with MIT License 6 votes vote down vote up
def _compute_sim(self, R, k):
        """Return the item-item cosine similarity, keeping the top ``k`` per row.

        :param R: matrix whose columns are the items being compared
        :param k: number of most-similar entries to keep in every row
        :return: square similarity matrix with all other entries set to zero
        """
        # Similarity between the columns (items) of R.
        sim = cosine_similarity(R.T)

        # Sort every row in descending order (hence the negation) and take
        # the column indices from rank k onward: these are the ones to drop.
        ranked = np.argsort(-sim, axis=1)
        not_top_k = ranked[:, k:]
        n_rows, n_dropped = not_top_k.shape

        if n_dropped:  # nothing to zero out when no columns remain
            # Broadcasting a column of row numbers pairs each row with its
            # own dropped column indices.
            sim[np.arange(n_rows)[:, None], not_top_k] = 0.

        return sim
Example #11
Source File: saxvsm.py    From pyts with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def decision_function(self, X):
        """Evaluate the cosine similarity between document-term matrix and X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        X : array-like, shape (n_samples, n_classes)
            Cosine similarity between the document-term matrix and X.

        """
        fitted_attributes = ['vocabulary_', 'tfidf_', 'idf_',
                             '_tfidf', 'classes_']
        check_is_fitted(self, fitted_attributes)
        bags_of_words = self._bow.transform(check_array(X))
        # Count terms using the vocabulary held by the fitted tf-idf object.
        counter = CountVectorizer(vocabulary=self._tfidf.vocabulary_)
        term_counts = counter.transform(bags_of_words).toarray()
        return cosine_similarity(term_counts, self.tfidf_)
Example #12
Source File: test_skater.py    From region with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_init():
    """Check how Spanning_Forest stores its metric, center and reduction."""
    # Defaults.
    forest = Spanning_Forest()
    assert forest.metric == skm.manhattan_distances
    assert forest.center == np.mean
    assert forest.reduction == np.sum

    # Explicit overrides are stored as given.
    custom = Spanning_Forest(
        dissimilarity=skm.euclidean_distances,
        center=np.median,
        reduction=np.max,
    )
    assert custom.metric == skm.euclidean_distances
    assert custom.center == np.median
    assert custom.reduction == np.max

    # An affinity is wrapped in a lambda that must equal -log(affinity).
    from_affinity = Spanning_Forest(affinity=skm.cosine_similarity)
    assert isinstance(from_affinity.metric, types.LambdaType)
    expected = -np.log(skm.cosine_similarity(data[:2,]))
    actual = from_affinity.metric(data[:2,])
    np.testing.assert_allclose(expected, actual)
Example #13
Source File: save_utils.py    From keras-glove with MIT License 6 votes vote down vote up
def save_model(model: Model, tokenizer: Tokenizer):
    """
    Saves the important parts of the model

    Writes the first weight array of every bias / embedding layer, both
    tokenizer lookup tables, the summed central + context embeddings and a
    similarity matrix derived from them, all under ``OUTPUT_FOLDER``.

    :param model: Keras model to save
    :param tokenizer: Keras Tokenizer to save
    """
    for layer in model.layers:
        if '_biases' in layer.name or '_embeddings' in layer.name:
            np.save(file=f'{OUTPUT_FOLDER}{layer.name}', arr=layer.get_weights()[0])

    # save tokenizer; the context managers make sure the files are flushed
    # and closed even if pickling fails
    with open(f'{OUTPUT_FOLDER}{INDEX2WORD}', 'wb') as index2word_file:
        pickle.dump(obj=tokenizer.index_word, file=index2word_file)
    with open(f'{OUTPUT_FOLDER}{WORD2INDEX}', 'wb') as word2index_file:
        pickle.dump(obj=tokenizer.word_index, file=word2index_file)

    # save combined embeddings & correlation matrix; the two inputs are read
    # back from the .npy files written by the layer loop above
    agg_embeddings = np.load(f'{OUTPUT_FOLDER}{CENTRAL_EMBEDDINGS}.npy') + \
                     np.load(f'{OUTPUT_FOLDER}{CONTEXT_EMBEDDINGS}.npy')

    np.save(file=f'{OUTPUT_FOLDER}{AGGREGATED_EMBEDDINGS}', arr=agg_embeddings)
    # NOTE(review): cosine_similarity is applied twice (similarity of the
    # similarity rows) - confirm this is intended and not a duplicated call.
    np.save(file=f'{OUTPUT_FOLDER}{CORRELATION_MATRIX}', arr=cosine_similarity(cosine_similarity(agg_embeddings)))
Example #14
Source File: test_pairwise.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_kernel_symmetry():
    """Every valid kernel must produce a symmetric matrix for K(X, X)."""
    samples = np.random.RandomState(0).random_sample((5, 4))
    kernels = [
        linear_kernel,
        polynomial_kernel,
        rbf_kernel,
        laplacian_kernel,
        sigmoid_kernel,
        cosine_similarity,
    ]
    for kernel in kernels:
        gram = kernel(samples, samples)
        assert_array_almost_equal(gram, gram.T, 15)
Example #15
Source File: face_recognition.py    From FindFaceInVideo with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def compare_pic(feature1, feature2):
    """Return the pairwise cosine similarity of the two feature inputs."""
    return pw.cosine_similarity(feature1, feature2)
Example #16
Source File: test_bert_sentence_encoding.py    From nlp-recipes with MIT License 5 votes vote down vote up
def test_sentence_encoding(tmp, data):
    """Encode ``data`` on CPU and sanity-check the pairwise similarities."""
    encoder_options = dict(
        language=Language.ENGLISH,
        num_gpus=0,
        to_lower=True,
        max_len=128,
        layer_index=-2,
        pooling_strategy=PoolingStrategy.MEAN,
        cache_dir=tmp,
    )
    se = BERTSentenceEncoder(**encoder_options)

    encoded = se.encode(data, as_numpy=False)
    vectors = encoded["values"].values.tolist()
    similarity = cosine_similarity(vectors)
    # Self-similarity beats cross-similarity, and sentence 0 is expected to
    # be closer to sentence 1 than to sentence 2.
    assert similarity[0, 0] > similarity[1, 0]
    assert similarity[0, 1] > similarity[0, 2]
Example #17
Source File: test_pairwise.py    From twitter-stock-recommendation with MIT License 5 votes vote down vote up
def test_kernel_sparse():
    """Kernels must give the same result for dense and CSR input."""
    dense = np.random.RandomState(0).random_sample((5, 4))
    sparse = csr_matrix(dense)
    kernels = (linear_kernel, polynomial_kernel, rbf_kernel,
               laplacian_kernel, sigmoid_kernel, cosine_similarity)
    for kernel in kernels:
        from_dense = kernel(dense, dense)
        from_sparse = kernel(sparse, sparse)
        assert_array_almost_equal(from_dense, from_sparse)
Example #18
Source File: inltk.py    From inltk with MIT License 5 votes vote down vote up
def get_similar_sentences(sen: str, no_of_variations: int, language_code: str, degree_of_aug: float = 0.1):
    """Generate variations of ``sen`` by swapping tokens for similar ones.

    For each token the closest vocabulary entries (by cosine similarity of
    the embeddings) are collected. Candidate sentences are built by replacing
    a random subset of positions, then ranked by ``get_sentence_similarity``
    against the original sentence.

    :param sen: sentence to vary
    :param no_of_variations: maximum number of sentences to return
    :param language_code: language of the tokenizer and model to load
    :param degree_of_aug: fraction of token positions replaced per candidate
                          (at least one position is always replaced)
    :return: up to ``no_of_variations`` sentences, most similar first
    """
    check_input_language(language_code)
    # Token ids and embedding vectors of the input sentence.
    tok = LanguageTokenizer(language_code)
    token_ids = tok.numericalize(sen)
    embedding_vectors = get_embedding_vectors(sen, language_code)

    # Load the learner on CPU and read the encoder's embedding matrix.
    defaults.device = torch.device('cpu')
    model_path = Path(__file__).parent / 'models' / f'{language_code}'
    learn = load_learner(model_path)
    encoder = get_model(learn.model)[0]
    encoder.reset()
    vocab_embeddings = np.array(encoder.state_dict()['encoder.weight'])

    # Per token: the no_of_variations + 1 most similar vocabulary ids.
    scores = cosine_similarity(embedding_vectors, vocab_embeddings)
    n_candidates = no_of_variations + 1
    nearest_ids = [
        np.argpartition(-np.array(score), n_candidates)[:n_candidates].tolist()
        for score in scores
    ]
    # A token must not be "replaced" by itself.
    word_ids = [
        [wid for wid in candidates if wid != token_ids[position]]
        for position, candidates in enumerate(nearest_ids)
    ]

    # Generate more candidates than required so the best can be kept.
    buffer_multiplicity = 2
    n_tokens = len(token_ids)
    new_sen_tokens = []
    for i in range(no_of_variations):
        for k in range(buffer_multiplicity):
            n_swaps = max(1, int(degree_of_aug * n_tokens))
            swap_positions = set(random.sample(range(n_tokens), n_swaps))
            candidate = []
            for j in range(n_tokens):
                if j in swap_positions:
                    replacements = word_ids[j]
                    candidate.append(replacements[(i + k) % len(replacements)])
                else:
                    candidate.append(token_ids[j])
            new_sen_tokens.append([int(token) for token in candidate])

    # Drop candidates that decode back to the original sentence.
    decoded = [tok.textify(sen_tokens) for sen_tokens in new_sen_tokens]
    distinct = [text for text in decoded if text != sen]
    scored = [(text, get_sentence_similarity(sen, text, language_code)) for text in distinct]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [text for text, _ in ranked][:no_of_variations]
Example #19
Source File: bossvs.py    From pyts with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def decision_function(self, X):
        """Evaluate the cosine similarity between document-term matrix and X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        X : array, shape (n_samples, n_classes)
            Cosine similarity between the document-term matrix and X.

        """
        check_is_fitted(self, ['vocabulary_', 'tfidf_', 'idf_', '_tfidf'])
        X = check_array(X, dtype='float64')
        n_samples, n_timestamps = X.shape

        # Cut every series into windows and stack all windows row-wise.
        windows = _windowed_view(
            X, n_samples, n_timestamps, self._window_size, self._window_step
        ).reshape(-1, self._window_size)

        # One word per window, regrouped by originating sample.
        symbols = self._sfa.transform(windows)
        words = np.asarray([''.join(symbols[i])
                            for i in range(symbols.shape[0])])
        words = words.reshape(n_samples, self._n_windows)

        if self.numerosity_reduction:
            # Keep a word only if it differs from its successor; the last
            # word of every sample is always kept.
            keep = np.c_[words[:, 1:] != words[:, :-1],
                         np.full(n_samples, True)]
            documents = [' '.join(row[mask]) for row, mask in zip(words, keep)]
        else:
            documents = [' '.join(row) for row in words]
        X_bow = np.asarray(documents)

        X_tf = self._tfidf.transform(X_bow).toarray()
        if self.idf_ is not None:
            X_tf /= self.idf_
        return cosine_similarity(X_tf, self.tfidf_)
Example #20
Source File: qmath.py    From RecQ with GNU General Public License v3.0 5 votes vote down vote up
def cosine(x1,x2):
    """Return the cosine similarity between two vectors.

    :param x1: first vector; must provide ``dot``
    :param x2: second vector; must provide ``dot``
    :return: ``x1 . x2 / sqrt((x1 . x1) * (x2 . x2))``, or ``0`` when that
             denominator is zero (at least one vector has zero norm)
    """
    dot_product = x1.dot(x2)
    denom = sqrt(x1.dot(x1)*x2.dot(x2))
    # Test for a zero norm explicitly: dividing by a NumPy scalar zero gives
    # nan/inf with a warning instead of raising ZeroDivisionError.
    if denom == 0:
        return 0
    return float(dot_product)/denom
Example #21
Source File: fever_features.py    From fever-naacl-2018 with Apache License 2.0 5 votes vote down vote up
def process(self,data):
        """Build the feature matrix for the claim/body pairs in ``data``.

        :param data: input passed to ``self.claims`` and ``self.texts``
        :return: horizontal stack of the body term frequencies, the claim
                 term frequencies and the per-pair tf-idf cosine similarity
        """
        claim_term_freqs = self.tfreq_vectorizer.transform(
            self.bow_vectorizer.transform(self.claims(data)))
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_term_freqs = self.tfreq_vectorizer.transform(
            self.bow_vectorizer.transform(body_texts))
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        # One similarity row per aligned (claim, body) pair.
        pair_similarities = []
        for claim_row, body_row in zip(claim_tfidf, body_tfidf):
            pair_similarities.append(cosine_similarity(claim_row, body_row)[0])
        cosines = np.array(pair_similarities)

        return hstack([body_term_freqs, claim_term_freqs, cosines])
Example #22
Source File: process_tfidf_grid.py    From fever-naacl-2018 with Apache License 2.0 5 votes vote down vote up
def process(self, data):
        """Return the tf-idf cosine similarity of every claim/body pair.

        :param data: input passed to ``self.claims`` and ``self.texts``
        :return: numpy array with one similarity row per pair
        """
        claims_for_counts = self.claims(data)
        claim_counts = self.bow_vectorizer.transform(claims_for_counts)
        # NOTE(review): the term-frequency results below are computed but
        # never used in this method.
        claim_tfs = self.tfreq_vectorizer.transform(claim_counts)
        claim_vectors = self.tfidf_vectorizer.transform(self.claims(data))

        bodies = self.texts(data)
        body_counts = self.bow_vectorizer.transform(bodies)
        body_tfs = self.tfreq_vectorizer.transform(body_counts)
        body_vectors = self.tfidf_vectorizer.transform(bodies)

        pairs = zip(claim_vectors, body_vectors)
        return np.array([cosine_similarity(claim, body)[0]
                         for claim, body in pairs])
Example #23
Source File: feature_engineering.py    From CIKM-AnalytiCup-2018 with Apache License 2.0 5 votes vote down vote up
def _create_weighted_distance_features(self, df):
        """Add the row-wise tf-idf similarity of ``spn_1`` and ``spn_2`` to ``df``.

        The result is written to the new column ``weighted_cosine_sim``;
        nothing is returned. ``cs`` is defined elsewhere (presumably cosine
        similarity - verify against the module imports).

        :param df: frame with the text columns ``spn_1`` and ``spn_2``
        """
        first = self.tfidf_vectorizer.transform(df['spn_1'].values.tolist())
        second = self.tfidf_vectorizer.transform(df['spn_2'].values.tolist())
        per_row = []
        for row in range(first.shape[0]):
            # Only matching rows are compared, one pair at a time.
            per_row.append(cs(first[row], second[row]).flatten())
        df['weighted_cosine_sim'] = np.concatenate(per_row)
Example #24
Source File: scorer.py    From entity2vec with Apache License 2.0 5 votes vote down vote up
def similarity_function(vec1,vec2, similarity):
    """Compute the requested similarity between two vectors.

    :param vec1: first vector (any sequence accepted by ``np.array``)
    :param vec2: second vector
    :param similarity: one of ``'cosine'``, ``'softmax'``,
                       ``'linear_kernel'`` or ``'euclidean'``
    :return: the scalar score; ``0`` when either vector is empty, in which
             case the module-level ``count`` is incremented
    :raises NameError: if ``similarity`` is not one of the supported names
    """
    v1 = np.array(vec1)
    v2 = np.array(vec2)

    if len(v1)*len(v2) == 0:  # at least one of the two vectors is empty
        global count
        count +=1
        return 0

    if similarity == 'cosine':
        return cosine_similarity([v1],[v2])[0][0]  # returns a double array [[sim]]

    if similarity == 'softmax':
        return np.exp(np.dot(v1,v2))  # normalization is useless for relative comparisons

    # The pairwise helpers need 2-D input (one row per sample), so the
    # vectors are wrapped exactly as in the cosine branch.
    if similarity == 'linear_kernel':
        return linear_kernel([v1],[v2])[0][0]

    if similarity == 'euclidean':
        return euclidean_distances([v1],[v2])[0][0]

    raise NameError('Choose a valid similarity function')
Example #25
Source File: spine_sample.py    From ikelos with MIT License 5 votes vote down vote up
def on_epoch_end(self, epoch, logs=None):
        """Print, for 10 randomly chosen spines, the most similar other spine.

        :param epoch: index of the epoch that just ended (unused)
        :param logs: metrics passed in by the caller (unused); defaults to
                     ``None`` rather than a shared mutable dict
        """
        indices = np.random.choice(len(self.spine_embedder), 10, False)
        comparisons = cosine_similarity(self.spine_embedder[indices], self.spine_embedder)
        # Each sampled row also appears among the columns, where it scores
        # the maximum against itself; mask that entry so the argmax reports
        # a different spine instead of echoing the query.
        comparisons[np.arange(len(indices)), indices] = -np.inf
        results = np.argmax(comparisons, axis=-1)
        spine_vocab = self.igor.vocabs.spines
        comp_spines = [spine_vocab.lookup(x) for x in results]
        in_spines = [spine_vocab.lookup(x) for x in indices]
        for spine_i, spine_j in zip(in_spines, comp_spines):
            print("SPINE: {}".format(self.decode(spine_i)))
            print("\t most similar to {}".format(self.decode(spine_j)))
Example #26
Source File: dist_utils.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def _cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors, tolerating odd inputs.

    The vectors are first reshaped to a single row each. If that fails they
    are passed through unchanged, and if that fails as well the configured
    missing-value marker is returned.

    :param vec1: first vector
    :param vec2: second vector
    :return: the scalar similarity, or ``config.MISSING_VALUE_NUMERIC``
    """
    try:
        s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    except Exception:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            s = cosine_similarity(vec1, vec2)[0][0]
        except Exception:
            s = config.MISSING_VALUE_NUMERIC
    return s
Example #27
Source File: tcga_benchmark.py    From perfect_match with MIT License 5 votes vote down vote up
def get_centroid_weights(self, x):
        """Return the cosine similarity of ``x`` to every centroid.

        Each entry of ``self.centroids`` supplies, at position 0, the indices
        used to select from ``x`` and, at position 1, the centroid vector.

        :param x: sample supporting indexing with the stored indices
        :return: ``np.squeeze`` of the per-centroid similarity results
        """
        # Build a real list: on Python 3 map() is lazy and np.squeeze would
        # wrap the iterator object instead of the similarity values.
        similarities = [
            cosine_similarity(x[indices].reshape(1, -1),
                              centroid.reshape(1, -1))
            for indices, centroid in ((entry[0], entry[1])
                                      for entry in self.centroids)
        ]
        return np.squeeze(similarities)
Example #28
Source File: twins_benchmark.py    From perfect_match with MIT License 5 votes vote down vote up
def get_centroid_weights(self, x):
        """Return the cosine similarity of ``x`` to every centroid.

        The entries of ``x`` from position 7 onward are converted to
        ``float32`` and passed through ``self.data_access.standardise_entry``
        before being compared with position 0 of each ``self.centroids``
        entry.

        :param x: raw sample
        :return: ``np.squeeze`` of the per-centroid similarity results
        """
        # Build a real list: on Python 3 map() is lazy and np.squeeze would
        # wrap the iterator object instead of the similarity values.
        similarities = [
            cosine_similarity(
                self.data_access.standardise_entry(
                    np.array(x[7:], dtype="float32")
                ).reshape((1, -1)),
                centroid.reshape((1, -1)))
            for centroid in (entry[0] for entry in self.centroids)
        ]
        return np.squeeze(similarities)
Example #29
Source File: feature_axis.py    From transparent_latent_gan with MIT License 5 votes vote down vote up
def plot_feature_cos_sim(feature_direction, feature_name=None):
    """
    Plot the pairwise cosine similarity of a set of vectors as a heat map.

    :param feature_direction: vectors, shape = (num_dimension, num_vector)
    :param feature_name:      list of names of features; defaults to the
                              vector indices
    :return:                  cosine similarity matrix, shape = (num_vector, num_vector)
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics.pairwise import cosine_similarity

    _, num_vector = feature_direction.shape
    labels = range(num_vector) if feature_name is None else feature_name

    # Vectors are stored column-wise, so compare the rows of the transpose.
    feature_cos_sim = cosine_similarity(feature_direction.T)

    # Symmetric colour limits keep zero at the centre of the colour map.
    c_lim_abs = np.max(np.abs(feature_cos_sim))
    cell_edges = np.arange(num_vector + 1)
    cell_centers = np.arange(num_vector) + 0.5

    plt.pcolormesh(cell_edges, cell_edges, feature_cos_sim,
                   vmin=-c_lim_abs, vmax=+c_lim_abs, cmap='coolwarm')
    plt.gca().invert_yaxis()
    plt.colorbar()
    plt.xticks(cell_centers, labels, fontsize='x-small', rotation='vertical')
    plt.yticks(cell_centers, labels, fontsize='x-small')
    plt.show()
    return feature_cos_sim
Example #30
Source File: test_saxvsm.py    From pyts with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_actual_results_strategy_uniform():
    """Test that the actual results are the expected ones."""
    # Data
    X = [[0, 0, 0, 1, 0, 0, 1, 1, 1],
         [0, 1, 1, 1, 0, 0, 1, 1, 1],
         [0, 0, 0, 1, 0, 0, 0, 1, 0]]
    y = [0, 0, 1]

    clf = SAXVSM(window_size=4, word_size=4, n_bins=2, strategy='uniform',
                 numerosity_reduction=False, sublinear_tf=False)
    decision_function_actual = clf.fit(X, y).decision_function(X)

    # Bags of words behind the expected values below:
    #   "aaab aaba abaa baab aabb abbb"
    #   "abbb bbba bbaa baab aabb abbb"
    #   "aaab aaba abaa baaa aaab aaba"
    expected_words = ['aaab', 'aaba', 'aabb', 'abaa', 'abbb', 'baaa',
                      'baab', 'bbaa', 'bbba']
    assert clf.vocabulary_ == dict(enumerate(expected_words))

    # Word counts per sample, then per class (samples 0 and 1 share class 0).
    freq = np.asarray([[1, 1, 1, 1, 1, 0, 1, 0, 0],
                       [0, 0, 1, 0, 2, 0, 1, 1, 1],
                       [2, 2, 0, 1, 0, 1, 0, 0, 0]])
    tf = np.asarray([[1, 1, 2, 1, 3, 0, 2, 1, 1],
                     [2, 2, 0, 1, 0, 1, 0, 0, 0]])
    rare = log(2) + 1
    idf = np.asarray([1, 1, rare, 1, rare, rare, rare, rare, rare])

    decision_function_desired = cosine_similarity(freq, tf * idf[None, :])
    np.testing.assert_allclose(decision_function_actual,
                               decision_function_desired, atol=1e-5, rtol=0.)

    # The predicted class is the one with the highest similarity.
    pred_actual = clf.predict(X)
    pred_desired = decision_function_desired.argmax(axis=1)
    np.testing.assert_array_equal(pred_actual, pred_desired)