Python sklearn.metrics.pairwise.cosine_similarity() Examples

The following code examples show how to use sklearn.metrics.pairwise.cosine_similarity(). They are taken from open source Python projects.
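Before the project examples, here is a minimal, self-contained usage sketch (the array values are purely illustrative): cosine_similarity() takes one or two 2-D arrays of shape (n_samples, n_features) and returns the matrix of pairwise cosine similarities.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Three toy feature vectors (illustrative values only).
X = np.array([[1.0, 0.0, 2.0, 0.0],
              [0.5, 1.0, 0.0, 0.0],
              [1.0, 0.0, 2.0, 0.1]])

# Similarity of every row of X with every other row: a (3, 3) matrix
# with ones on the diagonal.
print(cosine_similarity(X))

# Similarity of a single query vector against all rows of X; the query
# is reshaped to (1, n_features) because 2-D input is expected.
query = np.array([1.0, 0.0, 2.0, 0.0]).reshape(1, -1)
print(cosine_similarity(query, X))  # shape (1, 3)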

Example 1
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0    7 votes
def file_lookup(user_response):
    """Try to get response to question using nltk and sklearn from text in corpus file.
    Returns None if confidence is 0 else returns a tuple with 
    response text and confidence percent."""
    tobo_response=''
    sent_tokens.append(user_response)
    TfidfVec = TfidfVectorizer(tokenizer=lem_normalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx=vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]  # confidence percent
    sent_tokens.remove(user_response)
    if req_tfidf == 0:
        return None
    else:
        tobo_response = tobo_response+sent_tokens[idx]
        return tobo_response, req_tfidf

# standard greetings and responses 
Example 2
Project: WordNetEmbeddings   Author: nlx-group   File: vector_distance.py    MIT License    7 votes
def cosine_sim(v1,v2,mode):
    if mode == "auto":
        #return(1 - distance.cosine(v1,v2))
        return(cosine_similarity(v1.reshape(1, -1),v2.reshape(1, -1)))
    else:
        "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
        #synsDim = v2.split(" ")
        sumxx, sumxy, sumyy = 0, 0, 0
        j = 0
        for i in range(len(v1)):
            if v2[j] == "":
                j += 1
            y = float(v2[j])
            j += 1
            x = v1[i]
            sumxx += x*x
            sumyy += y*y
            sumxy += x*y

        if math.sqrt(sumxx*sumyy) == 0 :
            return (0.00000001)
        return (sumxy/math.sqrt(sumxx*sumyy)) 
Example 3
Project: CIZSL   Author: mhelhoseiny   File: train_CIZSL.py    MIT License    6 votes
def eval_fakefeat_test(it, netG, dataset, param, result):
    gen_feat = np.zeros([0, param.X_dim])
    for i in range(dataset.test_cls_num):
        text_feat = np.tile(dataset.test_text_feature[i].astype('float32'), (opt.nSample, 1))
        text_feat = Variable(torch.from_numpy(text_feat)).cuda()
        z = Variable(torch.randn(opt.nSample, param.z_dim)).cuda()
        G_sample = netG(z, text_feat)
        gen_feat = np.vstack((gen_feat, G_sample.data.cpu().numpy()))

    # cosine-similarity K-nearest-neighbor prediction
    sim = cosine_similarity(dataset.pfc_feat_data_test, gen_feat)
    idx_mat = np.argsort(-1 * sim, axis=1)
    label_mat = (idx_mat[:, 0:opt.Knn] / opt.nSample).astype(int)
    preds = np.zeros(label_mat.shape[0])
    for i in range(label_mat.shape[0]):
        (values, counts) = np.unique(label_mat[i], return_counts=True)
        preds[i] = values[np.argmax(counts)]

    # produce acc
    label_T = np.asarray(dataset.labels_test)
    acc = (preds == label_T).mean() * 100

    result.acc_list += [acc]
    return acc 
Example 4
Project: fuzzy-fs   Author: achyudh   File: feature_selection_using_cmeans.py    MIT License    6 votes
def selecttop(CF, k):
    """
        Finds cosine similarity between SC and Wi and returns index of top features
    """
    NCF = np.zeros((CF.shape[1],CF.shape[1]))
    for i in range(CF.shape[1]):
        for j in range(CF.shape[1]):
            if (CF[i,j]+CF[j,j]-CF[i,j]) !=0:
                NCF[i,j]=CF[i,j]/(CF[i,j]+CF[j,j]-CF[i,j])
            else:
                NCF[i,j]=0
            
    SC = np.zeros(CF.shape[1])
    for i in range(CF.shape[1]):
        SC[i] = np.sum(NCF[i,:])
    
    print(np.isnan(SC).any())
    print(np.isnan(CF).any())
    # SC is 1-D; reshape it to (1, n_features) because cosine_similarity expects 2-D input
    cosim = cosine_similarity(SC.reshape(1, -1), CF)
    return (-cosim).argsort()[0][:int(k*CF.shape[1])]

#Loading CF matrix for each cluster 
Example 5
Project: SCDS   Author: feliksh   File: dominantset.py    GNU General Public License v3.0    6 votes
def get_adj_matrix(self):
        if self.metric == 'euclidean':
            dist_mat = distance.pdist(self.feature_vectors, metric=self.metric)
            dist_mat = distance.squareform(dist_mat)
        else:  # cosine distance
            dist_mat = pw.cosine_similarity(self.feature_vectors)
            dist_mat = np.arccos(dist_mat)
            dist_mat[np.eye(dist_mat.shape[0]) > 0] = 0
            dist_mat /= np.pi

        # the following heuristic is derived from Perona 2005 (Self-tuning spectral clustering)
        # with adaptation from Zemene and Pelillo 2016 (Interactive image segmentation using
        # constrained dominant sets)
        sigmas = np.sort(dist_mat, axis=1)[:, 1:8]
        sigmas = np.mean(sigmas, axis=1)
        sigmas = np.dot(sigmas[:, np.newaxis], sigmas[np.newaxis, :])
        dist_mat /= -sigmas
        self.adj_matrix = np.exp(dist_mat)

        # zeros in main diagonal needed for dominant sets
        self.adj_matrix = self.adj_matrix * (1. - np.identity(self.adj_matrix.shape[0]))
        return self.adj_matrix 
Example 6
Project: poros   Author: diqiuzhuanzhuan   File: run_classifier.py    MIT License    6 votes
def main():
    model = SimpleClassifierModel(
        is_train=True,
        bert_config_file="./data/chinese_L-12_H-768_A-12/bert_config.json",
        vocab_file="./data/chinese_L-12_H-768_A-12/vocab.txt",
        output_dir="./output",
        max_seq_length=512,
        train_file="./data/train.csv",
        dev_file="./data/dev.csv",
        init_checkpoint="./data/chinese_L-12_H-768_A-12/bert_model.ckpt",
        label_list=["0", "1", "2", "3"],
        num_train_epochs=5,
        train_batch_size=8
    )
    model.train()
    res = model.predict([["1", "李勇"], ["2", "保险"]])
    print("prediction is {}".format(list(res)))
    res = list(res)
    from sklearn.metrics.pairwise import cosine_similarity
    print(cosine_similarity([res[0]["output_layer"], res[1]["output_layer"]]))
    model.eval()
    model.export_savedmodel("./export") 
Example 7
Project: DeepLearn   Author: GauravBh1010tt   File: utility.py    MIT License    6 votes
def cos_sim(ind1,ind2=1999):
    view1 = np.load("test_v1.npy")[0:ind1]
    view2 = np.load("test_v2.npy")[0:ind2]
    #val = []
    MAP=0
    for i,j in enumerate(view1):
        val=[]
        AP=0
        for x in view2:            
            val.append(cosine_similarity(j,x)[0].tolist())
        #val=val[0].tolist()
        #print val[0].tolist()
        val=[(q,p)for p,q in enumerate(val)]
        #print val
        val.sort()
        val.reverse()
        t = [w[1]for w in val[0:7]]
        for x,y in enumerate(t):
            if y in range(i,i+5):
                AP+=1/(x+1)
        print(t)
        print(AP)
        MAP+=AP
    print('MAP is : ',MAP/ind1) 
Example 8
Project: watlink   Author: dustalov   File: link.py    MIT License    6 votes
def emit(id):
    if not id in hctx:
        return (id, {})

    hvector, candidates = v.transform(hctx[id]), Counter()

    for hypernym in hctx[id]:
        hsenses = Counter({hid: sim(v.transform(Counter(synsets[hid])), hvector).item(0) for hid in index[hypernym]})

        for hid, cosine in hsenses.most_common(1):
            if cosine > 0:
                candidates[(hypernym, hid)] = cosine

    matches = [(hypernym, hid, cosine) for (hypernym, hid), cosine in
               candidates.most_common(len(candidates) if args.k == 0 else args.k) if hypernym not in synsets[id]]

    return (id, matches) 
Example 9
Project: define-semantic-annotation   Author: iarroyof   File: Categorizador.py    GNU General Public License v2.0    6 votes
def crearListaDefiniciones(archivoCsv):
    listaDefiniciones = []
    # Open the file for reading.
    with open(archivoCsv, 'rb') as csvfile:
        # Create a reader object for the file.
        lecturaDefiniciones = csv.DictReader(csvfile)
        # Iterate over each line of the file to extract the information.
        for row in lecturaDefiniciones:
            # Store the information in the list.
            listaDefiniciones.append(row['definicion'].lower())
    return listaDefiniciones
    """Function name: generarRelacion
    Description: Compares the definition vectors with the Wikipedia sense vectors using cosine_similarity and builds the
                 relation list.
    Version: 1.0
    Author: Ramón Pantoja Velasco
    Parameters: vectorDefiniciones.- Vector obtained from listaDefiniciones after transforming it, with or without TF-IDF; vectorWiki.- Vector obtained from
                the list of Wikipedia senses, which must be produced in the same way as listaDefiniciones; minimaSimilitud.- Value between 0 and 1 that sets
                the minimum similarity for the definition -> Wikipedia sense relation.
    Return: listaRelacion.- List with the definition -> sense relation

    """
Example 10
Project: ijcai2019-relis   Author: UKPLab   File: state_type.py    MIT License    6 votes
def getSimilarity(self, tokens1, sentences2, fullDoc=False):
        # tokens1 is a string of tokens
        # sentences2 is a list of sentences, and each sentences is a list of tokens
        token_list1 = tokens1.split(' ')
        token_str1 = tokens1
        if fullDoc:
            token_str2  = ' '.join(sentences2)
            token_list2 = token_str2.split(' ')
        else:
            token_list2 = sentences2.split(' ')
            token_str2 = sentences2
        tfidf_vectorizer = TfidfVectorizer(min_df=0)
        #print('token_list 1: '+' '.join(token_list1))
        #print('token_list 2: '+' '.join(token_list2))
        if self.noCommonTokens(token_list1,token_list2):
            return 0
        if token_str2 == token_str1:
            return 1
        try:
            tfidf_matrix_train = tfidf_vectorizer.fit_transform([token_str1, token_str2])
        except ValueError:
            return 0
        return cosine_similarity(tfidf_matrix_train[0], tfidf_matrix_train[1])[0][0] 
Example 11
Project: Realtime-Recommender   Author: ggeop   File: model_feeder.py    GNU General Public License v3.0    6 votes
def run(self):
        sql_connection = SqlConnector()
        input_df = sql_connection.selecting_query()
        dataframe_transformer = DataframeTransformer(DATABASE['target_column'])
        input_text = dataframe_transformer.transform(input_df)
        train_tdm = self.model_manager.create_model(input_text)
        logging.info('WAITING FOR USER INPUT..')
        documents = input('INSERT TO THE MODEL: ')
        results_calculator = ResultsCalculator(dataset=input_df,
                                               result_column=DATABASE['target_column'],
                                               vectorizer=self.model_manager.vectorizer,
                                               similarity_measure=cosine_similarity,
                                               number_of_recommendations=3)

        result, score = results_calculator.query(train_tdm, [documents])
        print(result) 
Example 12
Project: watasense   Author: nlpub   File: wsd.py    MIT License    6 votes
def disambiguate_word(self, sentence, index):
        super().disambiguate_word(sentence, index)

        lemmas = self.lemmatize(sentence)

        if index not in lemmas:
            return

        svector = self.sensegram(lemmas.values())  # sentence vector

        if svector is None:
            return

        # map synset identifiers to the cosine similarity value
        candidates = Counter({id: sim(svector, self.dense[id]).item(0)
                              for id in self.inventory.index[lemmas[index]]
                              if self.dense[id] is not None})

        if not candidates:
            return

        for id, _ in candidates.most_common(1):
            return id 
Example 13
Project: driverlessai-recipes   Author: h2oai   File: text_embedding_similarity_transformers.py    Apache License 2.0    6 votes
def transform(self, X: dt.Frame):
        X.replace([None, math.inf, -math.inf], self._repl_val)
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = text2_arr[ind]
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                          text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except:
                output.append(-99)
        return np.array(output) 
Example 14
Project: FaceRecognitionProjects   Author: ForrestPi   File: system_main.py    MIT License    6 votes
def compar_pic(path1,path2):
    global net
    # Load the verification (probe) image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # X is used as the model input
    out = net.forward_all(data = X)
    # fc7 is the model output, i.e. the feature values
    feature1 = np.float64(out['fc7'])
    feature1=np.reshape(feature1,(test_num,4096))
    #np.savetxt('feature1.txt', feature1, delimiter=',')

    # Load the enrolled (registered) image
    X=read_image(path2)
    # X is used as the model input
    out = net.forward_all(data=X)
    # fc7 is the model output, i.e. the feature values
    feature2 = np.float64(out['fc7'])
    feature2=np.reshape(feature2,(test_num,4096))
    #np.savetxt('feature2.txt', feature2, delimiter=',')
    # Compute the cosine between the two feature vectors and use it as the similarity criterion
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts 
Example 15
Project: FaceRecognitionProjects   Author: ForrestPi   File: test_main.py    MIT License    6 votes
def compar_pic(path1,path2):
    global net
    # Load the verification (probe) image
    X=read_image(path1)
    test_num=np.shape(X)[0]
    # X is used as the model input
    out = net.forward_all(data = X)
    # fc7 is the model output, i.e. the feature values
    feature1 = np.float64(out['fc7'])
    feature1=np.reshape(feature1,(test_num,4096))
    # Load the enrolled (registered) image
    X=read_image(path2)
    # X is used as the model input
    out = net.forward_all(data=X)
    # fc7 is the model output, i.e. the feature values
    feature2 = np.float64(out['fc7'])
    feature2=np.reshape(feature2,(test_num,4096))
    # Compute the cosine between the two feature vectors and use it as the similarity criterion
    predicts=pw.cosine_similarity(feature1, feature2)
    return  predicts 
Example 16
Project: kaggle   Author: galaxy24   File: bot.py    GNU General Public License v3.0    6 votes
def response(user_response):
    robo_response=''
    sent_tokens.append(user_response)
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx=vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if(req_tfidf==0):
        robo_response=robo_response+"I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response+sent_tokens[idx]
        return robo_response
      
# Generating response 
Example 17
Project: kaggle   Author: galaxy24   File: bot.py    GNU General Public License v3.0    6 votes
def responseone(user_response):
    robo_response=''
    sent_tokensone.append(user_response)
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokensone)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx=vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if(req_tfidf==0):
        robo_response=robo_response+"I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response+sent_tokensone[idx]
        return robo_response 
Example 18
Project: Automated-Social-Annotation   Author: acadTags   File: LDA.py    MIT License    5 votes
def display_for_qualitative_evaluation(modelToEval, k_num_doc, mat_train, trainY, corpus_eval, evalX, evalY, vocabulary_index2word, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc):
    prediction_str=""
    #generate the doc indexes same as for the deep learning models.
    number_examples=len(evalY)
    rn_dict={}
    rn.seed(1) # set the seed to produce same documents for prediction
    batch_size=128
    for i in range(0,500):
        batch_chosen=rn.randint(0,number_examples//batch_size)
        x_chosen=rn.randint(0,batch_size)
        #rn_dict[(batch_chosen*batch_size,x_chosen)]=1
        rn_dict[batch_chosen*batch_size+x_chosen]=1
        
    # get eval-train document similarity matrix
    #mat_train = np.array(modelToEval[corpus]) #https://stackoverflow.com/questions/21322564/numpy-list-of-1d-arrays-to-2d-array
    #mat_train = mat_train[:,:,1] #https://stackoverflow.com/questions/37152031/numpy-remove-a-dimension-from-np-array
    #mat_eval = np.array(modelToEval[corpus_eval])
    mat_eval = np.array(modelToEval.get_document_topics(corpus_eval,minimum_probability=0.0))
    mat_eval = mat_eval[:,:,1]
    mat_sim_v_tr = cosine_similarity(mat_eval,mat_train) # a matrix (n_valid,n_train)
    
    y_true = np.asarray(evalY)    
    for i in range(len(mat_sim_v_tr)):
        doc_ind_list = get_doc_ind_from_vec(mat_sim_v_tr[i],k_num_doc=k_num_doc)
        #print(doc_ind_list)
        label_predicted = get_labels_from_docs(doc_ind_list,trainY)
        if rn_dict.get(i) == 1:
            doc = 'doc: ' + ' '.join(display_results(evalX[i],vocabulary_index2word))
            pred = 'prediction-lda: ' + ' '.join(display_results(label_predicted,vocabulary_index2word_label))
            get_indexes = lambda x, xs: [i for (y, i) in zip(xs, range(len(xs))) if x == y]
            label = 'labels: ' + ' '.join(display_results(get_indexes(1,evalY[i]),vocabulary_index2word_label))
            prediction_str = prediction_str + '\n' + doc + '\n' + pred + '\n' + label + '\n'
    
    return prediction_str 
Example 19
Project: dac   Author: KBNLresearch   File: dac.py    GNU General Public License v3.0    5 votes
def set_entity_vector_match(self):
        '''
        Match word vectors for other entities in the article with
        entity vector.
        '''
        cvf = [f for f in self.features if f.startswith('candidate_vec')]
        mvf = [f for f in self.features if
               f.startswith('match_txt_entity_vec')]
        if not (cvf or mvf):
            return

        if 'vector' in self.document:
            cand_vectors = [json.loads(self.document.get('vector'))]
        else:
            return

        if cvf:
            for i, v in enumerate(cand_vectors[0]):
                setattr(self, 'candidate_vec_' + str(i), v)

        if not mvf:
            return

        if not hasattr(self.cluster, 'context_entity_parts'):
            self.cluster.get_context_entity_parts()
        if not self.cluster.context_entity_parts:
            return

        if not hasattr(self.cluster, 'context_entity_vectors'):
            self.cluster.context_entity_vectors = self.get_vectors(
                self.cluster.context_entity_parts)
        if not self.cluster.context_entity_vectors:
            return

        sims = cosine_similarity(np.array(self.cluster.context_entity_vectors),
                                 np.array(cand_vectors))
        self.match_txt_entity_vec_max = sims.max() - 0.375
        self.match_txt_entity_vec_mean = sims.mean() - 0.125 
Example 20
Project: domain_discovery_API   Author: VIDA-NYU   File: radviz.py    GNU General Public License v3.0    5 votes
def compute_tsp(self):
        max_value=30000;
        if (self.data != None):
            if (len(self.features) > 0):
                if(len(self.features)<=2):
                    order = range(0,len(self.features))
                else:
                    matData = 1-cosine_similarity(np.transpose(self.data))
                    cities = solve_tsp(matData)
                    return {"cities": cities, "groupId": 0, "offset": 0} 
Example 21
Project: VMED   Author: thaihungle   File: data_util.py    MIT License    5 votes
def bow_score(input_batch, target_batch, predict_batch, token2str, mat=None):
    s1=[]
    for b in range(target_batch.shape[0]):
        trim_target = []
        trim_predict = []
        str_target = []
        str_predict = []
        str_input=[]
        if mat is None:
            oh1=np.zeros(len(token2str))
            oh2 = np.zeros(len(token2str))
        else:
            oh1 = np.zeros(mat.shape[1])
            oh2 = np.zeros(mat.shape[1])
        for t in target_batch[b]:
            if t >2 and token2str[t]!='.':
                trim_target.append(t)
                if token2str[t] not in cachedStopWords:
                    if mat is None:
                        oh1+=onehot(t, len(token2str))
                    else:
                        oh1+=mat[t]
                str_target.append(token2str[t])
        for t in predict_batch[b]:
            if t >2 and token2str[t]!='.':
                trim_predict.append(t)
                if token2str[t] not in cachedStopWords:
                    if mat is None:
                        oh2+=onehot(t, len(token2str))
                    else:
                        oh2+=mat[t]
                str_predict.append(token2str[t])

        s1.append(cosine_similarity(np.reshape(oh1,[1,-1]),np.reshape(oh2,[1,-1])))

    return np.mean(s1) 
Example 22
Project: fuzzy-fs   Author: achyudh   File: generate_cf_matrix_webKB.py    MIT License    5 votes
def generate_cf_matrix(clu):
    CF = np.zeros((x_train.shape[1],x_train.shape[1]))
    for i in tqdm(range(x_train.shape[1])):
        for j in range(x_train.shape[1]):
            summ=0
            for k in docinc[clu]:
                summ+=X[k][i]*X[k][j]
            CF[i,j]=summ

    np.save("cf_%d_webKB.npy"%clu, CF)

    # CF = np.load("cf_webKB.npy")
    NCF = np.zeros((X.shape[1],X.shape[1]))
    for i in range(X.shape[1]):
        break
        for j in range(X.shape[1]):
            # print(CF[i,j], (CF[i,j]+CF[j,j]-CF[i,j]))
            NCF[i,j]=(0.001 + CF[i,j])/(0.001 +(CF[i,j]+CF[j,j]-CF[i,j]))
            break
    SC = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        SC[i] = np.sum(NCF[i,:])

    # SC is 1-D; reshape it to (1, n_features) because cosine_similarity expects 2-D input
    cosim = cosine_similarity(SC.reshape(1, -1), CF)
    print(cosim)
    # top = 10
    # print("Top features are: ", (-cosim).argsort()[0][:top])
    np.save("sc_%d_webKB.npy"%clu, SC) 
Example 23
Project: fuzzy-fs   Author: achyudh   File: generate_cf_matrix_R8.py    MIT License    5 votes
def generate_cf_matrix(clu):
    CF = np.zeros((x_train.shape[1],x_train.shape[1]))
    for i in tqdm(range(x_train.shape[1])):
        for j in range(x_train.shape[1]):
            summ=0
            for k in docinc[clu]:
                summ+=X[k][i]*X[k][j]
            CF[i,j]=summ

    np.save("cf_%d_webKB.npy"%clu, CF)

    # CF = np.load("cf_webKB.npy")
    NCF = np.zeros((X.shape[1],X.shape[1]))
    for i in range(X.shape[1]):
        break
        for j in range(X.shape[1]):
            # print(CF[i,j], (CF[i,j]+CF[j,j]-CF[i,j]))
            NCF[i,j]=(0.001 + CF[i,j])/(0.001 +(CF[i,j]+CF[j,j]-CF[i,j]))
            break
    SC = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        SC[i] = np.sum(NCF[i,:])

    # SC is 1-D; reshape it to (1, n_features) because cosine_similarity expects 2-D input
    cosim = cosine_similarity(SC.reshape(1, -1), CF)
    print(cosim)
    # top = 10
    # print("Top features are: ", (-cosim).argsort()[0][:top])
    np.save("sc_%d_webKB.npy"%clu, SC) 
Example 24
Project: ABRW   Author: houchengbin   File: utils.py    MIT License    5 votes
def pairwise_similarity(mat, type='cosine'):
    # XXX: possible to integrate pairwise_similarity with top_k to enhance performance? 
    # we'll use it elsewhere. if really needed, write a new method for this purpose
    if type == 'cosine':  # support sprase and dense mat
        from sklearn.metrics.pairwise import cosine_similarity
        result = cosine_similarity(mat, dense_output=True)
    elif type == 'jaccard':
        from sklearn.metrics import jaccard_similarity_score
        from sklearn.metrics.pairwise import pairwise_distances
        # n_jobs=-1 means using all CPU for parallel computing
        result = pairwise_distances(mat.todense(), metric=jaccard_similarity_score, n_jobs=-1)
    elif type == 'euclidean':
        from sklearn.metrics.pairwise import euclidean_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = euclidean_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    elif type == 'manhattan':
        from sklearn.metrics.pairwise import manhattan_distances
        # note: similarity = - distance
        # other version: similarity = 1 - 2 / pi * arctan(distance)
        result = manhattan_distances(mat)
        result = -result
        # result = 1 - 2 / np.pi * np.arctan(result)
    else:
        print('Please choose from: cosine, jaccard, euclidean or manhattan')
        return 'Not found!'
    return result


# ---------------------------------utils for preprocessing--------------------------------
Example 25
Project: toolkit   Author: SciLensProject   File: papers_ops.py    GNU General Public License v3.0    5 votes
def cos_sim(vec1, vec2):
    return abs(cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0])

############################### EXTRACTION/SIMILARITY ###############################

#Extract word vectors from text 
Example 26
Project: access-face-vision   Author: accessai   File: face_recogniser.py    Apache License 2.0    5 votes
def recognise(obj, trained_embeddings, trained_labels, trained_face_ids, recognition_threshold):
    time_in = time()

    detected_embeddings = obj.get('embeddings', [])
    if len(detected_embeddings) > 0 and len(trained_embeddings)>0 and len(trained_labels)>0:
        sims = cosine_similarity(trained_embeddings, detected_embeddings)
        indexes = np.argmax(sims, axis=0)
        confidence = np.max(sims, axis=0)
        detected_labels = trained_labels[indexes]
        detected_face_ids = trained_face_ids[indexes]

        for label, faceId, conf, face in zip(detected_labels, detected_face_ids, confidence, obj.get('detections')):
            face['label'] = label
            face['faceId'] = faceId
            face['recognition_confidence'] = conf
            if face.get('confidence'):
                face['confidence'] = roundUp((face['confidence'] + conf)/2.)
                conf = face['confidence']
            if conf < recognition_threshold:
                face['label'] = 'Unknown'



    time_out = time()
    obj['recognition_time'] = time_out - time_in

    return obj 
Example 27
Project: HINPy   Author: pedroramaciotti   File: hin_class.py    GNU General Public License v3.0    5 votes
def NoveltyDivMes(self,relation_name,similarity_relation,verbose=False):
        table = self.table[self.table.relation==relation_name].copy(deep=True)
        sim_matrix = cosine_similarity(self.GetLinkGroup(similarity_relation).stochastic_matrix)
        object_position = self.GetLinkGroupStartObjectGroup(similarity_relation).OjectPositionDicFromName()
        return Novelty(table,sim_matrix,object_position,verbose=verbose); 
Example 28
Project: HINPy   Author: pedroramaciotti   File: hin_class.py    GNU General Public License v3.0    5 votes
def IntraListSimilarityDivMes(self,relation_name,similarity_relation,verbose=False):
        table = self.table[self.table.relation==relation_name].copy(deep=True)
        sim_matrix = cosine_similarity(self.GetLinkGroup(similarity_relation).stochastic_matrix)
        object_position = self.GetLinkGroupStartObjectGroup(similarity_relation).OjectPositionDicFromName()
        return IntraListSimilarity(table,sim_matrix,object_position,verbose=verbose); 
Example 29
Project: twitter_topic_detection   Author: santels   File: calculate_similarity.py    MIT License    5 votes
def cos_similarity(self, M1=None, M2=None):
        '''
        Cosine similarity measure of documents. For testing purposes.
        '''
        if M1 is None:
            M1 = self.matrix
            if M2 is None:
                M2 = self.matrix
        return cosine_similarity(M1, M2) 
Example 30
Project: Rep-True   Author: ProjectLegenda   File: smartcore.py    GNU General Public License v3.0    5 votes
def calcSimilarity(tfidf,tfidf_matix,title_list,top_n = 5):
    cosine_similarities = cosine_similarity(tfidf,tfidf_matix)
    similar_indices = cosine_similarities.argsort().flatten()[-top_n:]
    similar_items = sorted([(title_list[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
    return similar_items 
Example 31
Project: Rep-True   Author: ProjectLegenda   File: smartcore.py    GNU General Public License v3.0    5 votes
def calcSimilarity_m2(tfidf,tfidf_matix,top_n = 5):
    cosine_similarities = cosine_similarity(tfidf,tfidf_matix)
    similar_indices = cosine_similarities.argsort().flatten()[-top_n:]
    print(similar_indices)
    print(cosine_similarities)
    index_similarity_list = []
    for x in similar_indices:
        index_similarity_list.append({'index': x,'cosine_similarity':cosine_similarities[0,x] })            
    df = pd.DataFrame(index_similarity_list)
    print(df)
    return(df) 
Example 32
Project: scattertext   Author: JasonKessler   File: CategoryProjectorEvaluator.py    Apache License 2.0    5 votes
def evaluate(self, category_projection):
        assert issubclass(type(category_projection), CategoryProjectionBase)
        topics = category_projection.get_nearest_terms()
        total_similarity = 0
        for topic in topics.values():
            topic_vectors = np.array([self.get_vector(term) for term in topic])
            #simport pdb; pdb.set_trace()
            sim_matrix = cosine_similarity(topic_vectors)
            tril_sim_matrix = np.tril(sim_matrix)
            mean_similarity = tril_sim_matrix.sum()/(tril_sim_matrix.shape[0] ** 2 - tril_sim_matrix.shape[0]) / 2
            total_similarity += mean_similarity
        return total_similarity/len(topics) 
Example 33
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0    5 votes
def vector_cos3(u, v):
    return _vector_cos3(u, v)[0][0]
    

#### 
Example 34
Project: QAUniBonn   Author: jtrillos   File: app.py    Apache License 2.0    5 votes
def ranking(question):
  with open('data/templates.json', 'r') as templates_json:
    try:
      templatesDict = json.load(templates_json)
      templateVectorMatrix = np.array([])
      n=0
      for template in templatesDict :
        n=n+1
        vector = np.array(template['vec_representation'])
        if templateVectorMatrix.size > 0:
          if templateVectorMatrix.size==1:
            templateVectorMatrix = np.stack((templateVectorMatrix,vector))
          else:
            templateVectorMatrix = np.vstack((templateVectorMatrix,vector))
        else:
          templateVectorMatrix = np.hstack((templateVectorMatrix,vector))

      sims=cosine_similarity(getQuestionVector(question).reshape(1,-1),templateVectorMatrix)
      sims_index = np.argsort(sims)[0][::-1][:n]

      for i in range(n):
        templatesDict[i]['ranking'] = sims[0][i]

      # sort templates by ranking
      templatesDict = sorted(templatesDict, key=sort_by_ranking, reverse=True)
      #print("sims_index computed")
      #print(str(sims_index))

      return json.dumps(templatesDict)
    except ValueError:
      print("error")
      return {'err' : 'No templates found'} 
Example 35
Project: fever-naacl-2018   Author: sheffieldnlp   File: fever_features.py    Apache License 2.0    5 votes
def process(self,data):
        claim_bow = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_bow = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_bow)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        cosines = np.array([cosine_similarity(c, b)[0] for c,b in zip(claim_tfidf,body_tfidf)])

        return hstack([body_tfs,claim_tfs,cosines]) 
Example 36
Project: fever-naacl-2018   Author: sheffieldnlp   File: process_tfidf_grid.py    Apache License 2.0    5 votes
def process(self, data):
        claim_bow = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_bow = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_bow)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])

        return cosines 
Example 37
Project: fever-naacl-2018   Author: sheffieldnlp   File: process_tfidf.py    Apache License 2.0    5 votes
def process(self, data):
        claim_bow = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_bow = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_bow)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        cosines = np.array([cosine_similarity(c, b)[0] for c, b in zip(claim_tfidf, body_tfidf)])

        return cosines 
Example 38
Project: ICASSP-2020-clustering   Author: xavierfav   File: clustering.py    GNU General Public License v3.0    5 votes
def compute_similarity_matrix(X):
    """
    Compute similarity matrix of the given features.

    """
    euclidian_distances = euclidean_distances(X)
    similarity_matrix = 1 - euclidian_distances/euclidian_distances.max()
    similarity_matrix = np.exp(-1 * euclidian_distances / euclidian_distances.std())
    # similarity_matrix = cosine_similarity(X)
    return similarity_matrix 
Example 39
Project: define-semantic-annotation   Author: iarroyof   File: Categorizador.py    GNU General Public License v2.0    5 votes
def generarRelacion(vectorDefiniciones, vectorWiki, minimaSimilitud):
    listaRelacion = []
    i = 0
    # Iterate over the definitions vector.
    for valor in vectorDefiniciones:
        listaRelacion.append(0)
        j = 0
        valorMasAlto = 0
        # Iterate over the Wikipedia vector.
        for valorWiki in vectorWiki:
            # Compute the cosine similarity between each Wikipedia vector and the definition vectors.
            result = cosine_similarity(valor, valorWiki)
            # Keep the highest result stored in the list.
            if(result > valorMasAlto):
                # Assign the ID of the Wikipedia sense this definition relates to; + 1 because IDs start at 1.
                listaRelacion[i] = j + 1
                valorMasAlto = result
            j = j + 1
        # If the similarity is below minimaSimilitud, mark it as non-informative.
        if(valorMasAlto < minimaSimilitud):
            listaRelacion[i] = 0
        i = i + 1
    return listaRelacion
    """Nombre de la función: generarSimilitud
	Descripción:Realiza la obtencion de el porcentaje de similitud de los vectores de definiciones y acepciones de wikipedia con la funcion
				cosine_similarity, se obtiene la lista de la similitud
	Versión:1.0
	Autor:Ramón Pantoja Velasco
	Parámetros: vectorDefiniciones.- Vector obtenido de la listaDefiniciones al transformarla ya sea con o sin TFIDF, vectorWiki.- Vector obtenido de la lista
				de acepciones de wikipedia debes ser obtenida de la misma forma que listaDefiniciones
	Retorno:listaSimilitud.- lista de procentaje de similitud coseno
	
    """ 
Example 40
Project: define-semantic-annotation   Author: iarroyof   File: Categorizador.py    GNU General Public License v2.0    5 votes
def generarSimilitud(vectorDefiniciones, vectorWiki):
    listaSimilitud = []
    i = 0
    # Iterate over all values in vectorDefiniciones.
    for valor in vectorDefiniciones:
        j = 0
        valorMasAlto = 0
        result = 0
        # Iterate over all values in vectorWiki.
        for valorWiki in vectorWiki:
            # Compute the cosine similarity and store it in result
            result = cosine_similarity(valor, valorWiki)
            # Keep the highest value
            if(result > valorMasAlto):
                valorMasAlto = result
            j = j + 1
        # Store the value in the list.
        listaSimilitud.append(valorMasAlto)
    return listaSimilitud
    """Nombre de la función: crearListaAcepciones
	Descripción:Recibe la ruta del archivo de Acepciones en .csv y genera la lista que se utilizará de Acepciones, guardando unicamente el nombre de
				la acepción
	Versión:1.0
	Autor:Ramón Pantoja Velasco
	Parámetros:archivoCsv.- Direccion del archivo de acepciones
	Retorno:listaAcepciones.- Lista que guarda el nombre de las acepciones obtenidas del archivo .csv
	
    """ 
Example 41
Project: head-qa   Author: aghie   File: models.py    MIT License    5 votes
def _predict(self,qas):
        """
        
        Returns a list of tuples (question_id, right_answer_id)
        
        Args
        
        qas (list). A list of tuples of strings of the form (QID, Q, A1,...,AN)
        
        """
        
        preds = {}
        for qid, question, answers in qas:
        
            question_word_embs = [self.word_embeddings[self.word2index[word]] 
                               if word in self.word2index else np.zeros(self.embedding_size) 
                               for word in question]
            
            embedding_question = normalize(np.sum(question_word_embs, axis=0).reshape(1, -1))
  
            best_score = -1
            for aid, answer in enumerate(answers,1):
                answer_word_embs = [self.word_embeddings[self.word2index[word]] 
                               if word in self.word2index else np.zeros(self.embedding_size) 
                               for word in answer]
                answer_vector = normalize(np.sum(answer_word_embs, axis=0).reshape(1, -1))
                score = cosine_similarity(embedding_question, answer_vector)[0][0]
                if score > best_score:
                    best_answer,best_score = aid, score             
                
            preds[qid] = best_answer
        return preds 
Example 42
Project: ijcai2019-relis   Author: UKPLab   File: infersent_metric.py    MIT License    5 votes
def __call__(self,docs,summs, top_n=3, average_all_docs=False):
        self.sentences = []
        top_sents_idx = []
        for doc in docs:
            if top_n == 0:
                top_sents_idx.append([ii for ii in range(len(self.sentences),len(self.sentences)+len(doc[1]))])
            else:
                top_sents_idx.append([ii for ii in range(len(self.sentences),len(self.sentences)+top_n)])
            self.sentences.extend(doc[1])
        self.sent_vecs = self.infersent.encode(self.sentences,tokenize=True)

        sum_vecs = []
        for sum in summs:
            sv = np.array([ss for ii,ss in enumerate(self.sent_vecs) if ii in sum])
            sv = np.mean(sv,axis=0)
            sum_vecs.append(sv)

        doc_vecs = []
        for top in top_sents_idx:
            dv = np.array([v for ii,v in enumerate(self.sent_vecs) if ii in top])
            dv = np.mean(dv,axis=0)
            doc_vecs.append(dv)
            if average_all_docs:
                doc_vecs = [np.mean(np.array(doc_vecs), axis=0)]

        sims = []
        for sv in sum_vecs:
            if average_all_docs:
                sims.append([cosine_similarity(sv.reshape(1,-1),doc_vecs[0].reshape(1,-1))[0][0]])
            else:
                ss = []
                for dv in doc_vecs:
                    ss.append(cosine_similarity(sv.reshape(1,-1),dv.reshape(1,-1))[0][0])
                sims.append([np.min(ss), np.mean(ss), np.max(ss), np.std(ss)])

        return np.array(sims) 
Example 43
Project: dialogue-models   Author: siat-nlp   File: metrics.py    Apache License 2.0    5 votes
def greedy(self, hyp_embeds, ref_embeds):
        """
        greedy
        """
        greedy_sim = []
        for hyp_embed, ref_embed in zip(hyp_embeds, ref_embeds):
            cos_sim = cosine_similarity(hyp_embed, ref_embed)
            g_sim = (cos_sim.max(axis=1).mean() +
                     cos_sim.max(axis=0).mean()) / 2
            greedy_sim.append(g_sim)
        greedy_sim = np.array(greedy_sim)
        return greedy_sim 
Example 44
Project: dialogue-models   Author: siat-nlp   File: metrics.py    Apache License 2.0    5 votes
def greedy(self, hyp_embeds, ref_embeds):
        """
        greedy
        """
        greedy_sim = []
        for hyp_embed, ref_embed in zip(hyp_embeds, ref_embeds):
            cos_sim = cosine_similarity(hyp_embed, ref_embed)
            g_sim = (cos_sim.max(axis=1).mean() +
                     cos_sim.max(axis=0).mean()) / 2
            greedy_sim.append(g_sim)
        greedy_sim = np.array(greedy_sim)
        return greedy_sim 
Example 45
Project: dialogue-models   Author: siat-nlp   File: metrics.py    Apache License 2.0    5 votes
def greedy(self, hyp_embeds, ref_embeds):
        """
        greedy
        """
        greedy_sim = []
        for hyp_embed, ref_embed in zip(hyp_embeds, ref_embeds):
            cos_sim = cosine_similarity(hyp_embed, ref_embed)
            g_sim = (cos_sim.max(axis=1).mean() +
                     cos_sim.max(axis=0).mean()) / 2
            greedy_sim.append(g_sim)
        greedy_sim = np.array(greedy_sim)
        return greedy_sim 
Example 46
Project: chicago-crime   Author: thekingofkings   File: multi_view_prediction.py    MIT License    5 votes
def similarityMatrix(F):
    assert F.shape[0] == N
    M = np.zeros((N,N))
    for i in range(N):
        for j in range(i, N):
            if i == j:
                M[i,j] = 1
            else:
                # sim = cosine_similarity(F[i].reshape(1,-1), F[j].reshape(1,-1))
                sim = np.dot(F[i], F[j].T)
                M[i,j] = sim
                M[j,i] = sim
    return M 
Example 47
Project: numerate-language-models   Author: uclnlp   File: ploty.py    GNU General Public License v3.0    5 votes
def plot_sims(W,  # (n_samples, n_features)
              points,
              labels,
              title=None
              ):
    im = plt.imshow(cos_sim(W), vmin=-1.0, vmax=1.0, cmap='seismic')#'hot')
    im.axes.xaxis.tick_top()
    plt.colorbar()
    plt.xticks(points, labels, rotation='vertical', verticalalignment='bottom')
    plt.yticks(points, labels)
    if title:
        plt.xlabel(title)
    plt.show() 
Example 48
Project: watasense   Author: nlpub   File: wsd.py    MIT License    5 votes
def disambiguate_word(self, sentence, index):
        super().disambiguate_word(sentence, index)

        lemmas = self.lemmatize(sentence)

        if index not in lemmas:
            return

        svector = self.sparse.transform(Counter(lemmas.values()))  # sentence vector

        def search(query):
            """
            Map synset identifiers to the cosine similarity value.
            This function calls the function query(id) that retrieves
            the corresponding dict of words.
            """
            return Counter({id: sim(svector, self.sparse.transform(query(id))).item(0)
                            for id in self.inventory.index[lemmas[index]]})

        candidates = search(lambda id: self.inventory.synsets[id].synonyms)

        # give the hypernyms a chance if nothing is found
        if not candidates:
            candidates = search(lambda id: self.inventory.synsets[id].bag)

        if not candidates:
            return

        for id, _ in candidates.most_common(1):
            return id 
Example 49
Project: tokenquery   Author: ramtinms   File: vector_opr.py    GNU General Public License v3.0    5 votes
def vec_cos_sim(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print ('len of vectors does not match')
                return False
            if operation_string == "=" or operation_string == "==":
                return cosine_similarity(token_vector, ref_vector) == cond_value
            elif operation_string == "<":
                return cosine_similarity(token_vector, ref_vector) < cond_value
            elif operation_string == ">":
                return cosine_similarity(token_vector, ref_vector) > cond_value
            elif operation_string == ">=":
                return cosine_similarity(token_vector, ref_vector) >= cond_value
            elif operation_string == "<=":
                return cosine_similarity(token_vector, ref_vector) <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return cosine_similarity(token_vector, ref_vector) != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False

    else:
        # TODO raise tokenregex error
        print ('Problem with the operation input') 
Example 50
Project: ParticleFlowBayesRule   Author: xinshi-chen   File: utils.py    MIT License    5 votes
def kernel_function(kernel_type):
        kernel_dict = {'gaussian': sk_metric.rbf_kernel,
                        'laplacian': sk_metric.laplacian_kernel,
                        'sigmoid': sk_metric.sigmoid_kernel,
                        'polynomial': sk_metric.polynomial_kernel,
                        'cosine': sk_metric.cosine_similarity,
                        'chi2': sk_metric.chi2_kernel
                        }
        return kernel_dict[kernel_type] 
Example 51
Project: ParticleFlowBayesRule   Author: xinshi-chen   File: metric.py    MIT License    5 votes
def square_mmd_fine(p_samples, q_samples, n_p, n_q, kernel_type):
    """
    n_p: number of samples from true distribution p

    assume n_p >> n_q
    """
    kernel_dict = {
        'gaussian': sk_metric.rbf_kernel,
        'laplacian': sk_metric.laplacian_kernel,
        'sigmoid': sk_metric.sigmoid_kernel,
        'polynomial': sk_metric.polynomial_kernel,
        'cosine': sk_metric.cosine_similarity,
    }

    kernel = kernel_dict[kernel_type]

    p_samples = np.array(p_samples)
    q_samples = np.array(q_samples)

    k_xi_xj = kernel(p_samples, p_samples)
    k_yi_yj = kernel(q_samples, q_samples)
    k_xi_yj = kernel(p_samples, q_samples)

    off_diag_k_xi_xj = (np.sum(k_xi_xj) - np.sum(np.diag(k_xi_xj))) / n_p / (n_p - 1)
    off_diag_k_yi_yj = (np.sum(k_yi_yj) - np.sum(np.diag(k_yi_yj))) / n_q / (n_q - 1)
    sum_k_xi_yj = np.sum(k_xi_yj) * 2 / n_p / n_q

    return off_diag_k_xi_xj + off_diag_k_yi_yj - sum_k_xi_yj 
Example 52
Project: SOR   Author: Projectdotpy   File: query.py    GNU General Public License v3.0    5 votes
def images_similar_to(q_img_path, features_per_class, metadata_per_class, C):
    result = imgs_to_roi_features([q_img_path], C, bbox_threshold=0.7)
    if not q_img_path in result:
        return [], result
    instance = result[q_img_path]
    best_is = best_bbox(instance, n=None)

    seen_classes = set()

    similar_images = []
    for best_i in best_is:
        best_feature = instance[2][best_i]
        claz = instance[1][best_i][1]
        if claz in seen_classes:
            continue
        seen_classes.add(claz)
        pool = features_per_class[claz]

        sims = cosine_similarity(pool, np.array([best_feature])).reshape(-1)
        top = np.argsort(sims)[::-1]
        similar_images_in_claz = [
            (metadata_per_class[claz][im][0], sims[im], claz) for im in top
        ]

        similar_images += similar_images_in_claz

    similar_images = map(
        lambda t: {"path": t[0], "class": t[2]},
        sorted(similar_images, key=lambda t: t[1], reverse=True),
    )

    return list(similar_images), result 
Example 53
Project: asml   Author: nachocano   File: hashing_criteo.py    BSD 2-Clause "Simplified" License    5 votes
def update_similarity(dct_row, field1, field2):

    for i in range(len(hashable_fields)):
        for j in range(i+1, len(hashable_fields)):
            field1=dct_row[hashable_fields[i]]
            field2=dct_row[hashable_fields[j]]
            hash1=hashing(field1)
            hash2=hashing(field2)
            similarity=cosine_similarity(hash1, hash2)[0][0]
    #print 'similarity_'+str(hashable_fields[i])+'_'+str(hashable_fields[j])
            dct_row['Similarity_'+str(hashable_fields[i])+'_'+str(hashable_fields[j])]=similarity
    return dct_row
Example 54
Project: asml   Author: nachocano   File: hashing.py    BSD 2-Clause "Simplified" License    5 votes
def update_similarity(hashable_fields,dct_row):

    for i in range(len(hashable_fields)):
        for j in range(i+1, len(hashable_fields)):
            field1=dct_row[hashable_fields[i]]
            field2=dct_row[hashable_fields[j]]
            hash1=hashing(field1)
            hash2=hashing(field2)
            similarity=cosine_similarity(hash1, hash2)[0][0]
    #print 'similarity_'+str(hashable_fields[i])+'_'+str(hashable_fields[j])
            dct_row['Similarity_'+str(hashable_fields[i])+'_'+str(hashable_fields[j])]=similarity                               
    return dct_row 
Example 55
Project: WhiskyEmbeddings   Author: KenHBS   File: embeddings.py    GNU General Public License v3.0    5 votes
def most_similar_whiskies(self, whisky, focus='ful', n=15, min_count=3):
        whisky = whisky.lower()

        # Only focus on rows with appropriate focus (nos, pal, fin, ful)
        df = self.embeddings.xs(focus, level='att')

        try:
            target = df.ix[whisky]
        except KeyError:
            whisky = self.correct_typo(whisky)
            if whisky is None:
                return None
            target = df.ix[whisky]
        target = target.values.reshape(1, -1)

        # Only return whiskies that have an appropriate nr of reviews:
        keep = [self.count[x] >= min_count for x in df.index]
        result_set = df.loc[keep, :]

        # Calculate the distance in embedding of whisky w.r.t. all whiskies
        # and return the n closest ones:
        distances = cosine_similarity(target, result_set)
        inds = np.argsort(distances)[0]
        inds = inds[::-1][1:n]
        for x in inds:
            print([result_set.index[x], distances[0, x]])
        pass 
Example 56
Project: WhiskyEmbeddings   Author: KenHBS   File: embeddings.py    GNU General Public License v3.0    5 votes
def compare_to_vocab(self, embedding, n=10):
        embedding = embedding.values.reshape(1, -1)
        distances = cosine_similarity(embedding, self.wv_vecs)[0]

        inds = np.argsort(distances)
        inds = inds[::-1][1:n]

        vocab = self.wordlist
        return [vocab[x] for x in inds] 
Example 57
Project: search_relevance   Author: rmanak   File: nlp_utils.py    MIT License    5 votes
def cosine_sim(x, y):
    try:
        d = cosine_similarity(x.reshape(1,-1), y.reshape(1,-1))
        d = d[0][0]
    except:
        d = 0.0
    return d 
Example 58
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: Similarity.py    MIT License    5 votes
def cosineSimilarity(self):
        vec = TfidfVectorizer()
        matrix = vec.fit_transform(self.statements)
        for j in range(1, 5):
            i = j - 1
            print("\tsimilarity of document {} with others".format(i))
            similarity = cosine_similarity(matrix[i:j], matrix)
            print(similarity) 
Example 59
Project: SDLib   Author: Coder-Yu   File: qmath.py    GNU General Public License v3.0    5 votes
def cosine(x1,x2):
    #find common ratings
    new_x1, new_x2 = common(x1,x2)
    #compute the cosine similarity between two vectors
    sum = new_x1.dot(new_x2)
    denom = sqrt(new_x1.dot(new_x1)*new_x2.dot(new_x2))
    try:
        return float(sum)/denom
    except ZeroDivisionError:
        return 0

    #return cosine_similarity(x1,x2)[0][0] 
Example 60
Project: image-classifier   Author: gustavkkk   File: eval.py    MIT License    5 votes
def compare_pic(self,feature1,feature2):
	predicts=pw.pairwise_distances(feature2, feature1,'cosine')
	#predicts=pw.cosine_similarity(feature1, feature2)
	return  predicts 
Example 61
Project: image-classifier   Author: gustavkkk   File: eval.py    MIT License    5 votes
def compare_pic(self,feature1,feature2):
	predicts=pw.pairwise_distances(feature2, feature1,'cosine')
	#predicts=pw.cosine_similarity(feature1, feature2)
	return  predicts 
Example 62
Project: image-classifier   Author: gustavkkk   File: eval.py    MIT License    5 votes
def compare_pic(self,feature1,feature2):
	predicts=pw.pairwise_distances(feature2, feature1,'cosine')
	#predicts=pw.cosine_similarity(feature1, feature2)
	return  predicts 
Example 63
Project: image-classifier   Author: gustavkkk   File: eval.py    MIT License    5 votes
def compare_pic(self,feature1,feature2):
	predicts=pw.pairwise_distances(feature2, feature1,'cosine')
	#predicts=pw.cosine_similarity(feature1, feature2)
	return  predicts 
Example 64
Project: image-classifier   Author: gustavkkk   File: eval.py    MIT License    5 votes
def compare_pic(self,feature1,feature2):
	predicts=pw.pairwise_distances(feature2, feature1,'cosine')
	#predicts=pw.cosine_similarity(feature1, feature2)
	return  predicts 
Example 65
Project: Automated-Social-Annotation   Author: acadTags   File: LDA.py    MIT License    4 votes
def do_eval_lda(modelToEval, k_num_doc, mat_train, trainY, corpus_eval, evalY, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc):
    # get eval-train document similarity matrix
    #mat_train = np.array(modelToEval[corpus]) #https://stackoverflow.com/questions/21322564/numpy-list-of-1d-arrays-to-2d-array
    #get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
    #mat_train = np.array(modelToEval.get_document_topics(corpus,minimum_probability=0.0))
    #print(len(modelToEval[corpus[0]]))
    #print(len(modelToEval[corpus[1]]))
    #print(len(modelToEval[corpus[2]]))
    #print(mat_train.shape)
    #mat_train = mat_train[:,:,1] #https://stackoverflow.com/questions/37152031/numpy-remove-a-dimension-from-np-array
    mat_eval = np.array(modelToEval.get_document_topics(corpus_eval,minimum_probability=0.0))
    #print(mat_eval.shape)
    mat_eval = mat_eval[:,:,1]
    mat_sim_v_tr = cosine_similarity(mat_eval,mat_train) # a matrix (n_valid,n_train)
    
    y_true = np.asarray(evalY)
    acc, prec, rec, hamming_loss = 0.0, 0.0, 0.0, 0.0
    for i in range(len(mat_sim_v_tr)):
        doc_ind_list = get_doc_ind_from_vec(mat_sim_v_tr[i],k_num_doc=k_num_doc)
        #print(doc_ind_list)
        label_predicted = get_labels_from_docs(doc_ind_list,trainY)
        #print(label_ind_list)
        #label_predicted = display_results(label_ind_list,vocabulary_index2word_label)
        #print(label_predicted)
        
        curr_acc = calculate_accuracy(label_predicted,y_true[i])
        acc = acc + curr_acc
        curr_prec, curr_rec = calculate_precision_recall(label_predicted,y_true[i])
        prec = prec + curr_prec
        rec = rec + curr_rec
        curr_hl = calculate_hamming_loss(label_predicted,y_true[i])
        hamming_loss = hamming_loss + curr_hl
    acc = acc/float(len(mat_sim_v_tr))
    prec = prec/float(len(mat_sim_v_tr))
    rec = rec/float(len(mat_sim_v_tr))
    hamming_loss = hamming_loss/float(len(mat_sim_v_tr))/FLAGS.ave_labels_per_doc
    if prec+rec != 0:
        f_measure = 2*prec*rec/(prec+rec)
    else:
        f_measure = 0
    return acc,prec,rec,f_measure,hamming_loss

# this also needs evalX 
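
The retrieval step above ranks training documents by cosine similarity to each evaluation document's topic vector and keeps the k most similar. A reduced sketch with made-up matrices (get_doc_ind_from_vec and the label aggregation are project-specific helpers and are omitted):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

mat_eval = np.random.rand(5, 50)     # assumed (n_eval, n_topics) document-topic matrix
mat_train = np.random.rand(200, 50)  # assumed (n_train, n_topics) document-topic matrix
mat_sim_v_tr = cosine_similarity(mat_eval, mat_train)      # shape (n_eval, n_train)
k_num_doc = 10
top_k = np.argsort(-mat_sim_v_tr, axis=1)[:, :k_num_doc]   # k most similar training docs per eval doc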
Example 66
Project: dac   Author: KBNLresearch   File: dac.py    GNU General Public License v3.0 4 votes vote down vote up
def set_topic_match(self):
        '''
        Match the topics identified for the article with the DBpedia
        abstract.
        '''
        etf = [f for f in self.features if f.startswith('entity_topic')]
        ctf = [f for f in self.features if f.startswith('candidate_topic')]
        mtf = [f for f in self.features if f.startswith('match_txt_topic')]
        if not (etf or ctf or mtf):
            return

        if not hasattr(self.cluster.context, 'topics'):
            self.cluster.context.get_topics()
        topics = self.cluster.context.topics

        if topics and etf:
            for t in topics:
                setattr(self, 'entity_topic_' + t, topics[t])

        if ctf or mtf:

            description_topics = {t: 0.0 for t in dictionary.topics}

            # Deduce topic(s) from role(s)
            dbo_types = []
            if self.document.get('dbo_type'):
                dbo_types += self.document.get('dbo_type')

            if dbo_types:
                for t in dbo_types:
                    for r in dictionary.roles_dbo:
                        if (t in dictionary.roles_dbo[r] and r.split('_')[0] in
                                description_topics):
                            description_topics[r.split('_')[0]] = 1.0

            # Predict topic(s) from abstract
            if not sum(description_topics.values()):
                if not hasattr(self, 'topic_probs'):
                    self.get_topics()
                description_topics = self.topic_probs

            if ctf:
                for t in description_topics:
                    setattr(self, 'candidate_topic_' + t,
                            description_topics[t])

            if mtf:
                topics_arr = np.array([topics[t] for t in
                                       dictionary.topics]).reshape(1, -1)
                desc_topics_arr = np.array([description_topics[t] for t in
                                            dictionary.topics]).reshape(1, -1)

                self.match_txt_topic = cosine_similarity(
                    topics_arr, desc_topics_arr)[0][0] - 0.25 
Example 67
Project: dac   Author: KBNLresearch   File: dac.py    GNU General Public License v3.0 4 votes vote down vote up
def set_vector_match(self):
        '''
        Match context word vectors with abstract word vectors.
        '''
        if not self.document.get('lang') == 'nl':
            return

        evf = [f for f in self.features if f.startswith('entity_vec')]
        mvf = [f for f in self.features if f.startswith('match_txt_vec')]
        if not (evf or mvf):
            return

        if not hasattr(self.cluster, 'window'):
            self.cluster.get_window()
        if not self.cluster.window:
            return

        if not hasattr(self.cluster, 'window_vectors'):
            self.cluster.window_vectors = self.get_vectors(self.cluster.window)
        if not self.cluster.window_vectors:
            return

        if evf:
            # Take mean of window vectors for now, need to find better
            # representation
            window_vectors_array = np.array(self.cluster.window_vectors)
            entity_vector = np.mean(window_vectors_array, axis=0).tolist()
            for i, v in enumerate(entity_vector):
                setattr(self, 'entity_vec_' + str(i), v)

        if not mvf:
            return

        if 'abstract_vector' in self.document:
            cand_vectors = [json.loads(v) for v in
                            self.document.get('abstract_vector')]
        else:
            return

        sims = cosine_similarity(np.array(self.cluster.window_vectors),
                                 np.array(cand_vectors))

        self.match_txt_vec_max = sims.max() - 0.375
        self.match_txt_vec_mean = sims.mean() - 0.0625 
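
A reduced sketch, with random vectors, of the final step above: the similarity matrix between every context-window vector and every abstract vector, summarised by its maximum and mean (the 0.375 and 0.0625 offsets are the project's own centring constants):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

window_vectors = np.random.rand(12, 300)  # assumed context word vectors
cand_vectors = np.random.rand(40, 300)    # assumed abstract word vectors
sims = cosine_similarity(window_vectors, cand_vectors)
match_txt_vec_max = sims.max() - 0.375
match_txt_vec_mean = sims.mean() - 0.0625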
Example 68
Project: CIZSL   Author: mhelhoseiny   File: train_CIZSL.py    MIT License 4 votes vote down vote up
def eval_fakefeat_GZSL(netG, dataset, param, plot_dir, result):
    gen_feat = np.zeros([0, param.X_dim])
    for i in range(dataset.train_cls_num):
        text_feat = np.tile(dataset.train_text_feature[i].astype('float32'), (opt.nSample, 1))
        text_feat = Variable(torch.from_numpy(text_feat)).cuda()
        z = Variable(torch.randn(opt.nSample, param.z_dim)).cuda()
        G_sample = netG(z, text_feat)
        gen_feat = np.vstack((gen_feat, G_sample.data.cpu().numpy()))

    for i in range(dataset.test_cls_num):
        text_feat = np.tile(dataset.test_text_feature[i].astype('float32'), (opt.nSample, 1))
        text_feat = Variable(torch.from_numpy(text_feat)).cuda()
        z = Variable(torch.randn(opt.nSample, param.z_dim)).cuda()
        G_sample = netG(z, text_feat)
        gen_feat = np.vstack((gen_feat, G_sample.data.cpu().numpy()))

    visual_pivots = [gen_feat[i * opt.nSample:(i + 1) * opt.nSample].mean(0) \
                     for i in range(dataset.train_cls_num + dataset.test_cls_num)]
    visual_pivots = np.vstack(visual_pivots)

    """collect points for gzsl curve"""

    acc_S_T_list, acc_U_T_list = list(), list()
    seen_sim = cosine_similarity(dataset.pfc_feat_data_train, visual_pivots)
    unseen_sim = cosine_similarity(dataset.pfc_feat_data_test, visual_pivots)
    for GZSL_lambda in np.arange(-2, 2, 0.01):
        tmp_seen_sim = copy.deepcopy(seen_sim)
        tmp_seen_sim[:, dataset.train_cls_num:] += GZSL_lambda
        pred_lbl = np.argmax(tmp_seen_sim, axis=1)
        acc_S_T_list.append((pred_lbl == np.asarray(dataset.labels_train)).mean())

        tmp_unseen_sim = copy.deepcopy(unseen_sim)
        tmp_unseen_sim[:, dataset.train_cls_num:] += GZSL_lambda
        pred_lbl = np.argmax(tmp_unseen_sim, axis=1)
        acc_U_T_list.append((pred_lbl == (np.asarray(dataset.labels_test) + dataset.train_cls_num)).mean())

    auc_score = integrate.trapz(y=acc_S_T_list, x=acc_U_T_list) * 100.0
    plt.plot(acc_S_T_list, acc_U_T_list)
    plt.title("{:s}-{:s}-{}: {:.4}%".format(opt.dataset, opt.splitmode, opt.model_number, auc_score))
    plt.savefig(plot_dir + '/best_plot.png')
    plt.clf()
    plt.close()
    np.savetxt(plot_dir + '/best_plot.txt', np.vstack([acc_S_T_list, acc_U_T_list]))
    result.auc_list += [auc_score]
    return auc_score 
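
The classification rule inside the sweep above is nearest-pivot assignment under cosine similarity: each sample takes the class of the pivot it is most similar to, with a bias added to the unseen-class columns. A minimal sketch of the assignment step with random data:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

feats = np.random.rand(500, 64)   # assumed visual features
pivots = np.random.rand(20, 64)   # assumed class pivots (mean generated feature per class)
sim = cosine_similarity(feats, pivots)  # (n_samples, n_classes)
pred_lbl = np.argmax(sim, axis=1)       # each sample gets its most similar pivot's class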
Example 69
Project: MAC   Author: runzhouge   File: dataset.py    MIT License 4 votes vote down vote up
def next_batch_iou(self):

        image_batch = np.zeros([self.batch_size, self.visual_feature_dim])
        softmax_batch = np.zeros([self.batch_size, self.clip_softmax_dim])
        sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim])
        offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32)
        VP_spacy_batch = np.zeros([self.batch_size, self.spacy_vec_dim*2])
        subj_spacy_batch = np.zeros([self.batch_size, self.spacy_vec_dim])
        obj_spacy_batch = np.zeros([self.batch_size, self.spacy_vec_dim])

        #lst_video_clip_order_lst = self.generate_training_sample_index_one_clip_in_one_video()
        lst_video_clip_order_lst = self.generate_training_sample_index_all_clip_in_one_then_next_one()        

        # read all clips
        for ind_this, index_here in enumerate(lst_video_clip_order_lst):
            # get this clip's sentence vector, swin, p_offset, l_offset, sentence and VPs
            dict_3rd = self.clip_sentence_pairs_iou[index_here[0]][index_here[1]][index_here[2]]
            #read visual feats
            featmap = self.read_unit_level_feats(dict_3rd['proposal_or_sliding_window'])
            left_context_feat, right_context_feat = self.get_context_window(dict_3rd['proposal_or_sliding_window'], self.context_num)
            image_batch[ind_this,:] = np.hstack((left_context_feat, featmap, right_context_feat))
            # read softmax batch
            softmax_center_clip = self.read_unit_level_softmax(dict_3rd['proposal_or_sliding_window'])
            softmax_batch[ind_this,:] = softmax_center_clip
            # sentence batch
            sentence_batch[ind_this,:] = dict_3rd['sent_skip_thought_vec'][0][0, :self.sent_vec_dim]
            if len(dict_3rd['dobj_or_VP']) != 0:
                VP_spacy_one_by_one_this_ = dict_3rd['VP_spacy_vec_one_by_one_word'][random.choice(xrange(len(dict_3rd['dobj_or_VP'])))]
                if len(VP_spacy_one_by_one_this_) == 1:
                    VP_spacy_batch[ind_this, :self.spacy_vec_dim] = VP_spacy_one_by_one_this_[0]
                else:
                    VP_spacy_batch[ind_this, :] = np.concatenate((VP_spacy_one_by_one_this_[0], VP_spacy_one_by_one_this_[1]))
            if len(dict_3rd['subj']) != 0:
                subj_spacy_batch[ind_this, :] = dict_3rd['subj_spacy_vec'][random.choice(xrange(len(dict_3rd['subj'])))]
            if len(dict_3rd['obj']) != 0:
                obj_spacy_batch[ind_this, :] = dict_3rd['obj_spacy_vec'][random.choice(xrange(len(dict_3rd['obj'])))]

            # offset
            p_offset = dict_3rd['offset_start']
            l_offset = dict_3rd['offset_end']
            offset_batch[ind_this,0] = p_offset
            offset_batch[ind_this,1] = l_offset

        simi_mat_img = cosine_similarity(image_batch, image_batch)
        np.fill_diagonal(simi_mat_img, 1.0)

        return image_batch, sentence_batch, offset_batch, softmax_batch, VP_spacy_batch, subj_spacy_batch, obj_spacy_batch, simi_mat_img 
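
The last two lines above build the batch's self-similarity matrix; cosine_similarity(X, X) already has values near 1 on the diagonal, so fill_diagonal only pins the self-similarities to exactly 1.0. A tiny sketch with random features:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

image_batch = np.random.rand(8, 4096)  # assumed batch of visual features
simi_mat_img = cosine_similarity(image_batch, image_batch)  # (batch, batch), symmetric
np.fill_diagonal(simi_mat_img, 1.0)    # remove floating-point drift on the diagonal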
Example 70
Project: scattertext   Author: JasonKessler   File: CategoryEmbeddings.py    Apache License 2.0 4 votes vote down vote up
def __init__(self, category_embedding_resolver, category1, category2, prefix1=None, prefix2=None):
        '''

        :param category_embedding_resolver: CategoryEmbeddingsResolver
        :param category1: str
        :param category2: str
        :param prefix1: str
        :param prefix2: str
        '''
        #assert issubclass(type(category_embedding_resolver), CategoryEmbeddingsResolver)
        self.category_embedding_resolver = category_embedding_resolver
        valid_categories = category_embedding_resolver.corpus_.get_categories()
        assert category1 in valid_categories
        assert category2 in valid_categories
        self.category1 = category1
        self.category2 = category2
        cat1_dwe_dict = category_embedding_resolver.category_embeddings_[category1]
        cat2_dwe_dict = category_embedding_resolver.category_embeddings_[category2]
        self.terms = np.array(list((set(cat1_dwe_dict.keys()) & set(cat2_dwe_dict.keys()))))
        self.cat1_dwe_ar = np.stack([cat1_dwe_dict[word] for word in self.terms])
        self.cat2_dwe_ar = np.stack([cat2_dwe_dict[word] for word in self.terms])

        #self.cat1_dwe_ar_norm, self.cat2_dwe_ar_norm, self.disparity = \
        #    scipy.spatial.procrustes(self.cat1_dwe_ar, self.cat2_dwe_ar)
        self.pairwise_sim, sv = scipy.linalg.orthogonal_procrustes(self.cat1_dwe_ar, self.cat2_dwe_ar)


        #self.pairwise_sim = cosine_similarity(np.vstack([self.cat1_dwe_ar_norm,
        #                                                 self.cat2_dwe_ar_norm]))
        #

        self.pairwise_sim_sort = np.argsort(-self.pairwise_sim, axis=1)

        def distinct_prefix(x, y):
            for i, (xc, yc) in enumerate(zip(x, y)):
                if xc != yc:
                    return (x[:i + 1], y[:i + 1])
            return x, y

        myprefix1, myprefix2 = distinct_prefix(category1, category2)
        self.prefix1 = myprefix1 if prefix1 is None else prefix1
        self.prefix2 = myprefix2 if prefix2 is None else prefix2

        self.labeled_terms = np.array([self.prefix1 + '_' + w for w in self.terms]
                                      + [self.prefix2 + '_' + w for w in self.terms]) 
Example 71
Project: Attention-Based-Siamese-Text-CNN-for-Stance-Detection   Author: Yikai-Wang   File: util.py    MIT License 4 votes vote down vote up
def pipeline_test(test, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer):

    """

    Process test set

    Args:
        test: FNCData object, test set
        bow_vectorizer: sklearn CountVectorizer
        tfreq_vectorizer: sklearn TfidfTransformer(use_idf=False)
        tfidf_vectorizer: sklearn TfidfVectorizer()

    Returns:
        test_set: list, of numpy arrays

    """

    # Initialise
    test_set = []
    heads_track = {}
    bodies_track = {}
    cos_track = {}

    # Process test set
    for instance in test.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in heads_track:
            head_bow = bow_vectorizer.transform([head]).toarray()
            head_tf = tfreq_vectorizer.transform(head_bow).toarray()[0].reshape(1, -1)
            head_tfidf = tfidf_vectorizer.transform([head]).toarray().reshape(1, -1)
            heads_track[head] = (head_tf, head_tfidf)
        else:
            head_tf = heads_track[head][0]
            head_tfidf = heads_track[head][1]
        if body_id not in bodies_track:
            body_bow = bow_vectorizer.transform([test.bodies[body_id]]).toarray()
            body_tf = tfreq_vectorizer.transform(body_bow).toarray()[0].reshape(1, -1)
            body_tfidf = tfidf_vectorizer.transform([test.bodies[body_id]]).toarray().reshape(1, -1)
            bodies_track[body_id] = (body_tf, body_tfidf)
        else:
            body_tf = bodies_track[body_id][0]
            body_tfidf = bodies_track[body_id][1]
        if (head, body_id) not in cos_track:
            tfidf_cos = cosine_similarity(head_tfidf, body_tfidf)[0].reshape(1, 1)
            cos_track[(head, body_id)] = tfidf_cos
        else:
            tfidf_cos = cos_track[(head, body_id)]
        feat_vec = np.squeeze(np.c_[head_tf, body_tf, tfidf_cos])
        test_set.append(feat_vec)

    return test_set 
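
The single cosine_similarity call above yields one headline/body TF-IDF feature per instance. A stripped-down sketch with hypothetical strings:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer().fit([
    "economy shows strong growth this quarter",  # hypothetical body text
    "economy grows strongly",                    # hypothetical headline
])
head_tfidf = tfidf_vectorizer.transform(["economy grows strongly"])
body_tfidf = tfidf_vectorizer.transform(["economy shows strong growth this quarter"])
tfidf_cos = cosine_similarity(head_tfidf, body_tfidf)[0].reshape(1, 1)  # scalar feature as a (1, 1) array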
Example 72
Project: DeepLearn   Author: GauravBh1010tt   File: util.py    MIT License 4 votes vote down vote up
def pipeline_test(test, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer):

    """
    Process test set
    Args:
        test: FNCData object, test set
        bow_vectorizer: sklearn CountVectorizer
        tfreq_vectorizer: sklearn TfidfTransformer(use_idf=False)
        tfidf_vectorizer: sklearn TfidfVectorizer()
    Returns:
        test_set: list, of numpy arrays
    """

    # Initialise
    test_set = []
    heads_track = {}
    bodies_track = {}
    cos_track = {}

    # Process test set
    for instance in test.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in heads_track:
            head_bow = bow_vectorizer.transform([head]).toarray()
            head_tf = tfreq_vectorizer.transform(head_bow).toarray()[0].reshape(1, -1)
            head_tfidf = tfidf_vectorizer.transform([head]).toarray().reshape(1, -1)
            heads_track[head] = (head_tf, head_tfidf)
        else:
            head_tf = heads_track[head][0]
            head_tfidf = heads_track[head][1]
        if body_id not in bodies_track:
            body_bow = bow_vectorizer.transform([test.bodies[body_id]]).toarray()
            body_tf = tfreq_vectorizer.transform(body_bow).toarray()[0].reshape(1, -1)
            body_tfidf = tfidf_vectorizer.transform([test.bodies[body_id]]).toarray().reshape(1, -1)
            bodies_track[body_id] = (body_tf, body_tfidf)
        else:
            body_tf = bodies_track[body_id][0]
            body_tfidf = bodies_track[body_id][1]
        if (head, body_id) not in cos_track:
            tfidf_cos = cosine_similarity(head_tfidf, body_tfidf)[0].reshape(1, 1)
            cos_track[(head, body_id)] = tfidf_cos
        else:
            tfidf_cos = cos_track[(head, body_id)]
        feat_vec = np.squeeze(np.c_[head_tf, body_tf, tfidf_cos])
        test_set.append(feat_vec)

    return test_set 
Example 73
Project: gtd   Author: JeffAbrahamson   File: find_similar.py    GNU General Public License v2.0 4 votes vote down vote up
def find_similar(input_filename, target):
    """Find some phrases similar to target.

    """
    dataframe = gtd_load(input_filename, 'tasks')
    labels = dataframe.label.unique()
    print('Got {n} labels.'.format(n=len(labels)))

    max_score = .5
    # vectorizer = CountVectorizer(analyzer='word')
    # vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,2))
    # vectorizer = HashingVectorizer(analyzer='word', ngram_range=(1,2))
    # For TF-IDF, the distance scores are larger, so use a higher threshold.
    max_score = .8
    vectorizer = TfidfVectorizer(analyzer='word')
    # vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2))
    print('Learning model (TF-IDF)...')
    ft_matrix = vectorizer.fit_transform(labels)
    print('Got model: {r}x{c}.'.format(r=ft_matrix.shape[0], c=ft_matrix.shape[1]))
    cosine_distance = 1 - cosine_similarity(ft_matrix)
    target_index = list(labels).index(target)
    print('Found target at index {i}'.format(i=target_index))
    if len(labels) != cosine_distance.shape[0]:
        print('Warning: {num_labels} labels, {num_dist} distances'.format(
            num_labels=len(labels), num_dist=cosine_distance.shape[0]))

    print('Searching for similarities (among {n})...'.format(n=len(labels)))
    similar = []
    for pattern_index in range(len(labels)):
        pattern = labels[pattern_index]
        if pattern != target:
            score = cosine_distance[target_index, pattern_index]
            if score < max_score:
                similar.append((pattern_index, score))
    similar.sort(key=itemgetter(1))
    print(len(similar))
    print(target)
    for candidate in similar[:10]:
        print('  {score:.2} {phrase}'.format(
            score=candidate[1], phrase=labels[candidate[0]]))
    print(target)
    for candidate in similar[90:100]:
        print('  {score:.2} {phrase}'.format(
            score=candidate[1], phrase=labels[candidate[0]])) 
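
The core of the search above converts TF-IDF cosine similarity into a distance and ranks the other phrases by that distance to the target. A compact sketch with toy labels:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

labels = ["buy milk", "buy oat milk", "file tax return", "pay rent"]  # hypothetical task labels
ft_matrix = TfidfVectorizer(analyzer='word').fit_transform(labels)
cosine_distance = 1 - cosine_similarity(ft_matrix)  # (n_labels, n_labels)
target_index = labels.index("buy milk")
ranked = np.argsort(cosine_distance[target_index])  # most similar phrases first (the target itself ranks first)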
Example 74
Project: gtd   Author: JeffAbrahamson   File: plot_similar.py    GNU General Public License v2.0 4 votes vote down vote up
def plot_similar(input_filename, output_filename, target, width, height):
    """Find some phrases similar to target.

    """
    dataframe = gtd_load(
        input_filename, 'tasks')
    labels = dataframe.label.unique()
    print('Got {n} labels.'.format(n=len(labels)))
    labels = np.random.choice(labels, 1000, False)
    if target not in labels:
        labels = np.append(labels, target)
    print('  Sub-sampled to {n} labels.'.format(n=len(labels)))

    print('Learning model...')
    vectorizer = HashingVectorizer(analyzer='word')
    ft_matrix = vectorizer.fit_transform(labels)
    print('Got model: {r}x{c}.'.format(r=ft_matrix.shape[0], c=ft_matrix.shape[1]))
    cosine_distance = 1 - cosine_similarity(ft_matrix)
    
    target_index = list(labels).index(target)
    print('Found target at index {i}'.format(i=target_index))
    if len(labels) != cosine_distance.shape[0]:
        print('Warning: {num_labels} labels, {num_dist} distances'.format(
            num_labels=len(labels), num_dist=cosine_distance.shape[0]))

    # Two components as we're plotting points in a two-dimensional plane.
    # "Precomputed" because we provide a distance matrix.
    # We specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(cosine_distance) # shape (n_components, n_samples)
    print('MDS projection completed.')

    print('Scattering (among {n} points)...'.format(n=len(labels)))
    for pattern_index in range(len(labels)):
        distance = cosine_distance[target_index, pattern_index]
        pattern = labels[pattern_index]
        if pattern == target:
            color = 'black'
            ecolor = 'black'
        elif distance < .5:
            color = 'blue'
            ecolor = 'none'
        else:
            color = 'cyan'
            ecolor = 'none'
        plt.scatter(x=pos[pattern_index, 0], y=pos[pattern_index, 1], \
                    c=color, edgecolor=ecolor)
    fig = plt.gcf()
    fig.set_size_inches(width, height)
    plt.savefig(output_filename, dpi=100)
    print('Wrote {fn}'.format(fn=output_filename)) 
Example 75
Project: ijcai2019-relis   Author: UKPLab   File: ner_cos_vector.py    MIT License 4 votes vote down vote up
def __call__(self,summary_list):
        summary_features = []
        doc_summ_list = []
        for doc in self.documents:
            #cleaned_doc = ' '.join(sent2stokens(doc,self.stemmer,LANGUAGE))
            cleaned_doc = ' '.join(sent2stokens_wostop(doc,self.stemmer,self.stoplist,LANGUAGE))
            doc_summ_list.append(cleaned_doc)
        for sum_idxs in summary_list:
            summary = []
            for idx in sum_idxs:
                summary.append(self.sentences[idx])
            #cleaned_summ = ' '.join(sent2stokens(' '.join(summary),self.stemmer,LANGUAGE))
            cleaned_summ = ' '.join(sent2stokens_wostop(' '.join(summary),self.stemmer,self.stoplist,LANGUAGE))
            doc_summ_list.append(cleaned_summ)
            summary_features.append([len(sum_idxs)])

        ### get tf cosine similarity
        tf_matrix = TfidfVectorizer(use_idf=False,ngram_range=(1,1),tokenizer=word_tokenize).fit_transform(doc_summ_list)
        for ii in range(len(self.documents),tf_matrix._shape[0]):
            cos_list = []
            for jj in range(len(self.documents)):
                cos_list.append(cosine_similarity(tf_matrix[ii,:],tf_matrix[jj,:]))
            summary_features[ii-len(self.documents)].append(np.max(cos_list))
            summary_features[ii-len(self.documents)].append(np.min(cos_list))
            summary_features[ii-len(self.documents)].append(np.mean(cos_list))
            summary_features[ii-len(self.documents)].append(np.std(cos_list))

        ### get tf-idf cosine similarity. Note that the idf weights come from the original documents
        tfidf_vec = TfidfVectorizer(use_idf=True,ngram_range=(1,1),tokenizer=word_tokenize)
        tf_idf_matrix = tfidf_vec.fit_transform(doc_summ_list[:len(self.documents)])
        summ_tf = CountVectorizer(vocabulary=tfidf_vec.vocabulary_).fit_transform(doc_summ_list[len(self.documents):])
        for ii,tf in enumerate(summ_tf.toarray()):
            ntf = tf*tfidf_vec.idf_/np.linalg.norm(tf)
            cos_list = []
            for jj in range(len(self.documents)):
                cos_list.append(cosine_similarity(ntf.reshape(1,-1),tf_idf_matrix[jj,:]))
            summary_features[ii].append(np.min(cos_list))
            summary_features[ii].append(np.max(cos_list))
            summary_features[ii].append(np.mean(cos_list))
            summary_features[ii].append(np.std(cos_list))

        return np.array(summary_features) 
Example 76
Project: ijcai2019-relis   Author: UKPLab   File: cross_topic_sentence_vector.py    MIT License 4 votes vote down vote up
def __call__(self,summary_list):
        summary_features = []
        doc_summ_list = []
        for doc in self.documents:
            #cleaned_doc = ' '.join(sent2stokens(doc,self.stemmer,LANGUAGE))
            cleaned_doc = ' '.join(sent2stokens_wostop(doc,self.stemmer,self.stoplist,LANGUAGE))
            doc_summ_list.append(cleaned_doc)
        for sum_idxs in summary_list:
            summary = []
            for idx in sum_idxs:
                summary.append(self.sentences[idx])
            #cleaned_summ = ' '.join(sent2stokens(' '.join(summary),self.stemmer,LANGUAGE))
            cleaned_summ = ' '.join(sent2stokens_wostop(' '.join(summary),self.stemmer,self.stoplist,LANGUAGE))
            doc_summ_list.append(cleaned_summ)
            summary_features.append([len(sum_idxs)])

        ### get tf cosine similarity
        tf_matrix = TfidfVectorizer(use_idf=False,ngram_range=(1,1),tokenizer=word_tokenize).fit_transform(doc_summ_list)
        for ii in range(len(self.documents),tf_matrix._shape[0]):
            cos_list = []
            for jj in range(len(self.documents)):
                cos_list.append(cosine_similarity(tf_matrix[ii,:],tf_matrix[jj,:]))
            summary_features[ii-len(self.documents)].append(np.max(cos_list))
            summary_features[ii-len(self.documents)].append(np.min(cos_list))
            summary_features[ii-len(self.documents)].append(np.mean(cos_list))
            summary_features[ii-len(self.documents)].append(np.std(cos_list))

        ### get tf-idf cosine similarity. Note that the idf weights come from the original documents
        tfidf_vec = TfidfVectorizer(use_idf=True,ngram_range=(1,1),tokenizer=word_tokenize)
        tf_idf_matrix = tfidf_vec.fit_transform(doc_summ_list[:len(self.documents)])
        summ_tf = CountVectorizer(vocabulary=tfidf_vec.vocabulary_).fit_transform(doc_summ_list[len(self.documents):])
        for ii,tf in enumerate(summ_tf.toarray()):
            ntf = tf*tfidf_vec.idf_/np.linalg.norm(tf)
            cos_list = []
            for jj in range(len(self.documents)):
                cos_list.append(cosine_similarity(ntf.reshape(1,-1),tf_idf_matrix[jj,:]))
            summary_features[ii].append(np.min(cos_list))
            summary_features[ii].append(np.max(cos_list))
            summary_features[ii].append(np.mean(cos_list))
            summary_features[ii].append(np.std(cos_list))

        return np.array(summary_features) 
Example 77
Project: JobFunnel   Author: PaulMcInnis   File: filters.py    MIT License 4 votes vote down vote up
def tfidf_filter(cur_dict: Dict[str, dict], prev_dict: Dict[str, dict],
                 max_similarity: float = 0.75):
    """ Fit a TFIDF vectorizer to a corpus of all listing's text

        Args:
            cur_dict: the existing masterlist job dict
            prev_dict: today's job scrape dict
            max_similarity: similarity threshold at or above which a blurb is treated as a duplicate

        Returns:
            list of duplicate job ids which were removed from cur_dict

        @TODO skip calculating metric for jobs which have the same job id!
    """
    # init vectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode',
                                 lowercase=True,
                                 analyzer='word')
    # get reference words as list
    reference_words = [job['blurb'] for job in prev_dict.values()]

    # get query words as list
    query_words, query_ids = [], []
    for job in cur_dict.values():
        query_words.append(job['blurb'])
        query_ids.append(job['id'])

    # fit vectorizer to entire corpus
    vectorizer.fit(query_words + reference_words)

    # set reference tfidf for cosine similarity later
    references = vectorizer.transform(reference_words)

    # calculate cosine similarity between reference and current blurbs
    similarities = cosine_similarity(
        vectorizer.transform(query_words), references)

    # get duplicate job ids and pop them
    duplicate_ids = []
    for sim, query_id in zip(similarities, query_ids):
        if np.max(sim) >= max_similarity:
            duplicate_ids.append(cur_dict.pop(query_id)['id'])

    # log the number of unique and duplicate listings
    logging.info("found {} unique listings and {} duplicate listings "
                 "via TFIDF cosine similarity".format(len(cur_dict.keys()),
                                                      len(duplicate_ids)))
    return duplicate_ids 
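
The duplicate test at the end reduces to asking whether a query blurb's best cosine similarity against the reference corpus reaches the threshold. A small sketch with made-up blurbs:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

reference_words = ["senior python developer, remote", "junior data analyst"]  # hypothetical blurbs
query_words = ["remote senior python developer", "registered nurse"]
vectorizer = TfidfVectorizer(strip_accents='unicode', lowercase=True, analyzer='word')
vectorizer.fit(query_words + reference_words)
similarities = cosine_similarity(vectorizer.transform(query_words),
                                 vectorizer.transform(reference_words))
is_duplicate = similarities.max(axis=1) >= 0.75  # True only for the near-identical blurb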
Example 78
Project: VariationalAutoRegressive   Author: likecoffee   File: evaluate.py    GNU General Public License v3.0 4 votes vote down vote up
def eval_emb_metrics(hypothesis, references, embedding_array):
    emb_hyps = []
    avg_emb_hyps = []
    extreme_emb_hyps = []
    for hyp in hypothesis:
        embs = np.array([embedding_array[word] for word in hyp])
        avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0))
        maxemb = np.max(embs, axis=0)
        minemb = np.min(embs, axis=0)
        extreme_emb = np.array(list(map(lambda x, y: x if ((x>y or x<-y) and y>0) or ((x<y or x>-y) and y<0) else y, maxemb, minemb)))

        emb_hyps.append(embs)
        avg_emb_hyps.append(avg_emb)
        extreme_emb_hyps.append(extreme_emb)

    emb_refs = []
    avg_emb_refs = []
    extreme_emb_refs = []
    for ref in references:
        embs = np.array([embedding_array[word] for word in ref])
        avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0))
        #avg_emb = np.mean(embs,axis=0)
        maxemb = np.max(embs, axis=0)
        minemb = np.min(embs, axis=0)
        extreme_emb = np.array(list(map(lambda x, y: x if ((x>y or x<-y) and y>0) or ((x<y or x>-y) and y<0) else y, maxemb, minemb)))
        emb_refs.append(embs)
        avg_emb_refs.append(avg_emb)
        extreme_emb_refs.append(extreme_emb)

    avg_cos_similarity = np.array([cos_sim(hyp,ref) for hyp,ref in zip(avg_emb_hyps,avg_emb_refs)])
    avg_cos_similarity = avg_cos_similarity.mean()
    extreme_cos_similarity = np.array([cos_sim(hyp, ref) for hyp, ref in zip(extreme_emb_hyps, extreme_emb_refs)])
    extreme_cos_similarity = extreme_cos_similarity.mean()

    scores = []
    for emb_ref, emb_hyp in zip(emb_refs, emb_hyps):
        simi_matrix = cosine_similarity(emb_ref, emb_hyp)
        dir1 = simi_matrix.max(axis=0).mean()
        dir2 = simi_matrix.max(axis=1).mean()
        scores.append((dir1+dir2)/2)
    greedy_scores = np.mean(scores)

    return avg_cos_similarity,extreme_cos_similarity,greedy_scores 
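
The greedy score in the final loop matches every reference token with its best hypothesis token and vice versa through the pairwise cosine matrix, then averages the two directions. A sketch with random embeddings:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

emb_ref = np.random.rand(7, 300)  # assumed embeddings of the reference tokens
emb_hyp = np.random.rand(5, 300)  # assumed embeddings of the hypothesis tokens
simi_matrix = cosine_similarity(emb_ref, emb_hyp)  # (n_ref_tokens, n_hyp_tokens)
dir1 = simi_matrix.max(axis=0).mean()  # best reference match for each hypothesis token
dir2 = simi_matrix.max(axis=1).mean()  # best hypothesis match for each reference token
greedy_score = (dir1 + dir2) / 2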
Example 79
Project: hoaxbait   Author: shril   File: util.py    Apache License 2.0 4 votes vote down vote up
def pipeline_test(test, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer):

    """

    Process test set

    Args:
        test: FNCData object, test set
        bow_vectorizer: sklearn CountVectorizer
        tfreq_vectorizer: sklearn TfidfTransformer(use_idf=False)
        tfidf_vectorizer: sklearn TfidfVectorizer()

    Returns:
        test_set: list, of numpy arrays

    """

    # Initialise
    test_set = []
    heads_track = {}
    bodies_track = {}
    cos_track = {}

    # Process test set
    for instance in test.instances:
        head = instance['Headline']
        body_id = instance['Body ID']
        if head not in heads_track:
            head_bow = bow_vectorizer.transform([head]).toarray()
            head_tf = tfreq_vectorizer.transform(head_bow).toarray()[0].reshape(1, -1)
            head_tfidf = tfidf_vectorizer.transform([head]).toarray().reshape(1, -1)
            heads_track[head] = (head_tf, head_tfidf)
        else:
            head_tf = heads_track[head][0]
            head_tfidf = heads_track[head][1]
        if body_id not in bodies_track:
            body_bow = bow_vectorizer.transform([test.bodies[body_id]]).toarray()
            body_tf = tfreq_vectorizer.transform(body_bow).toarray()[0].reshape(1, -1)
            body_tfidf = tfidf_vectorizer.transform([test.bodies[body_id]]).toarray().reshape(1, -1)
            bodies_track[body_id] = (body_tf, body_tfidf)
        else:
            body_tf = bodies_track[body_id][0]
            body_tfidf = bodies_track[body_id][1]
        if (head, body_id) not in cos_track:
            tfidf_cos = cosine_similarity(head_tfidf, body_tfidf)[0].reshape(1, 1)
            cos_track[(head, body_id)] = tfidf_cos
        else:
            tfidf_cos = cos_track[(head, body_id)]
        feat_vec = np.squeeze(np.c_[head_tf, body_tf, tfidf_cos])
        test_set.append(feat_vec)

    return test_set 
Example 80
Project: verifytweet   Author: kamidipreetham   File: text.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def get_similarity(self, extracted_tweet: str, same_day_tweets: list):
        """Calculates a similarity matrix.

        Calculates a similarity matrix over the corpus containing the
        extracted tweet and the tweets aggregated from the Twitter Search API,
        using a cosine similarity approach.

        Attributes:
            extracted_tweet: A string denoting the tweet extracted from the image.
            same_day_tweets: A list containing tweets from the target date

        Returns:
            A tuple containing a similarity matrix, which is a numpy array,
            as well as the ResultStatus enum, which reports the result status.
            For example: ::

                ([[1.        0.9258201]
                 [0.9258201 1.       ]], ResultStatus.ALL_OKAY)


        """
        if not isinstance(extracted_tweet, str) or not isinstance(
                same_day_tweets, list):
            raise TypeError(
                'Extracted tweet must be type str and Same day tweets must be type list'
            )
        if not extracted_tweet or not same_day_tweets:
            raise ValueError(
                'Extracted tweet must be a valid string and same day tweets must be a valid list'
            )
        logger.info('Processing similarity of two tweets...')
        corpus = list()
        corpus.append(extracted_tweet)
        corpus.extend(same_day_tweets)
        logger.debug('Corpus: ' + str(corpus))
        try:
            sparse_matrix = count_vectorizer.fit_transform(corpus)
            similarity_matrix = cosine_similarity(sparse_matrix, sparse_matrix)
        except Exception as e:
            logger.exception(e)
            return (None, ResultStatus.MODULE_FAILURE)
        logger.debug('Similarity Matrix: ' + str(similarity_matrix))
        return (similarity_matrix, ResultStatus.ALL_OKAY)
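
A minimal sketch of the corpus construction above, assuming count_vectorizer is a sklearn CountVectorizer and using hypothetical tweet text; row 0 of the resulting matrix holds the extracted tweet's similarity to itself and to each same-day tweet:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count_vectorizer = CountVectorizer()
extracted_tweet = "launch delayed to friday"               # hypothetical OCR text
same_day_tweets = ["the launch was delayed until friday"]  # hypothetical search results
corpus = [extracted_tweet] + same_day_tweets
sparse_matrix = count_vectorizer.fit_transform(corpus)
similarity_matrix = cosine_similarity(sparse_matrix, sparse_matrix)  # (len(corpus), len(corpus))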