#-*- coding:utf-8 -*-
#refer: https://github.com/xionghhcs/chip2018/blob/c2bb9efc08eca521a9ef5d37d4b915fb4c2a69dc/src/feature_extractor.py
from nltk import ngrams
from sklearn.preprocessing import MinMaxScaler
from gensim.models import KeyedVectors
import gensim
from gensim.models import KeyedVectors
from collections import Counter
import sys
import time
import datetime
import copy
import pdb

from gensim.summarization.bm25 import get_bm25_weights, BM25
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import jaccard_similarity_score

#pip3 install python_Levenshtein,支持hamming, edit distance

class Similarity():
    def __init__(self, corpus = None):
        self.corpus = corpus
        if self.corpus != None:
            self.tfidf_vectorizer = self.get_tfidf_vectorizer(self.corpus)
            self.corpus_vec = self.tfidf_vectorizer.transform(self.corpus)
            self.bm25_model = BM25([s.split() for s in corpus])
            self.average_idf = sum(map(lambda k: float(self.bm25_model.idf[k]),
                                       self.bm25_model.idf.keys())) / len(self.bm25_model.idf.keys())

    def get_tfidf_vectorizer(self, corpus):
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_vectorizer.fit(corpus)
        return tfidf_vectorizer

    def get_vector(self, query):
        vec = self.tfidf_vectorizer.transform([query])
        return vec[0]

    def similarity(self, query, type):
        assert self.corpus != None, "self.corpus can't be None"
        ret = []
        if type == 'cosine':
            query = self.get_vector(query)
            for item in self.corpus_vec:
                sim = cosine_similarity(item, query)
                ret.append(sim[0][0])
        elif type == 'manhattan':
            query = self.get_vector(query)
            for item in self.corpus_vec:
                sim = manhattan_distances(item, query)
                ret.append(sim[0][0])
        elif type == 'euclidean':
            query = self.get_vector(query)
            for item in self.corpus_vec:
                sim = euclidean_distances (item, query)
                ret.append(sim[0][0])
        #elif type == 'jaccard':
        #    #query = query.split()
        #    query = self.get_vector(query)
        #    for item in self.corpus_vec:
        #        pdb.set_trace()
        #        sim = jaccard_similarity_score(item, query)
        #        ret.append(sim)
        elif type == 'bm25':
            query = query.split()
            ret = self.bm25_model.get_scores(query)
        else:
            raise ValueError('similarity type error:%s'%type)
        return ret

if __name__ == '__main__':
    corpus = ['帮我 打开 灯','打开 空调', '关闭 空调','关灯','音量 调高','声音 调高']
    sim = Similarity(corpus)
    print(sim.similarity('打开 灯','cosine'))
    print(sim.similarity('打开 灯','manhattan'))
    print(sim.similarity('打开 灯','euclidean'))
    print(sim.similarity('打开 灯','bm25'))