import gensim
import distance
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from fuzzywuzzy import fuzz
from gensim import corpora
from gensim.models import KeyedVectors
from nltk import ngrams
from closer.config import dataset_config, model_config
from closer.data_utils import bm25
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics.pairwise import manhattan_distances as md
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import jaccard_similarity_score as jsc  # removed in scikit-learn >= 0.23 (see jaccard_score)
from sklearn.neighbors import DistanceMetric
from sklearn.preprocessing import MinMaxScaler
from simhash import Simhash

minkowski_dis = DistanceMetric.get_metric('minkowski')


class FeatureCreator(object):

    def __init__(self, train_df, test_df, unlabeled_df, data_loader, normalization=True):
        self.train_df = train_df
        self.test_df = test_df
        self.unlabeled_df = unlabeled_df
        self.data_loader = data_loader
        self.stopwords = self.data_loader.load_stopwords()
        self.normalization = normalization

        # Tokenize every sentence column once so the feature builders can reuse the splits.
        self.train_df['splited_spn_1'] = self.train_df['spn_1'].apply(lambda v: v.split())
        self.train_df['splited_spn_2'] = self.train_df['spn_2'].apply(lambda v: v.split())
        self.unlabeled_df['splited_spn_1'] = self.unlabeled_df['spn_1'].apply(lambda v: v.split())
        self.test_df['splited_spn_1'] = self.test_df['spn_1'].apply(lambda v: v.split())
        self.test_df['splited_spn_2'] = self.test_df['spn_2'].apply(lambda v: v.split())

        docs = self.train_df['splited_spn_1'].tolist() + self.train_df['splited_spn_2'].tolist() + \
               self.test_df['splited_spn_1'].tolist() + self.test_df['splited_spn_2'].tolist() + \
               self.unlabeled_df['splited_spn_1'].tolist()
        # Deduplicate via tuples: np.unique is unreliable on ragged lists of token lists.
        docs = [list(doc) for doc in sorted(set(tuple(doc) for doc in docs))]

        docs_raw = self.train_df['spn_1'].tolist() + self.train_df['spn_2'].tolist() + \
                   self.test_df['spn_1'].tolist() + self.test_df['spn_2'].tolist() + \
                   self.unlabeled_df['spn_1'].tolist()
        docs_raw = sorted(set(docs_raw))

        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_vectorizer.fit(docs_raw)
        self.bm25_scorer = bm25.bm25Scorer(docs=docs)

        #print("[FE] Loading the word2vec model")
        #self.word2vec_model = KeyedVectors.load_word2vec_format(dataset_config.SPANISH_WORDVEC_PATH)
        #self.word2vec_model.init_sims(replace=True)
        #print("[FE] Loaded the word2vec model")

        self.build_statistic()

    def build_statistic(self):
        sentences = self.train_df['splited_spn_1'].tolist() + self.train_df['splited_spn_2'].tolist() + \
                    self.test_df['splited_spn_1'].tolist() + self.test_df['splited_spn_2'].tolist() + \
                    self.unlabeled_df['splited_spn_1'].tolist()
        self.sentences = [list(s) for s in sorted(set(tuple(s) for s in sentences))]

        words = [w for sentence in self.sentences for w in sentence]
        counts = Counter(words)
        self.weights = {word: self._get_weight(count) for word, count in counts.items()}

        self.dictionary = corpora.Dictionary(self.sentences)
        self.dictionary.compactify()
        print("Number of words in the dictionary = %s" % len(self.dictionary.token2id))

    def create_features(self):
        for df in [self.train_df, self.test_df]:
            # create frequency features
            print("[FE] create the frequency features")
            self._create_frequency_features(df)
            '''Move to preprocessing notebook.
            # create word2vec features
            #print("[FE] create the word2vec features")
            #self._create_word2vec_features(df)

            # create hash features
            #print("[FE] creating the hash features")
            #self._create_hash_features(df)

            #print("[FE] creating the topic features")
            #self._create_topic_features(df)
            '''
            # create IR features
            print("[FE] creating the IR features")
            self._create_IR_features(df)

            # create tf/idf weighted distance features
            print("[FE] creating the weighted distance features")
            self._create_weighted_distance_features(df)

            # create the length features
            print("[FE] creating the length features")
            self._create_length_features(df)

            # create the meta-information
            print("[FE] creating the weight features")
            self._create_weight_features(df)

            # create the distance features
            print("[FE] creating the distance features")
            self._create_distance_features(df)

            # create fuzzywuzzy features
            print("[FE] creating the fuzzy features")
            self._create_fuzzy_wuzzy_features(df)

            # create topic word features
            print("[FE] creating the topic word features")
            self._create_topic_word_features(df)

        print("[FE] TODO! Create the graph features")
        print("[FE] Feature engineered. With features", self.test_df.columns.values)
        return self.train_df, self.test_df

    def apply_normalization(self, train_df, test_df):
        # Fit each scaler on train and test jointly so both splits share one scale.
        all_df = pd.concat((train_df, test_df))
        for column in model_config.META_FEATURES:
            if column in all_df.columns:
                scaler = MinMaxScaler()
                all_df[column] = scaler.fit_transform(all_df[column].values.reshape(-1, 1))
            else:
                print("[FE-Norm] The column", column, "is not in the dataframe.")
        train_df, test_df = all_df.iloc[:len(train_df)], all_df.iloc[len(train_df):]
        return train_df, test_df

    def _create_frequency_features(self, df):
        ques = pd.concat([self.train_df[['spn_1', 'spn_2']],
                          self.test_df[['spn_1', 'spn_2']]], axis=0).reset_index(drop=True)
        q_dict = defaultdict(set)
        for i in range(ques.shape[0]):
            q_dict[ques.spn_1[i]].add(ques.spn_2[i])
            q_dict[ques.spn_2[i]].add(ques.spn_1[i])
        '''Might cause leakage
        def q1_freq(row):
            return len(q_dict[row['spn_1']])

        def q2_freq(row):
            return len(q_dict[row['spn_2']])

        def q1_q2_intersect(row):
            return len(q_dict[row['spn_1']].intersection(q_dict[row['spn_2']]))

        df['q1_q2_intersect'] = df[['spn_1', 'spn_2']].apply(lambda row: q1_q2_intersect(row), axis=1)
        df['q1_freq'] = df[['spn_1', 'spn_2']].apply(lambda row: q1_freq(row), axis=1)
        df['q2_freq'] = df[['spn_1', 'spn_2']].apply(lambda row: q2_freq(row), axis=1)
        '''

    def _create_word2vec_features(self, df):
        # Word Mover's Distance between the raw sentences.
        df['wmd_distance'] = df[['spn_1', 'spn_2']].apply(
            lambda row: self.word2vec_model.wmdistance(row['spn_1'], row['spn_2']), axis=1)

    def _create_hash_features(self, df):
        def get_word_ngrams(sequence, n=3):
            return [' '.join(ngram) for ngram in ngrams(sequence, n)]

        def get_character_ngrams(sequence, n=3):
            sequence = ' '.join(sequence)
            return [sequence[i:i + n] for i in range(len(sequence) - n + 1)]

        def calculate_simhash_distance(sequence1, sequence2):
            return Simhash(sequence1).distance(Simhash(sequence2))

        def calculate_all_simhash(row):
            # Each n-gram set is built from the original token lists; chaining the
            # assignments would hash n-grams of n-grams instead.
            q1, q2 = row['splited_spn_1'], row['splited_spn_2']
            simhash_distance = calculate_simhash_distance(q1, q2)
            simhash_distance_2gram = calculate_simhash_distance(get_word_ngrams(q1, 2), get_word_ngrams(q2, 2))
            simhash_distance_3gram = calculate_simhash_distance(get_word_ngrams(q1, 3), get_word_ngrams(q2, 3))
            simhash_distance_ch_2gram = calculate_simhash_distance(get_character_ngrams(q1, 2), get_character_ngrams(q2, 2))
            simhash_distance_ch_3gram = calculate_simhash_distance(get_character_ngrams(q1, 3), get_character_ngrams(q2, 3))
            return '{}:{}:{}:{}:{}'.format(simhash_distance, simhash_distance_2gram, simhash_distance_3gram,
                                           simhash_distance_ch_2gram, simhash_distance_ch_3gram)

        # Rows must arrive as Series because the lookups above are by column name,
        # so raw=True cannot be used here.
        df['sim_hash'] = df.apply(calculate_all_simhash, axis=1)
        df['simhash_distance'] = df['sim_hash'].apply(lambda x: float(x.split(':')[0]))
        df['simhash_distance_2gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[1]))
        df['simhash_distance_3gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[2]))
        df['simhash_distance_ch_2gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[3]))
        df['simhash_distance_ch_3gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[4]))

    def _create_weighted_distance_features(self, df):
        q1_matrix = self.tfidf_vectorizer.transform(df['spn_1'].values.tolist())
        q2_matrix = self.tfidf_vectorizer.transform(df['spn_2'].values.tolist())
        # Cosine similarity between each aligned pair of tf-idf rows.
        df['weighted_cosine_sim'] = np.concatenate(
            [cs(q1_matrix[i], q2_matrix[i]).flatten() for i in range(q1_matrix.shape[0])])
        #df['weighted_eucledian_dis'] = np.square((q1_matrix - q2_matrix).toarray()).sum(axis=1)

    def _create_weight_features(self, df):
        # _build_word_shares reads columns by name, so rows are passed as Series (no raw=True).
        df['word_shares'] = df.apply(self._build_word_shares, axis=1)

        def first_word_the_same(row):
            q1, q2 = row['splited_spn_1'], row['splited_spn_2']
            return bool(q1) and bool(q2) and q1[0] == q2[0]

        df['first_word_the_same'] = df[['splited_spn_1', 'splited_spn_2']].apply(first_word_the_same, axis=1)
        # word_shares is packed as 'R1:R2:...' where R1 is the tf-idf-weighted share
        # and R2 the plain count share, so the indices below follow that order.
        df['tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
        df['word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
        df['diff_tfidf_word_match'] = (df['word_match'] - df['tfidf_word_match']).abs()
        df['shared_count'] = df['word_shares'].apply(lambda x: float(x.split(':')[2]))
        df['bigram_corr'] = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
        df['trigram_corr'] = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
        df['word_match_no_stopwords'] = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
        df['unique_word_ratio'] = df[['splited_spn_1', 'splited_spn_2']].apply(
            lambda row: len(set(row['splited_spn_1']).union(row['splited_spn_2'])) /
                        (len(row['splited_spn_1']) + len(row['splited_spn_2'])), axis=1)

    def _create_length_features(self, df):
        def word_length_compare(row, cmp):
            return cmp(len(row['splited_spn_1']), len(row['splited_spn_2']))

        def char_length_compare(row, cmp):
            return cmp(len(str(row['spn_1']).replace(' ', '')),
                       len(str(row['spn_2']).replace(' ', '')))

        df['len_word_max'] = df[['splited_spn_1', 'splited_spn_2']].apply(lambda v: word_length_compare(v, max), axis=1)
        df['len_word_min'] = df[['splited_spn_1', 'splited_spn_2']].apply(lambda v: word_length_compare(v, min), axis=1)
        df['len_char_max'] = df[['spn_1', 'spn_2']].apply(lambda v: char_length_compare(v, max), axis=1)
        df['len_char_min'] = df[['spn_1', 'spn_2']].apply(lambda v: char_length_compare(v, min), axis=1)
        df['len_word_q1'] = df['splited_spn_1'].apply(len)
        df['len_word_q2'] = df['splited_spn_2'].apply(len)
        df['len_char_q1'] = df['spn_1'].apply(lambda x: len(str(x).replace(' ', '')))
        df['len_char_q2'] = df['spn_2'].apply(lambda x: len(str(x).replace(' ', '')))
        df['word_length_diff'] = (df['len_word_max'] - df['len_word_min']).abs()
        df['char_length_diff'] = (df['len_char_max'] - df['len_char_min']).abs()
        # Words per character, i.e. the inverse of the average word length.
        df['len_avg_word_1'] = df['len_word_q1'] / df['len_char_q1']
        df['len_avg_word_2'] = df['len_word_q2'] / df['len_char_q2']
        df['avg_word_diff'] = (df['len_avg_word_1'] - df['len_avg_word_2']).abs()
        def calculate_without_stops_features(row):
            q1_no_stopwords = set(row['splited_spn_1']).difference(self.stopwords)
            q2_no_stopwords = set(row['splited_spn_2']).difference(self.stopwords)
            return abs(len(q1_no_stopwords) - len(q2_no_stopwords))

        df['len_diff_remove_stopwords'] = df[['splited_spn_1', 'splited_spn_2']].apply(
            lambda v: calculate_without_stops_features(v), axis=1)

    def _create_topic_word_features(self, df):
        def add_word_count(df, word):
            df['q1_' + word] = df['splited_spn_1'].apply(lambda x: (word in x) * 1)  # * 1 casts the boolean to int
            df['q2_' + word] = df['splited_spn_2'].apply(lambda x: (word in x) * 1)
            df[word + '_both'] = df['q1_' + word] * df['q2_' + word]

        add_word_count(df, 'cómo')  # how
        add_word_count(df, 'qué')  # what
        # talk about myself

    def _create_distance_features(self, df):
        q1_csc, q2_csc = self._get_vectors(df, self.dictionary)
        cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis = \
            self._get_similarity_values(q1_csc, q2_csc)
        print("[FE] cosine_sim sample =\n", cosine_sim[0:2])
        print("[FE] manhattan_dis sample =\n", manhattan_dis[0:2])
        print("[FE] eucledian_dis sample =\n", eucledian_dis[0:2])
        print("[FE] jaccard_dis sample =\n", jaccard_dis[0:2])
        print("[FE] minkowsk_dis sample =\n", minkowsk_dis[0:2])

        # _get_similarity_values returns flat lists, which pandas accepts directly;
        # no reshape/flatten round-trip is needed.
        df['cosine_sim'] = cosine_sim
        df['manhattan_dis'] = manhattan_dis
        df['eucledian_dis'] = eucledian_dis
        df['jaccard_dis'] = jaccard_dis
        df['minkowsk_dis'] = minkowsk_dis

    def _create_fuzzy_wuzzy_features(self, df):
        df['fuzzy_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_set_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_set_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_partial_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.partial_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_token_sort_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_sort_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_qratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.QRatio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_WRatio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.WRatio(row['spn_1'], row['spn_2']), axis=1)

        def _get_longest_substr_ratio(a, b):
            strs = list(distance.lcsubstrings(a, b))
            if len(strs) == 0:
                return 0
            # +1 keeps the denominator non-zero for empty strings.
            return len(strs[0]) / (min(len(a), len(b)) + 1)

        df['longest_substr_ratio'] = df[['spn_1', 'spn_2']].apply(
            lambda row: _get_longest_substr_ratio(row['spn_1'], row['spn_2']), axis=1)

    def _create_IR_features(self, df):
        # BM25 is asymmetric, so score both directions.
        df['bm25_q1_to_q2'] = df[['splited_spn_1', 'splited_spn_2']].apply(
            lambda row: self.bm25_scorer.sim(row['splited_spn_1'], row['splited_spn_2']), axis=1)
        df['bm25_q2_to_q1'] = df[['splited_spn_1', 'splited_spn_2']].apply(
            lambda row: self.bm25_scorer.sim(row['splited_spn_2'], row['splited_spn_1']), axis=1)

    def _get_vectors(self, df, dictionary):
        question1_vec = [dictionary.doc2bow(text) for text in df.splited_spn_1.tolist()]
        question2_vec = [dictionary.doc2bow(text) for text in df.splited_spn_2.tolist()]
        question1_csc = gensim.matutils.corpus2csc(question1_vec, num_terms=len(dictionary.token2id))
        question2_csc = gensim.matutils.corpus2csc(question2_vec, num_terms=len(dictionary.token2id))
        # corpus2csc builds term-by-document matrices; transpose to document-by-term.
        return question1_csc.transpose(), question2_csc.transpose()

    def _get_similarity_values(self, q1_csc, q2_csc):
        cosine_sim = []
        manhattan_dis = []
        eucledian_dis = []
        jaccard_dis = []
        minkowsk_dis = []
        for i, j in zip(q1_csc, q2_csc):
            cosine_sim.append(cs(i, j)[0][0])
            manhattan_dis.append(md(i, j)[0][0])
            eucledian_dis.append(ed(i, j)[0][0])
            i_ = i.toarray()
            j_ = j.toarray()
            try:
                jaccard_dis.append(jsc(i_, j_))
            except Exception:
                jaccard_dis.append(0)
            minkowsk_dis.append(minkowski_dis.pairwise(i_, j_)[0][0])
        return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis

    def _get_weight(self, count, eps=10000, min_count=2):
        # Down-weight frequent words; words seen fewer than min_count times get weight 0.
        return 0 if count < min_count else 1 / (count + eps)

    def _build_word_shares(self, row):
        q1_list = row['splited_spn_1']
        q2_list = row['splited_spn_2']
        q1words = set(q1_list)
        q2words = set(q2_list)
        if len(q1words) == 0 or len(q2words) == 0:
            return '0:0:0:0:0:0'

        q1_no_stopwords = q1words.difference(self.stopwords)
        q2_no_stopwords = q2words.difference(self.stopwords)
        share_no_stopwords = q1_no_stopwords.intersection(q2_no_stopwords)

        q1_2gram = set(zip(q1_list, q1_list[1:]))
        q2_2gram = set(zip(q2_list, q2_list[1:]))
        shared_2gram = q1_2gram.intersection(q2_2gram)
        q1_3gram = set(self._generate_ngram(3, q1_list))
        q2_3gram = set(self._generate_ngram(3, q2_list))
        shared_3gram = q1_3gram.intersection(q2_3gram)

        if len(q1_2gram) + len(q2_2gram) == 0:
            R2gram = 0
        else:
            R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))
        if len(q1_3gram) + len(q2_3gram) == 0:
            R3gram = 0
        else:
            R3gram = len(shared_3gram) / (len(q1_3gram) + len(q2_3gram))

        shared_words = q1words.intersection(q2words)
        shared_weights = [self.weights.get(w, 0) for w in shared_words]
        total_weights = [self.weights.get(w, 0) for w in q1words] + [self.weights.get(w, 0) for w in q2words]
        total_weight = np.sum(total_weights)
        R1 = np.sum(shared_weights) / total_weight if total_weight > 0 else 0  # tf-idf-weighted share
        R2 = len(shared_words) / (len(q1words) + len(q2words))  # count share
        return '{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R2gram, R3gram, len(share_no_stopwords))

    def _generate_ngram(self, n, sentence):
        # Upper bound len(sentence) - n + 1 ensures every tuple has exactly n tokens.
        return [tuple(sentence[i:i + n]) for i in range(len(sentence) - n + 1)]
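
# Minimal usage sketch under assumptions: the 'spn_1'/'spn_2' column names come from
# this module, but the stub DataLoader below (anything exposing load_stopwords()) and
# the toy DataFrames are hypothetical stand-ins for what the closer package provides.
if __name__ == '__main__':
    class DataLoader(object):  # hypothetical stub; the real loader lives elsewhere in closer
        def load_stopwords(self):
            return set()

    train = pd.DataFrame({'spn_1': ['cómo estás hoy'], 'spn_2': ['cómo te encuentras hoy'], 'label': [1]})
    test = pd.DataFrame({'spn_1': ['qué hora es'], 'spn_2': ['qué día es hoy']})
    unlabeled = pd.DataFrame({'spn_1': ['me gusta leer']})

    creator = FeatureCreator(train, test, unlabeled, DataLoader())
    train_feats, test_feats = creator.create_features()
    # Min-max scaling of the meta features is a separate, explicit step.
    train_feats, test_feats = creator.apply_normalization(train_feats, test_feats)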