from collections import defaultdict, deque

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import CONSTANT
from util import Config, Timer, log, timeit

NUM_OP = [np.std, np.mean]


def get_tfidf_vector(X, key_name=None, max_features=10, sparse=False):
    """Encode a text / multi-value categorical column as TF-IDF features."""
    v = TfidfVectorizer(max_features=max_features, dtype=np.float32)
    sparse_output = v.fit_transform(X)
    if sparse:
        return sparse_output

    Y = pd.DataFrame(sparse_output.toarray())
    Y.index = X.index
    # Note: get_feature_names() was renamed to get_feature_names_out() in
    # newer scikit-learn releases.
    feature_name = v.get_feature_names()
    # The fitted vocabulary can be smaller than max_features, so iterate over
    # the actual feature names rather than range(max_features).
    col_name_dict = {i: 'n_' + key_name + '_' + feature_name[i]
                     for i in range(len(feature_name))}
    return Y.rename(col_name_dict, axis="columns")


def get_m2m_features(u):
    # u is a concatenation of rows from the left table ('u') and the right
    # table ('v'); the last column marks the origin and the second-to-last
    # column holds the join key. For each 'u' row, pick up the most recently
    # seen 'v' row with the same key (NaNs when there is none).
    data = u.values
    group_index = {}
    result_data = []
    for i in range(u.shape[0]):
        tmp = data[i]
        tmp_index = tmp[-1]
        if tmp_index[0] == "v":
            group_index[tmp[-2]] = tmp
        else:
            if tmp[-2] in group_index:
                result_data.append(group_index[tmp[-2]])
            else:
                result_data.append(list(np.ones(tmp.shape) * np.nan))

    result_data = pd.DataFrame(np.array(result_data))
    result_data.columns = u.columns
    result_data.drop(["index"], axis=1, inplace=True)

    # Multi-cat columns: keep the first token as a categorical value and stash
    # the full token list in a parallel 'mul_feature_' column.
    for c in [c for c in result_data if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
        try:
            result_data[c].fillna("0", inplace=True)
            result_data[c] = result_data[c].astype(str)
            result_data["mul_feature_" + c] = result_data[c].str.split(",")
            result_data[c] = result_data["mul_feature_" + c].apply(lambda x: int(x[0]))
        except (ValueError, TypeError):
            # Drop multi-cat columns whose first token is not numeric.
            result_data.drop([c], axis=1, inplace=True)
            continue

    return result_data


def bfs(root_name, graph, tconfig):
    """Assign a depth to every table reachable from the main table."""
    tconfig[CONSTANT.MAIN_TABLE_NAME]['depth'] = 0
    queue = deque([root_name])
    while queue:
        u_name = queue.popleft()
        for edge in graph[u_name]:
            v_name = edge['to']
            if 'depth' not in tconfig[v_name]:
                tconfig[v_name]['depth'] = tconfig[u_name]['depth'] + 1
                queue.append(v_name)
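
def _demo_get_tfidf_vector():
    # Hedged usage sketch on toy data (not part of the pipeline). The default
    # TfidfVectorizer tokenizer splits on commas, so comma-separated
    # multi-value strings work directly; output columns are 'n_<key>_<token>'.
    docs = pd.Series(["red,blue", "blue,green", "red"])
    feats = get_tfidf_vector(docs, key_name="m_01", max_features=5)
    print(feats.columns.tolist())  # ['n_m_01_blue', 'n_m_01_green', 'n_m_01_red']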
@timeit
def join(u, v, v_name, key, type_):
    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    if type_.split("_")[2] == 'many':
        # *-to-many: aggregate v per key before joining.
        for c in v.columns:
            if c != key and c.startswith(CONSTANT.CATEGORY_PREFIX):
                v[c] = v[c].apply(lambda x: int(x))

        agg_funcs = {col: Config.aggregate_op(col) for col in v
                     if col != key
                     and not col.startswith(CONSTANT.TIME_PREFIX)
                     and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
                     and 'mul_feature_' not in col}
        v = v.fillna(0).groupby(key).agg(agg_funcs)
        v.columns = v.columns.map(
            lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

        # Remove duplicated COUNT columns: every aggregated column yields the
        # same group size, so keep only the first COUNT_ROLLING5 column.
        c_tmp = None
        count = 0
        for c in v.columns:
            if 'COUNT_ROLLING5' in c:
                if c_tmp is None:
                    c_tmp = v[c]
                else:
                    v.drop(c, axis=1, inplace=True)
                    count += 1
        print("There are %d duplicated columns in temporal join!" % count)

        v.reset_index(0, inplace=True)
        v = v.set_index(key)
    else:
        # Multi-cat features of the joined table are handled here; those of
        # the main table are processed in merge_table().
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0", inplace=True)
            v["mul_feature_" + c] = v[c].apply(lambda x: str(x).split(","))
            v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))

        v = v.set_index(key)
        v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")

    return u.join(v, on=key)
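
def _demo_agg_column_naming():
    # Hedged illustration (toy data; the real aggregations come from
    # Config.aggregate_op, here hardcoded as ["mean", "count"]): how the
    # *-to-many branch of join() flattens the MultiIndex columns produced by
    # groupby().agg() into single feature names.
    df = pd.DataFrame({"c_01": [1, 1, 2], "n_1": [0.5, 1.5, 2.0]})
    agg = df.groupby("c_01").agg({"n_1": ["mean", "count"]})
    agg.columns = agg.columns.map(
        lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5(table_1.{a[0]})")
    # With the usual 'n_' prefix this prints:
    # ['n_MEAN_ROLLING5(table_1.n_1)', 'n_COUNT_ROLLING5(table_1.n_1)']
    print(agg.columns.tolist())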
@timeit
def temporal_join_jinnian(u, v, v_name, key, time_col, type_):
    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    if type_.split("_")[2] == 'many':
        timer = Timer()
        tmp_u = u[[time_col, key]]
        timer.check("select")

        for c in v.columns:
            if c != key and c.startswith(CONSTANT.CATEGORY_PREFIX):
                v[c] = v[c].apply(lambda x: int(x))
        tmp_u = pd.concat([tmp_u, v], sort=False)
        timer.check("concat")

        agg_funcs = {col: Config.aggregate_op(col) for col in v
                     if col != key
                     and not col.startswith(CONSTANT.TIME_PREFIX)
                     and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
                     and 'mul_feature_' not in col}
        # The rolling-window variant (groupby(rehash_key).rolling(5)) is
        # disabled; aggregate per key instead.
        tmp_u = tmp_u.fillna(0).groupby(key).agg(agg_funcs)
        timer.check("group & agg")

        tmp_u.columns = tmp_u.columns.map(
            lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

        # Remove duplicated COUNT columns (all aggregations share the group size).
        c_tmp = None
        count = 0
        for c in tmp_u.columns:
            if 'COUNT_ROLLING5' in c:
                if c_tmp is None:
                    c_tmp = tmp_u[c]
                else:
                    tmp_u.drop(c, axis=1, inplace=True)
                    count += 1
        print("There are %d duplicated columns in temporal join!" % count)

        tmp_u.reset_index(0, inplace=True)

        if tmp_u.empty:
            log("empty tmp_u, return u")
            return u

        # Merge the aggregated features back while preserving u's row order:
        # pd.merge discards the original index, so remember it explicitly.
        # Every key of u appears in tmp_u (it aggregates u's own rows too),
        # so the inner merge keeps all rows of u.
        index_tmp = u.index
        u["index"] = list(range(0, len(index_tmp)))
        ret = pd.merge(u, tmp_u, on=[key])
        ret.sort_values("index", inplace=True)
        ret.index = index_tmp
        ret.drop("index", axis=1, inplace=True)
        timer.check("final merge")

        del tmp_u
        return ret
    else:
        # Multi-cat features of the joined table are handled here; those of
        # the main table are processed in merge_table().
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0", inplace=True)
            v["mul_feature_" + c] = v[c].apply(lambda x: str(x).split(","))
            v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))

        v = v.set_index(key)
        v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")
        return u.join(v, on=key)
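
def _demo_order_preserving_merge():
    # Sketch (toy data) of the trick temporal_join_jinnian() uses to keep the
    # left table's row order and index after pd.merge, which preserves neither.
    u = pd.DataFrame({"c_01": [2, 1, 2]}, index=[10, 11, 12]).copy()
    stats = pd.DataFrame({"c_01": [1, 2], "n_X": [0.1, 0.2]})
    orig_index = u.index
    u["index"] = range(len(u))             # remember the original row order
    ret = pd.merge(u, stats, on=["c_01"])  # merge may reorder rows
    ret.sort_values("index", inplace=True) # restore the order
    ret.index = orig_index                 # restore the original index
    ret.drop("index", axis=1, inplace=True)
    print(ret)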
@timeit
def temporal_join(u, v, v_name, key, time_col):
    # Kept for reference; dfs() below uses temporal_join_jinnian() instead.
    timer = Timer()
    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")

    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    rehash_key = f'rehash_{key}'
    tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    timer.check("rehash_key")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {col: Config.aggregate_op(col) for col in v
                 if col != key
                 and not col.startswith(CONSTANT.TIME_PREFIX)
                 and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
                 and 'mul_feature_' not in col}
    # The rolling-window variant (groupby(rehash_key).rolling(5)) is disabled;
    # aggregate per key instead.
    tmp_u = tmp_u.groupby(key).agg(agg_funcs)
    timer.check("group & agg")

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u

    tmp_u.columns = tmp_u.columns.map(
        lambda a: f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}({v_name}.{a[0]})")

    # Merge back on the key while preserving u's original row order; tmp_u is
    # indexed by the key, so join it via right_index.
    tmpindex = u.index
    u["index"] = list(range(0, len(tmpindex)))
    ret = pd.merge(u, tmp_u, left_on=key, right_index=True)
    ret.sort_values("index", inplace=True)
    ret.index = tmpindex
    ret.drop("index", axis=1, inplace=True)
    timer.check("final merge")

    del tmp_u
    return ret


def dfs(u_name, config, tables, graph):
    u = tables[u_name]
    log(f"enter {u_name}")
    for edge in graph[u_name]:
        v_name = edge['to']
        # Only descend to strictly deeper tables to avoid cycles.
        if config['tables'][v_name]['depth'] <= config['tables'][u_name]['depth']:
            continue

        v = dfs(v_name, config, tables, graph)
        key = edge['key']
        type_ = edge['type']

        if config['time_col'] not in u and config['time_col'] in v:
            continue

        if config['time_col'] in u and config['time_col'] in v:
            log(f"join {u_name} <--{type_}--t {v_name}")
            u = temporal_join_jinnian(u, v, v_name, key, config['time_col'], type_)
        else:
            log(f"join {u_name} <--{type_}--nt {v_name}")
            u = join(u, v, v_name, key, type_)

        del v

    log(f"leave {u_name}")
    return u
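
def _demo_relation_reversal():
    # Sketch of how merge_table() derives the reverse edge type when it makes
    # the relation graph bidirectional.
    type_ = "many_to_one"
    print('_'.join(type_.split('_')[::-1]))  # one_to_many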
@timeit
def merge_table(tables, config):
    # Build a bidirectional relation graph; the edge type is reversed for the
    # B -> A direction (e.g. 'many_to_one' becomes 'one_to_many').
    graph = defaultdict(list)
    for rel in config['relations']:
        ta = rel['table_A']
        tb = rel['table_B']
        graph[ta].append({
            "to": tb,
            "key": rel['key'],
            "type": rel['type'],
        })
        graph[tb].append({
            "to": ta,
            "key": rel['key'],
            "type": '_'.join(rel['type'].split('_')[::-1]),
        })

    # Process multi-cat features in the main table: keep the first token as a
    # categorical value and stash the full token list in a 'mul_feature_'
    # column. (A TF-IDF variant via get_tfidf_vector() is possible here too.)
    v = tables[CONSTANT.MAIN_TABLE_NAME]
    for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
        v[c].fillna("0", inplace=True)
        v["mul_feature_" + c] = v[c].apply(lambda x: str(x).split(","))
        v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))
    tables[CONSTANT.MAIN_TABLE_NAME] = v

    bfs(CONSTANT.MAIN_TABLE_NAME, graph, config['tables'])
    return dfs(CONSTANT.MAIN_TABLE_NAME, config, tables, graph)
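
if __name__ == "__main__":
    # Hedged smoke test on toy tables (assumes the usual 'c_'/'n_' column
    # prefixes; table names and config keys here are illustrative). A
    # many_to_one relation takes the plain-join path through dfs().
    main_df = pd.DataFrame({"c_01": [1, 2, 1], "n_0": [0.1, 0.2, 0.3]})
    table_1 = pd.DataFrame({"c_01": [1, 2], "n_1": [10.0, 20.0]})
    demo_tables = {CONSTANT.MAIN_TABLE_NAME: main_df, "table_1": table_1}
    demo_config = {
        "tables": {CONSTANT.MAIN_TABLE_NAME: {}, "table_1": {}},
        "relations": [{
            "table_A": CONSTANT.MAIN_TABLE_NAME,
            "table_B": "table_1",
            "key": ["c_01"],
            "type": "many_to_one",
        }],
        "time_col": "t_01",
    }
    # Expect main_df's columns plus a joined 'n_table_1.n_1' feature.
    print(merge_table(demo_tables, demo_config))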