"""I/O helpers and sparse adjacency builders for knowledge-graph alignment models."""
import os
import time

import numpy as np
import scipy.sparse as sp
# `eigsh` is imported from its public location; the private
# `scipy.sparse.linalg.eigen.arpack` path is not available in recent SciPy.
from scipy.sparse.linalg import eigsh

from .input import read_dbp15k_input
from .preprocess import enhance_triples, generate_3hop_triples, generate_2hop_triples, \
    remove_unlinked_triples


def merge_dic(dic1, dic2):
    """Return a new dict holding `dic1` updated with `dic2` (`dic2` wins on key clashes)."""
    return {**dic1, **dic2}


def read_relation_triples(file_path):
    """Read tab-separated ``head<TAB>relation<TAB>tail`` lines.

    :param file_path: path of the triple file, or None.
    :return: ``(triples, entities, relations)`` as three sets; all empty when
        `file_path` is None.
    :raises AssertionError: if a line does not have exactly three fields.
    """
    print("read relation triples:", file_path)
    if file_path is None:
        return set(), set(), set()
    triples = set()
    entities, relations = set(), set()
    with open(file_path, 'r', encoding='utf8') as file:
        for line in file:
            params = line.strip('\n').split('\t')
            assert len(params) == 3
            h = params[0].strip()
            r = params[1].strip()
            t = params[2].strip()
            triples.add((h, r, t))
            entities.add(h)
            entities.add(t)
            relations.add(r)
    return triples, entities, relations


def read_links(file_path):
    """Read tab-separated entity pairs and return them as a list of ``(e1, e2)`` tuples.

    :raises AssertionError: if a line does not have exactly two fields.
    """
    print("read links:", file_path)
    links = list()
    with open(file_path, 'r', encoding='utf8') as file:
        for line in file:
            params = line.strip('\n').split('\t')
            assert len(params) == 2
            links.append((params[0].strip(), params[1].strip()))
    return links


def read_dict(file_path):
    """Read ``key<TAB>int`` lines into a dict mapping the key string to the int.

    :raises AssertionError: if a line does not have exactly two fields.
    """
    ids = dict()
    with open(file_path, 'r', encoding='utf8') as file:
        for line in file:
            params = line.strip('\n').split('\t')
            assert len(params) == 2
            ids[params[0]] = int(params[1])
    return ids


def read_pair_ids(file_path):
    """Read ``int<TAB>int`` lines into a list of ``(int, int)`` tuples.

    :raises AssertionError: if a line does not have exactly two fields.
    """
    pairs = list()
    with open(file_path, 'r', encoding='utf8') as file:
        for line in file:
            params = line.strip('\n').split('\t')
            assert len(params) == 2
            pairs.append((int(params[0]), int(params[1])))
    return pairs


def pair2file(file, pairs):
    """Write each pair of `pairs` as one ``i<TAB>j`` line to `file`; no-op when `pairs` is None."""
    if pairs is None:
        return
    with open(file, 'w', encoding='utf8') as f:
        for i, j in pairs:
            f.write(str(i) + '\t' + str(j) + '\n')


def dict2file(file, dic):
    """Write each item of `dic` as one ``key<TAB>value`` line to `file`; no-op when `dic` is None."""
    if dic is None:
        return
    with open(file, 'w', encoding='utf8') as f:
        for i, j in dic.items():
            f.write(str(i) + '\t' + str(j) + '\n')
    print(file, "saved.")


def line2file(file, lines):
    """Write every string of `lines` on its own line to `file`; no-op when `lines` is None."""
    if lines is None:
        return
    with open(file, 'w', encoding='utf8') as f:
        for line in lines:
            f.write(line + '\n')
    print(file, "saved.")


def radio_2file(radio, folder):
    """Create (if needed) the sub-folder of `folder` named after `radio` and return its path.

    The sub-folder name is ``str(radio)`` with every ``.`` replaced by ``_``;
    the returned path ends with ``/``.
    """
    path = folder + str(radio).replace('.', '_')
    os.makedirs(path, exist_ok=True)
    return path + '/'


def save_results(folder, rest_12):
    """Write the pairs `rest_12` to ``folder + 'alignment_results_12'``, creating `folder` if needed."""
    os.makedirs(folder, exist_ok=True)
    pair2file(folder + 'alignment_results_12', rest_12)
    print("Results saved!")


def task_divide(idx, n):
    """Split the sequence `idx` into `n` consecutive chunks.

    The first ``n - 1`` chunks hold ``len(idx) // n`` items each and the last
    chunk takes whatever remains. ``[idx]`` is returned unchanged when `n` is
    not positive, when `idx` is empty, or when `n` exceeds ``len(idx)``.
    """
    total = len(idx)
    if n <= 0 or 0 == total:
        return [idx]
    if n > total:
        return [idx]
    elif n == total:
        return [[i] for i in idx]
    else:
        j = total // n
        tasks = []
        for i in range(0, (n - 1) * j, j):
            tasks.append(idx[i:i + j])
        tasks.append(idx[(n - 1) * j:])
        return tasks


def generate_out_folder(out_folder, training_data_path, div_path, method_name):
    """Compose (without creating) a timestamped output folder path.

    The result is ``out_folder + method_name / <last component of
    training_data_path> / div_path + <YYYYmmddHHMMSS> /``.
    """
    params = training_data_path.strip('/').split('/')
    path = params[-1]
    folder = out_folder + method_name + '/' + path + "/" + div_path + str(time.strftime("%Y%m%d%H%M%S")) + "/"
    print("results output folder:", folder)
    return folder


def save_embeddings(folder, kgs, ent_embeds, rel_embeds, attr_embeds, mapping_mat=None):
    """Save the given embedding arrays as ``.npy`` files and the id dictionaries of both KGs.

    Arrays that are None are skipped. `kgs` must expose ``kg1`` / ``kg2`` with
    ``entities_id_dict``, ``relations_id_dict`` and ``attributes_id_dict``.
    """
    os.makedirs(folder, exist_ok=True)
    if ent_embeds is not None:
        np.save(folder + 'ent_embeds.npy', ent_embeds)
    if rel_embeds is not None:
        np.save(folder + 'rel_embeds.npy', rel_embeds)
    if attr_embeds is not None:
        np.save(folder + 'attr_embeds.npy', attr_embeds)
    if mapping_mat is not None:
        np.save(folder + 'mapping_mat.npy', mapping_mat)
    dict2file(folder + 'kg1_ent_ids', kgs.kg1.entities_id_dict)
    dict2file(folder + 'kg2_ent_ids', kgs.kg2.entities_id_dict)
    dict2file(folder + 'kg1_rel_ids', kgs.kg1.relations_id_dict)
    dict2file(folder + 'kg2_rel_ids', kgs.kg2.relations_id_dict)
    dict2file(folder + 'kg1_attr_ids', kgs.kg1.attributes_id_dict)
    dict2file(folder + 'kg2_attr_ids', kgs.kg2.attributes_id_dict)
    print("Embeddings saved!")


# ***************************adj & sparse**************************
def sparse_to_tuple(sparse_mx):
    """Convert a sparse matrix to ``(coords, values, shape)``.

    When `sparse_mx` is a list, its elements are converted IN PLACE and the
    same list object is returned.
    """

    def to_tuple(mx):
        if not sp.isspmatrix_coo(mx):
            mx = mx.tocoo()
        coords = np.vstack((mx.row, mx.col)).transpose()
        values = mx.data
        shape = mx.shape
        return coords, values, shape

    if isinstance(sparse_mx, list):
        for i in range(len(sparse_mx)):
            sparse_mx[i] = to_tuple(sparse_mx[i])
    else:
        sparse_mx = to_tuple(sparse_mx)
    return sparse_mx


def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    # Rows with zero sum produce inf here; they are reset to 0 right below, so
    # the divide warning is only noise.
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()


def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN gnn and conversion to tuple representation."""
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    return sparse_to_tuple(adj_normalized)


def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation)."""
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    adj_normalized = normalize_adj(adj)
    laplacian = sp.eye(adj.shape[0]) - adj_normalized
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])

    t_k = list()
    t_k.append(sp.eye(adj.shape[0]))
    t_k.append(scaled_laplacian)

    # The CSR form of the scaled Laplacian is loop-invariant, so build it once.
    s_lap = sp.csr_matrix(scaled_laplacian, copy=True)
    for _ in range(2, k + 1):
        # T_k = 2 * L * T_{k-1} - T_{k-2}
        t_k.append(2 * s_lap.dot(t_k[-1]) - t_k[-2])

    return sparse_to_tuple(t_k)


def _distinct_ratio(triples, position):
    """For each relation, return (#distinct ``tri[position]`` values) / (#triples with that relation)."""
    members = {}
    counts = {}
    for tri in triples:
        rel = tri[1]
        counts[rel] = counts.get(rel, 0) + 1
        members.setdefault(rel, set()).add(tri[position])
    return {rel: len(members[rel]) / counts[rel] for rel in counts}


def func(triples):
    """Map each relation to its number of distinct heads divided by its number of triples."""
    return _distinct_ratio(triples, 0)


def ifunc(triples):
    """Map each relation to its number of distinct tails divided by its number of triples."""
    return _distinct_ratio(triples, 2)


def get_weighted_adj(e, triples):
    """Build an ``e x e`` float32 COO matrix weighted by the `func` / `ifunc` ratios.

    Every triple ``(h, r, t)`` with ``h != t`` adds ``max(ifunc[r], 0.3)`` to
    the pair ``(h, t)`` and ``max(func[r], 0.3)`` to the pair ``(t, h)``. A
    pair ``(a, b)`` is stored at row ``b``, column ``a``.
    """
    r2f = func(triples)
    r2if = ifunc(triples)
    M = {}
    for tri in triples:
        h, r, t = tri[0], tri[1], tri[2]
        if h == t:
            continue
        M[(h, t)] = M.get((h, t), 0) + max(r2if[r], 0.3)
        M[(t, h)] = M.get((t, h), 0) + max(r2f[r], 0.3)
    row = []
    col = []
    data = []
    for key, weight in M.items():
        row.append(key[1])
        col.append(key[0])
        data.append(weight)
    data = np.array(data, dtype='float32')
    return sp.coo_matrix((data, (row, col)), shape=(e, e))


def _undirected_neighbors(triples):
    """Map every head/tail appearing in `triples` to the set of entities it shares a triple with."""
    edge = dict()
    for item in triples:
        edge.setdefault(item[0], set()).add(item[2])
        edge.setdefault(item[2], set()).add(item[0])
    return edge


def _neighbors_to_coo(edge, size, weight=1.0):
    """Turn a neighbour dict into a ``size x size`` COO matrix whose entries all equal `weight`.

    Only keys in ``range(size)`` contribute rows; other keys are ignored.
    """
    row = list()
    col = list()
    for key in range(size):
        neighbours = edge.get(key)
        if neighbours is None:
            continue
        row.extend([key] * len(neighbours))
        col.extend(neighbours)
    data = np.full(len(row), weight)
    return sp.coo_matrix((data, (row, col)), shape=(size, size))


def gcn_load_data(input_folder, is_two=False, is_three=False, is_four=False):
    """Load a dataset via `read_dbp15k_input` and build its list of adjacency matrices.

    The first adjacency is built from the KG triples plus the triples returned
    by `enhance_triples`, filtered by `remove_unlinked_triples`. One further
    adjacency is appended for each of `is_two`, `is_three` and `is_four` that
    is set.

    NOTE(review): `is_three` feeds the two-hop triples into
    `generate_3hop_triples`; they are None unless `is_two` is also set. The
    same holds for `is_four` w.r.t. `is_three` — confirm the helper accepts None.

    :return: ``(adj, kg1, kg2, sup_ent1, sup_ent2, ref_ent1, ref_ent2,
        total_tri_num, total_e_num, total_r_num, rel_id_mapping)``.
    """
    kg1, kg2, sup_ent1, sup_ent2, ref_ent1, ref_ent2, total_tri_num, total_e_num, total_r_num, rel_id_mapping = \
        read_dbp15k_input(input_folder)
    linked_ents = set(sup_ent1 + sup_ent2 + ref_ent1 + ref_ent2)
    enhanced_triples1, enhanced_triples2 = enhance_triples(kg1, kg2, sup_ent1, sup_ent2)
    triples = remove_unlinked_triples(kg1.triple_list + kg2.triple_list +
                                      list(enhanced_triples1) + list(enhanced_triples2), linked_ents)
    one_adj, _ = no_weighted_adj(total_e_num, triples, is_two_adj=False)
    adj = [one_adj]

    two_hop_triples1, two_hop_triples2 = None, None
    three_hop_triples1, three_hop_triples2 = None, None
    if is_two:
        two_hop_triples1 = generate_2hop_triples(kg1, linked_ents=linked_ents)
        two_hop_triples2 = generate_2hop_triples(kg2, linked_ents=linked_ents)
        triples = two_hop_triples1 | two_hop_triples2
        two_adj, _ = no_weighted_adj(total_e_num, triples, is_two_adj=False)
        adj.append(two_adj)
    if is_three:
        three_hop_triples1 = generate_3hop_triples(kg1, two_hop_triples1, linked_ents=linked_ents)
        three_hop_triples2 = generate_3hop_triples(kg2, two_hop_triples2, linked_ents=linked_ents)
        triples = three_hop_triples1 | three_hop_triples2
        three_adj, _ = no_weighted_adj(total_e_num, triples, is_two_adj=False)
        adj.append(three_adj)
    if is_four:
        # The 3-hop generator is reused here, seeded with the 3-hop triples.
        four_hop_triples1 = generate_3hop_triples(kg1, three_hop_triples1, linked_ents=linked_ents)
        four_hop_triples2 = generate_3hop_triples(kg2, three_hop_triples2, linked_ents=linked_ents)
        triples = four_hop_triples1 | four_hop_triples2
        four_adj, _ = no_weighted_adj(total_e_num, triples, is_two_adj=False)
        adj.append(four_adj)
    return adj, kg1, kg2, sup_ent1, sup_ent2, ref_ent1, ref_ent2, total_tri_num, \
        total_e_num, total_r_num, rel_id_mapping


def diag_adj(adj):
    """Row-normalize `adj` (each row divided by its sum) and return the tuple representation."""
    d = np.array(adj.sum(1)).flatten()
    # Empty rows give inf, which is reset to 0 right below.
    with np.errstate(divide='ignore'):
        d_inv = 1. / d
    d_inv[np.isinf(d_inv)] = 0
    d_inv = sp.diags(d_inv)
    return sparse_to_tuple(d_inv.dot(adj))


def rgcn_adj_list(kg1, kg2, adj_number, all_rel_num, all_ent_num):
    """Build row-normalized per-relation adjacency matrices for both KGs.

    Relations are ranked by number of triples. The returned list starts with
    one adjacency (and its transpose) that merges every relation ranked
    `adj_number` or lower, followed by an adjacency and its transpose for each
    of the `adj_number` most frequent relations. Each triple contributes both
    ``(h, t)`` and ``(t, h)``.

    Relation ids are used as indices into an array of length `all_rel_num`.
    A KeyError / IndexError is raised when fewer than ``adj_number + 1``
    relations actually occur in the triples.
    """
    adj_list = list()
    triple_list = kg1.triple_list + kg2.triple_list
    edge = dict()
    edge_length = np.zeros(all_rel_num)
    for item in triple_list:
        pairs = edge.setdefault(item[1], list())
        pairs.append([item[0], item[2]])
        pairs.append([item[2], item[0]])
        edge_length[item[1]] += 2
    sort_edge_length = np.argsort(-edge_length)

    # Merge all remaining (less frequent) relations into a single adjacency.
    left_len = int(edge_length[sort_edge_length[adj_number]])
    pos = np.array(edge[sort_edge_length[adj_number]])
    init_row, init_col = np.transpose(pos)
    for i in range(adj_number + 1, len(edge.keys())):
        pos = np.array(edge[sort_edge_length[i]])
        row, col = np.transpose(pos)
        init_row = np.hstack((init_row, row))
        init_col = np.hstack((init_col, col))
        left_len += int(edge_length[sort_edge_length[i]])
    data = np.ones(left_len)
    left_adj = sp.coo_matrix((data, (init_row, init_col)), shape=(all_ent_num, all_ent_num))
    left_adj = diag_adj(left_adj)
    left_rev_adj = sp.coo_matrix((data, (init_col, init_row)), shape=(all_ent_num, all_ent_num))
    left_rev_adj = diag_adj(left_rev_adj)
    adj_list.append(left_adj)
    adj_list.append(left_rev_adj)

    # One adjacency plus its transpose for each of the most frequent relations.
    for i in range(adj_number):
        pos = np.array(edge[sort_edge_length[i]])
        row, col = np.transpose(pos)
        data = np.ones(shape=int(edge_length[sort_edge_length[i]]))
        adj = sp.coo_matrix((data, (row, col)), shape=(all_ent_num, all_ent_num))
        adj = diag_adj(adj)
        adj_rev = sp.coo_matrix((data, (col, row)), shape=(all_ent_num, all_ent_num))
        adj_rev = diag_adj(adj_rev)
        adj_list.append(adj)
        adj_list.append(adj_rev)
    return adj_list


def _fold_entity_id(ent_id):
    """Shift an entity id using the hard-coded offsets of `test_no_weighted_adj`.

    NOTE(review): the thresholds 10500 / 25500 and offsets 10500 / 15000 look
    dataset-specific — confirm against the id layout of the input data.
    """
    if 0 <= ent_id < 10500:
        return ent_id
    if ent_id < 25500:
        return ent_id - 10500
    return ent_id - 15000


def test_no_weighted_adj(total_ent_num, kg1_triple_list, kg2_triple_list):
    """Build one pre-processed adjacency per KG over a half-sized, re-indexed entity space.

    Entity ids are shifted with fixed offsets (see `_fold_entity_id`) and each
    adjacency has shape ``(total_ent_num / 2, total_ent_num / 2)``.

    :return: a two-element list of tuple-form adjacencies, KG1 first.
    """
    half = int(total_ent_num / 2)
    adj = list()
    for triple_list in [kg1_triple_list, kg2_triple_list]:
        edge = dict()
        for item in triple_list:
            item_first = _fold_entity_id(item[0])
            item_second = _fold_entity_id(item[2])
            edge.setdefault(item_first, set())
            edge.setdefault(item_second, set())
            edge[item_first].add(item_second)
            edge[item_second].add(item_first)
        one_adj = preprocess_adj(_neighbors_to_coo(edge, half))
        adj.append(one_adj)
    return adj


def no_weighted_adj(total_ent_num, triple_list, is_two_adj=False):
    """Build the unweighted, undirected one-hop adjacency of `triple_list`.

    :param total_ent_num: size of the (square) adjacency matrix.
    :param triple_list: iterable of triples indexed as ``(head, rel, tail)``.
    :param is_two_adj: when True, also build the strict two-hop adjacency
        (neighbours of neighbours that are neither the node itself nor one of
        its direct neighbours).
    :return: ``(one_adj, two_adj)`` in the tuple form produced by
        `preprocess_adj`; ``two_adj`` is None unless `is_two_adj` is set.
    """
    start = time.time()
    edge = _undirected_neighbors(triple_list)
    one_adj = preprocess_adj(_neighbors_to_coo(edge, total_ent_num))
    print('generating one-adj costs time: {:.4f}s'.format(time.time() - start))
    if not is_two_adj:
        return one_adj, None

    row = list()
    col = list()
    for key, values in edge.items():
        # Track the two-hop neighbours already emitted for THIS key. A counter
        # shared across keys would drop a key's first new neighbour whenever
        # the previous key finished with the same count.
        seen = set()
        for value in values:
            for item in edge[value]:
                if item not in values and item != key and item not in seen:
                    seen.add(item)
                    row.append(key)
                    col.append(item)
    data = np.ones(len(row))
    two_adj = sp.coo_matrix((data, (row, col)), shape=(total_ent_num, total_ent_num))
    two_adj = preprocess_adj(two_adj)
    print('generating one- and two-adj costs time: {:.4f}s'.format(time.time() - start))
    return one_adj, two_adj


def temp_weighted_two_adj(total_ent_num, triple_list, is_two_adj=False):
    """Build the undirected one-hop adjacency with every edge weighted 0.5, then pre-process it.

    `is_two_adj` is accepted but not used.
    """
    start = time.time()
    edge = _undirected_neighbors(triple_list)
    one_adj = preprocess_adj(_neighbors_to_coo(edge, total_ent_num, weight=0.5))
    print('generating one-adj costs time: {:.4f}s'.format(time.time() - start))
    print('generating one- and two-adj costs time: {:.4f}s'.format(time.time() - start))
    return one_adj


def relation_adj_list(kg1, kg2, adj_number, all_rel_num, all_ent_num, linked_ents, rel_id_mapping):
    """Build directed adjacencies for the `adj_number` most frequent relations.

    A relation id is first replaced by ``rel_id_mapping[rel]`` when that value
    is neither None nor the empty string. The resulting ids are used as
    indices into an array of length `all_rel_num`. `linked_ents` is accepted
    but not used.

    :return: list of tuple-form ``all_ent_num x all_ent_num`` adjacencies,
        most frequent relation first.
    """
    rel_dict = rel_id_mapping
    adj_list = list()
    triple_list = kg1.triple_list + kg2.triple_list
    edge = dict()
    edge_length = np.zeros(all_rel_num)
    for item in triple_list:
        mapped = rel_dict[item[1]]
        if mapped is not None and mapped != "":
            edge_id = mapped
        else:
            edge_id = item[1]
        edge.setdefault(edge_id, list()).append([item[0], item[2]])
        edge_length[edge_id] += 1
    sort_edge_length = np.argsort(-edge_length)

    for i in range(adj_number):
        pos = np.array(edge[sort_edge_length[i]])
        row, col = np.transpose(pos)
        data = np.ones(shape=int(edge_length[sort_edge_length[i]]))
        adj = sp.coo_matrix((data, (row, col)), shape=(all_ent_num, all_ent_num))
        adj = sparse_to_tuple(adj)
        adj_list.append(adj)
    return adj_list


def transloss_add2hop(kg1, kg2, sup_ent1, sup_ent2, ref_ent1, ref_ent2, total_e_num):
    """Return the un-normalized undirected adjacency (tuple form) of both KGs' 2-hop triples.

    The triples come from `generate_2hop_triples`, called with the union of
    all four entity lists as `linked_ents`.
    """
    linked_ents = set(sup_ent1 + sup_ent2 + ref_ent1 + ref_ent2)
    enhanced_triples1 = generate_2hop_triples(kg1, linked_ents=linked_ents)
    enhanced_triples2 = generate_2hop_triples(kg2, linked_ents=linked_ents)
    triples = enhanced_triples1 | enhanced_triples2
    edge = _undirected_neighbors(triples)
    one_adj = _neighbors_to_coo(edge, total_e_num)
    one_adj = sparse_to_tuple(one_adj)
    return one_adj