# -*- coding: utf-8 -*- # @Author: lxy # @Date: 2019-12-23 14:30:48 # @Last Modified by: lxy # @Last Modified time: 2020-01-06 12:22:27 import time import networkx as nx import numpy as np import scipy.sparse as sp from scipy import linalg from scipy.special import iv from sklearn import preprocessing from sklearn.utils.extmath import randomized_svd from .. import register_model, base_model @register_model("prone") class ProNE(base_model.BaseModel): @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument("--step", type=int, default=5, help=" Number of items in the chebyshev expansion") parser.add_argument("--mu", type=float, default=0.2) parser.add_argument("--theta", type=float, default=0.5) # fmt: on @classmethod def build_model_from_args(cls, args): return cls(args.hidden_size, args.step, args.mu, args.theta) def __init__(self, dimension, step, mu, theta): super(ProNE, self).__init__() self.dimension = dimension self.step = step self.mu = mu self.theta = theta def train(self, G): self.num_node = G.number_of_nodes() self.matrix0 = sp.csr_matrix(nx.adjacency_matrix(G)) t_1 = time.time() features_matrix = self._pre_factorization(self.matrix0, self.matrix0) t_2 = time.time() embeddings_matrix = self._chebyshev_gaussian( self.matrix0, features_matrix, self.step, self.mu, self.theta ) t_3 = time.time() print("sparse NE time", t_2 - t_1) print("spectral Pro time", t_3 - t_2) self.embeddings = embeddings_matrix return self.embeddings def _get_embedding_rand(self, matrix): # Sparse randomized tSVD for fast embedding t1 = time.time() l = matrix.shape[0] smat = sp.csc_matrix(matrix) # convert to sparse CSC format print("svd sparse", smat.data.shape[0] * 1.0 / l ** 2) U, Sigma, VT = randomized_svd( smat, n_components=self.dimension, n_iter=5, random_state=None ) U = U * np.sqrt(Sigma) U = preprocessing.normalize(U, "l2") print("sparsesvd time", time.time() - t1) return U def _get_embedding_dense(self, matrix, dimension): # get dense embedding via SVD t1 = time.time() U, s, Vh = linalg.svd( matrix, full_matrices=False, check_finite=False, overwrite_a=True ) U = np.array(U) U = U[:, :dimension] s = s[:dimension] s = np.sqrt(s) U = U * s U = preprocessing.normalize(U, "l2") print("densesvd time", time.time() - t1) return U def _pre_factorization(self, tran, mask): # Network Embedding as Sparse Matrix Factorization t1 = time.time() l1 = 0.75 C1 = preprocessing.normalize(tran, "l1") neg = np.array(C1.sum(axis=0))[0] ** l1 neg = neg / neg.sum() neg = sp.diags(neg, format="csr") neg = mask.dot(neg) print("neg", time.time() - t1) C1.data[C1.data <= 0] = 1 neg.data[neg.data <= 0] = 1 C1.data = np.log(C1.data) neg.data = np.log(neg.data) C1 -= neg F = C1 features_matrix = self._get_embedding_rand(F) return features_matrix def _chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5): # NE Enhancement via Spectral Propagation print("Chebyshev Series -----------------") t1 = time.time() if order == 1: return a A = sp.eye(self.num_node) + A DA = preprocessing.normalize(A, norm="l1") L = sp.eye(self.num_node) - DA M = L - mu * sp.eye(self.num_node) Lx0 = a Lx1 = M.dot(a) Lx1 = 0.5 * M.dot(Lx1) - a conv = iv(0, s) * Lx0 conv -= 2 * iv(1, s) * Lx1 for i in range(2, order): Lx2 = M.dot(Lx1) Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0 # Lx2 = 2*L.dot(Lx1) - Lx0 if i % 2 == 0: conv += 2 * iv(i, s) * Lx2 else: conv -= 2 * iv(i, s) * Lx2 Lx0 = Lx1 Lx1 = Lx2 del Lx2 print("Bessell time", i, time.time() - t1) mm = A.dot(a - conv) emb = self._get_embedding_dense(mm, self.dimension) return emb