# Python source code of data

#Copyright 2018 Google LLC
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.


"""Utils functions to load and process citation data."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle as pkl
import sys

import networkx as nx
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import tensorflow as tf
from third_party.gcn.gcn.utils import normalize_adj
from third_party.gcn.gcn.utils import parse_index_file
from third_party.gcn.gcn.utils import sample_mask
from third_party.gcn.gcn.utils import sparse_to_tuple
from third_party.gcn.gcn.utils import preprocess_features


def load_test_edge_mask(dataset_str, data_path, drop_edge_prop):
  """Loads the precomputed mask of removed (test) edges.

  Args:
    dataset_str: dataset name, used to build the mask file name.
    data_path: path to the folder containing the mask file.
    drop_edge_prop: proportion of removed edges, used to build the file name.

  Returns:
    The scipy sparse matrix stored in
    `<data_path>/emask.<dataset_str>.remove<drop_edge_prop>.npz`.
  """
  edge_mask_path = os.path.join(
      data_path, 'emask.{}.remove{}.npz'.format(dataset_str, drop_edge_prop))
  # .npz archives are binary: open in 'rb' so the content is never decoded as
  # text (the default 'r' mode breaks on Python 3).
  with tf.gfile.Open(edge_mask_path, 'rb') as f:
    mask = sp.load_npz(f)
  return mask


def load_edge_masks(dataset_str, data_path, adj_true, drop_edge_prop):
  """Loads val/test edge masks and derives the train adjacency and train mask.

  Args:
    dataset_str: dataset to use
    data_path: path to data folder
    adj_true: true adjacency matrix in dense format,
    drop_edge_prop: proportion of edges to remove.

  Returns:
    adj_matrix: `adj_true` with the val and test entries zeroed out
    train_mask: mask for train edges (diagonal excluded), as a sparse tuple
    val_mask: mask for val edges, as a sparse tuple
    test_mask: mask for test edges, as a sparse tuple
  """
  edge_mask_path = os.path.join(
      data_path, 'emask.{}.remove{}.'.format(dataset_str, drop_edge_prop))
  val_mask = sp.load_npz(edge_mask_path + 'val.npz')
  test_mask = sp.load_npz(edge_mask_path + 'test.npz')
  # Everything that is neither a val nor a test entry belongs to train.
  train_mask = 1. - val_mask.todense() - test_mask.todense()
  # remove val and test edges from true A
  adj_train = np.multiply(adj_true, train_mask)
  # The diagonal is dropped from the mask only after adj_train was computed.
  train_mask -= np.eye(train_mask.shape[0])
  # The train mask is dense at this point; wrap it in a COO matrix so that it
  # goes through sparse_to_tuple exactly like the (sparse) val/test masks.
  # Previously the val mask was returned in the train-mask slot.
  return (adj_train, sparse_to_tuple(sp.coo_matrix(train_mask)),
          sparse_to_tuple(val_mask), sparse_to_tuple(test_mask))


def add_top_k_edges(data, edge_mask_path, gae_scores_path, topk, nb_nodes,
                    norm_adj):
  """Adds the highest-scoring precomputed GAE edges to the train adjacency.

  Args:
    data: dict read for its 'adj_true' and 'adj_train' entries.
    edge_mask_path: folder containing 'test_mask.npz'.
    gae_scores_path: folder containing 'gae_scores.npy'.
    topk: rank (in descending score order) of the score used as the strict
      selection threshold.
    nb_nodes: number of nodes; the scores are viewed as (nb_nodes, nb_nodes).
    norm_adj: if truthy, data['adj_train'] goes through normalize_adj,
      otherwise it is wrapped in a scipy COO matrix.

  Returns:
    A pair: the binarized adjacency with the selected edges added, and the
    sparse tuple of the processed data['adj_train'].
  """
  held_out = sp.load_npz(os.path.join(edge_mask_path, 'test_mask.npz'))
  keep_mask = 1. - held_out.todense()
  # Zero out the held-out (test) entries of the true adjacency.
  adj_train_curr = np.multiply(data['adj_true'], keep_mask)

  # Score candidate edges with the precomputed GAE scores.
  gae_scores = np.load(os.path.join(gae_scores_path, 'gae_scores.npy'))
  # scores_mask = 1 - np.eye(nb_nodes)
  # Only pairs linking the first 140 nodes with the remaining nodes are kept.
  # NOTE(review): 140 is hard-coded; presumably a number of labeled nodes --
  # confirm against the caller.
  block_mask = np.zeros((nb_nodes, nb_nodes))
  block_mask[:140, 140:] = 1.
  block_mask[140:, :140] = 1.
  flat_scores = np.multiply(gae_scores, block_mask).reshape((-1,))
  ranking = np.argsort(-flat_scores)
  threshold = flat_scores[ranking[topk]]
  selected = (flat_scores > threshold).reshape((nb_nodes, nb_nodes))
  adj_train_curr += 1 * selected
  # Binarize: existing and newly added edges all become 1.
  adj_train_curr = 1 * (adj_train_curr > 0)

  # NOTE(review): the second output is derived from data['adj_train'], not
  # from adj_train_curr -- confirm this is intended.
  base_adj = data['adj_train']
  adj_train_norm = normalize_adj(base_adj) if norm_adj else sp.coo_matrix(
      base_adj)
  return adj_train_curr, sparse_to_tuple(adj_train_norm)


def process_adj(adj, model_name):
  """Builds the matrix fed to the model from an adjacency matrix.

  Args:
    adj: square adjacency matrix.
    model_name: 'Cheby' selects the rescaled Laplacian; any other value
      returns normalize_adj(adj).

  Returns:
    For 'Cheby': (2 / lambda_max) * L - I, where
    L = I - normalize_adj(adj - I) and lambda_max is the largest-magnitude
    eigenvalue of L. Otherwise normalize_adj(adj).
  """
  if model_name != 'Cheby':
    return normalize_adj(adj)

  identity = sp.eye(adj.shape[0])
  laplacian = identity - normalize_adj(adj - identity)
  # TODO(chamii): compare with
  # adj)
  top_eigvals, _ = eigsh(laplacian, 1, which='LM')
  scale = 2. / top_eigvals[0]
  return scale * laplacian - identity


def load_data(dataset_str, data_path):
  """Loads a dataset, dispatching on its name.

  Args:
    dataset_str: 'cora', 'citeseer' or 'pubmed' select the citation loader;
      any other value selects the PPI loader.
    data_path: path to the folder containing the dataset files.

  Returns:
    Whatever the selected loader returns.
  """
  citation_datasets = ('cora', 'citeseer', 'pubmed')
  if dataset_str not in citation_datasets:
    return load_ppi_data(data_path)
  return load_citation_data(dataset_str, data_path)


def load_ppi_data(data_path):
  """Loads the PPI dataset files found in `data_path`.

  Args:
    data_path: path to the folder containing the `ppi.*` files.

  Returns:
    adj: sparse matrix stored in 'ppi.edges.npz'.
    features: array stored in 'ppi.features.norm.npy'.
    labels: dense version of the sparse matrix stored in 'ppi.labels.npz'.
    train_mask: boolean array, `> 0` applied to 'ppi.train_mask.npy'.
    val_mask: boolean array, `> 0` applied to 'ppi.test_mask.npy'.
    test_mask: boolean array, `> 0` applied to 'ppi.test_mask.npy'.
  """

  def _open(file_name):
    # All PPI files are binary (.npy / .npz), so they are opened in 'rb'.
    return tf.gfile.Open(os.path.join(data_path, file_name), 'rb')

  def _load_mask(file_name):
    # The context manager closes the handle once the array is in memory.
    with _open(file_name) as f:
      return np.load(f) > 0

  with _open('ppi.edges.npz') as f:
    adj = sp.load_npz(f)

  with _open('ppi.features.norm.npy') as f:
    features = np.load(f)

  with _open('ppi.labels.npz') as f:
    labels = sp.load_npz(f).todense()

  train_mask = _load_mask('ppi.train_mask.npy')
  # NOTE(review): the validation mask is read from the *test* mask file, so
  # validation and test nodes coincide. Kept as-is; confirm whether a
  # dedicated validation mask file should be used instead.
  val_mask = _load_mask('ppi.test_mask.npy')
  test_mask = _load_mask('ppi.test_mask.npy')

  return adj, features, labels, train_mask, val_mask, test_mask


def load_citation_data(dataset_str, data_path):
  """Loads a citation dataset: graph, features, labels and node masks.

  Reads the pickled `ind.<dataset_str>.{x,y,tx,ty,allx,ally,graph}` files and
  the `ind.<dataset_str>.test.index` file from `data_path`.

  Args:
    dataset_str: dataset name used in the file names. 'citeseer' gets extra
      handling for test indices that are missing from the index file.
    data_path: path to the folder containing the dataset files.

  Returns:
    adj: adjacency matrix built by networkx from the 'graph' dict of lists.
    features: output of preprocess_features on the stacked 'allx'/'tx' rows.
    labels: array stacking 'ally' and 'ty', with the test rows reordered.
    train_mask: sample_mask over the first len(y) node indices.
    val_mask: sample_mask over the 500 indices following the train indices.
    test_mask: sample_mask over the sorted test indices.
  """
  names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
  # On Python 3 the 'latin1' encoding is needed to decode byte strings stored
  # by a Python 2 pickler (presumably how these files were produced). It is
  # passed through kwargs so the call stays valid on Python 2, whose
  # pickle.load has no `encoding` parameter.
  load_kwargs = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}
  objects = {}
  for name in names:
    with tf.gfile.Open(
        os.path.join(data_path, 'ind.{}.{}'.format(dataset_str, name)),
        'rb') as f:
      # NOTE: unpickling can execute arbitrary code; only point `data_path`
      # at trusted dataset files.
      objects[name] = pkl.load(f, **load_kwargs)

  test_idx_reorder = parse_index_file(
      os.path.join(data_path, 'ind.{}.test.index'.format(dataset_str)))
  test_idx_range = np.sort(test_idx_reorder)

  if dataset_str == 'citeseer':
    # Fix citeseer dataset (there are some isolated nodes in the graph)
    # Find isolated nodes, add them as zero-vecs into the right position
    test_idx_range_full = range(
        min(test_idx_reorder),
        max(test_idx_reorder) + 1)
    tx_extended = sp.lil_matrix((len(test_idx_range_full),
                                 objects['x'].shape[1]))
    tx_extended[test_idx_range - min(test_idx_range), :] = objects['tx']
    objects['tx'] = tx_extended
    ty_extended = np.zeros((len(test_idx_range_full),
                            objects['y'].shape[1]))
    ty_extended[test_idx_range - min(test_idx_range), :] = objects['ty']
    objects['ty'] = ty_extended

  # Stack train+unlabeled rows with the test rows, then move the test rows
  # from sorted order to the order given by the index file.
  features = sp.vstack((objects['allx'], objects['tx'])).tolil()
  features[test_idx_reorder, :] = features[test_idx_range, :]
  adj = nx.adjacency_matrix(nx.from_dict_of_lists(objects['graph']))

  labels = np.vstack((objects['ally'], objects['ty']))
  labels[test_idx_reorder, :] = labels[test_idx_range, :]

  idx_test = test_idx_range.tolist()
  idx_train = range(len(objects['y']))
  idx_val = range(len(objects['y']), len(objects['y']) + 500)

  train_mask = sample_mask(idx_train, labels.shape[0])
  val_mask = sample_mask(idx_val, labels.shape[0])
  test_mask = sample_mask(idx_test, labels.shape[0])

  features = preprocess_features(features)
  return adj, features, labels, train_mask, val_mask, test_mask


def construct_feed_dict(adj_normalized, adj, features, placeholders):
  """Maps the model placeholders to the values to feed them.

  Args:
    adj_normalized: value fed to placeholders['adj'].
    adj: value fed to placeholders['adj_orig'].
    features: value fed to placeholders['features'].
    placeholders: dict with 'features', 'adj' and 'adj_orig' entries.

  Returns:
    A new dict keyed by the three placeholders.
  """
  return {
      placeholders['features']: features,
      placeholders['adj']: adj_normalized,
      placeholders['adj_orig']: adj,
  }


def mask_val_test_edges(adj, prop):
  """Holds out val and test edges and builds the matching evaluation masks.

  Args:
    adj: scipy sparse adjacency matrix; only its upper triangle is used to
      enumerate edges (assumes adj is symmetric -- TODO confirm with callers).
    prop: proportion of edges to hold out for testing (float in [0, 1]).
      5% of the remaining edges are held out for validation.

  Returns:
    adj_train: symmetric sparse adjacency rebuilt from the train edges only.
    val_mask: sparse_to_tuple of the mask marking val edges and as many
      sampled non-edges.
    test_mask: sparse_to_tuple of the mask marking test edges and as many
      sampled non-edges.
  """
  # NOTE: Splits are randomized and results might slightly
  # deviate from reported numbers in the paper.

  # Remove diagonal elements
  adj = adj - sp.dia_matrix(
      (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
  adj.eliminate_zeros()
  # Check that diag is zero (read from the sparse matrix, no densification):
  assert adj.diagonal().sum() == 0

  adj_triu = sp.triu(adj)
  adj_tuple = sparse_to_tuple(adj_triu)
  edges = adj_tuple[0]
  edges_all = sparse_to_tuple(adj)[0]
  num_test = int(np.floor(edges.shape[0] * prop))
  # We keep 5% of the edges left after removing the test edges for validation.
  num_val = int(np.floor((edges.shape[0] - num_test) * 0.05))

  # np.random.shuffle permutes in place, so it needs a mutable sequence: a
  # bare range() only works on Python 2.
  all_edge_idx = list(range(edges.shape[0]))
  np.random.shuffle(all_edge_idx)
  val_edge_idx = all_edge_idx[:num_val]
  test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
  test_edges = edges[test_edge_idx]
  val_edges = edges[val_edge_idx]
  # Explicit integer dtype: stacking with an empty split would otherwise
  # yield a float index array, which np.delete rejects.
  held_out_idx = np.asarray(test_edge_idx + val_edge_idx, dtype=int)
  train_edges = np.delete(edges, held_out_idx, axis=0)

  def ismember(a, b, tol=5):
    """Returns True if any (i, j) row of `a` equals some row of array `b`."""
    # Reshape so that a single pair, a list of pairs and an empty list are
    # all handled as an (n, 2) array.
    a = np.asarray(a).reshape((-1, 2))
    rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
    return bool(np.any(rows_close))

  test_edges_false = []
  while len(test_edges_false) < len(test_edges):
    idx_i = np.random.randint(0, adj.shape[0])
    idx_j = np.random.randint(0, adj.shape[0])
    if idx_i == idx_j:
      continue
    if ismember([idx_i, idx_j], edges_all):
      continue
    if test_edges_false:
      if ismember([idx_j, idx_i], np.array(test_edges_false)):
        continue
      if ismember([idx_i, idx_j], np.array(test_edges_false)):
        continue
    test_edges_false.append([idx_i, idx_j])

  val_edges_false = []
  while len(val_edges_false) < len(val_edges):
    idx_i = np.random.randint(0, adj.shape[0])
    idx_j = np.random.randint(0, adj.shape[0])
    if idx_i == idx_j:
      continue
    if ismember([idx_i, idx_j], train_edges):
      continue
    if ismember([idx_j, idx_i], train_edges):
      continue
    if ismember([idx_i, idx_j], val_edges):
      continue
    if ismember([idx_j, idx_i], val_edges):
      continue
    # Held-out test edges are real edges too: without these two checks a
    # sampled "non-edge" could be a test edge and trip the assertion below.
    if ismember([idx_i, idx_j], test_edges):
      continue
    if ismember([idx_j, idx_i], test_edges):
      continue
    if val_edges_false:
      if ismember([idx_j, idx_i], np.array(val_edges_false)):
        continue
      if ismember([idx_i, idx_j], np.array(val_edges_false)):
        continue
    val_edges_false.append([idx_i, idx_j])

  # Sanity checks: negatives are not real edges and the splits are disjoint.
  assert not ismember(test_edges_false, edges_all)
  assert not ismember(val_edges_false, edges_all)
  assert not ismember(val_edges, train_edges)
  assert not ismember(test_edges, train_edges)
  assert not ismember(val_edges, test_edges)

  data = np.ones(train_edges.shape[0])

  # Re-build adj matrix
  adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])),
                            shape=adj.shape)
  adj_train = adj_train + adj_train.T

  # NOTE: these edge lists only contain single direction of edge!
  # Both directions are therefore written into the masks below.
  num_nodes = adj.shape[0]
  val_mask = np.zeros((num_nodes, num_nodes))
  for i, j in val_edges:
    val_mask[i, j] = 1
    val_mask[j, i] = 1
  for i, j in val_edges_false:
    val_mask[i, j] = 1
    val_mask[j, i] = 1
  test_mask = np.zeros((num_nodes, num_nodes))
  for i, j in test_edges:
    test_mask[i, j] = 1
    test_mask[j, i] = 1
  for i, j in test_edges_false:
    test_mask[i, j] = 1
    test_mask[j, i] = 1
  # NOTE(review): the masks are dense arrays here; this relies on the
  # project's sparse_to_tuple accepting dense input -- confirm.
  return adj_train, sparse_to_tuple(val_mask), sparse_to_tuple(test_mask)


def mask_test_edges(adj, prop):
  """Function to mask test edges.

  Args:
    adj: scipy sparse matrix; only its upper triangle is used to enumerate
      edges (assumes adj is symmetric -- TODO confirm with callers).
    prop: proportion of edges to remove (float in [0, 1])

  Returns:
    adj_train: symmetric sparse adjacency rebuilt without the test edges
    test_mask: sparse_to_tuple of the mask marking the removed (positive)
      test edges and as many sampled non-edges (negatives)
  """
  # Remove diagonal elements
  adj = adj - sp.dia_matrix(
      (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
  adj.eliminate_zeros()
  # Check that diag is zero (read from the sparse matrix, no densification):
  assert adj.diagonal().sum() == 0

  adj_triu = sp.triu(adj)
  adj_tuple = sparse_to_tuple(adj_triu)
  edges = adj_tuple[0]
  edges_all = sparse_to_tuple(adj)[0]
  num_test = int(np.floor(edges.shape[0] * prop))

  # np.random.shuffle permutes in place, so it needs a mutable sequence: a
  # bare range() only works on Python 2.
  all_edge_idx = list(range(edges.shape[0]))
  np.random.shuffle(all_edge_idx)
  test_edge_idx = all_edge_idx[:num_test]
  test_edges = edges[test_edge_idx]
  train_edges = np.delete(edges, test_edge_idx, axis=0)

  def ismember(a, b, tol=5):
    """Returns True if any (i, j) row of `a` equals some row of array `b`."""
    # Reshape so that a single pair, a list of pairs and an empty list are
    # all handled as an (n, 2) array.
    a = np.asarray(a).reshape((-1, 2))
    rows_close = np.all(np.round(a - b[:, None], tol) == 0, axis=-1)
    return bool(np.any(rows_close))

  # Rejection-sample as many non-edges as there are test edges.
  test_edges_false = []
  while len(test_edges_false) < len(test_edges):
    idx_i = np.random.randint(0, adj.shape[0])
    idx_j = np.random.randint(0, adj.shape[0])
    if idx_i == idx_j:
      continue
    if ismember([idx_i, idx_j], edges_all):
      continue
    if test_edges_false:
      if ismember([idx_j, idx_i], np.array(test_edges_false)):
        continue
      if ismember([idx_i, idx_j], np.array(test_edges_false)):
        continue
    test_edges_false.append([idx_i, idx_j])

  # Sanity checks: negatives are not real edges; test and train are disjoint.
  assert not ismember(test_edges_false, edges_all)
  assert not ismember(test_edges, train_edges)

  data = np.ones(train_edges.shape[0])

  # Re-build adj matrix
  adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])),
                            shape=adj.shape)
  adj_train = adj_train + adj_train.T

  # NOTE: these edge lists only contain single direction of edge!
  # Both directions are therefore written into the mask below.
  num_nodes = adj.shape[0]
  test_mask = np.zeros((num_nodes, num_nodes))
  for i, j in test_edges:
    test_mask[i, j] = 1
    test_mask[j, i] = 1
  for i, j in test_edges_false:
    test_mask[i, j] = 1
    test_mask[j, i] = 1
  # NOTE(review): the mask is a dense array here; this relies on the
  # project's sparse_to_tuple accepting dense input -- confirm.
  return adj_train, sparse_to_tuple(test_mask)