Python scipy.sparse.load_npz() Examples

The following are 30 code examples of scipy.sparse.load_npz(), drawn from open-source projects. The source file, project, and license are listed above each example. You may also want to check out the other available functions and classes of the scipy.sparse module.
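Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of the usual save_npz/load_npz round trip; the file name example.npz is just a placeholder.

import numpy as np
from scipy import sparse

# build a small CSR matrix and write it to an .npz file
mat = sparse.csr_matrix(np.array([[0, 1, 0], [2, 0, 3]]))
sparse.save_npz('example.npz', mat)

# load_npz reads the file back as a sparse matrix (CSR here)
loaded = sparse.load_npz('example.npz')
print((mat != loaded).nnz == 0)  # True: the round trip preserves the data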
Example #1
Source File: data_utils.py    From gcnn-survey-paper with Apache License 2.0
def load_edge_masks(dataset_str, data_path, adj_true, drop_edge_prop):
  """Loads adjacency matrix as sparse matrix and masks for val & test links.

  Args:
    dataset_str: dataset to use
    data_path: path to data folder
    adj_true: true adjacency matrix in dense format,
    drop_edge_prop: proportion of edges to remove.

  Returns:
    adj_matrix: adjacency matrix
    train_mask: mask for train edges
    val_mask: mask for val edges
    test_mask: mask for test edges
  """
  edge_mask_path = os.path.join(
      data_path, 'emask.{}.remove{}.'.format(dataset_str, drop_edge_prop))
  val_mask = sp.load_npz(edge_mask_path + 'val.npz')
  test_mask = sp.load_npz(edge_mask_path + 'test.npz')
  train_mask = 1. - val_mask.todense() - test_mask.todense()
  # remove val and test edges from true A
  adj_train = np.multiply(adj_true, train_mask)
  train_mask -= np.eye(train_mask.shape[0])
  return adj_train, sparse_to_tuple(val_mask), sparse_to_tuple(
      val_mask), sparse_to_tuple(test_mask) 
Example #2
Source File: vectorizers.py    From recsys2019 with Apache License 2.0
def save_to_one_flie_csrs(self, fns):
        save_as = os.path.join(self.output_folder, "Xcsr.h5")
        try:
            os.unlink(save_as)
        except:
            pass
        h5f = h5sparse.File(save_as)
        first = True
        for fn in fns:
            logger.info(f"Saving {fn}")
            mat = load_npz(os.path.join(self.output_folder, "chunks", fn)).astype(np.float32)
            if first:
                h5f.create_dataset("matrix", data=mat, chunks=(10_000_000,), maxshape=(None,))
                first = False
            else:
                h5f["matrix"].append(mat)
            gc.collect()
        h5f.close() 
Example #3
Source File: test_hicAverageRegions.py    From HiCExplorer with GNU General Public License v3.0
def test_average_regions_single():

    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    matrix = ROOT + 'small_test_matrix.cool'
    bed_file = ROOT + 'hicAverageRegions/regions.bed'
    args = "--matrix {} --regions {} -o {} --range 100000 100000".format(matrix, bed_file, outfile.name).split()
    log.debug('path: {}'.format(matrix))

    hicAverageRegions.main(args)

    test_file = load_npz(ROOT + 'hicAverageRegions/result_range_100000.npz')
    new_file = load_npz(outfile.name)

    nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)

    os.remove(outfile.name) 
Example #4
Source File: test_hicAverageRegions.py    From HiCExplorer with GNU General Public License v3.0
def test_average_regions_center():

    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    matrix = ROOT + 'small_test_matrix.cool'
    bed_file = ROOT + 'hicAverageRegions/regions_multi.bed'
    args = "--matrix {} --regions {} -o {} --range 100000 100000 -cb {}".format(matrix, bed_file, outfile.name, 'center').split()
    log.debug('path: {}'.format(matrix))

    hicAverageRegions.main(args)

    test_file = load_npz(ROOT + 'hicAverageRegions/regions_multi_center.npz')
    new_file = load_npz(outfile.name)

    nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)

    os.remove(outfile.name) 
Example #5
Source File: test_hicAverageRegions.py    From HiCExplorer with GNU General Public License v3.0
def test_average_regions_start():

    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    matrix = ROOT + 'small_test_matrix.cool'
    bed_file = ROOT + 'hicAverageRegions/regions_multi.bed'
    args = "--matrix {} --regions {} -o {} --range 100000 100000 -cb {}".format(matrix, bed_file, outfile.name, 'start').split()
    log.debug('path: {}'.format(matrix))

    hicAverageRegions.main(args)

    test_file = load_npz(ROOT + 'hicAverageRegions/regions_multi_start.npz')
    new_file = load_npz(outfile.name)

    nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)

    os.remove(outfile.name) 
Example #6
Source File: reddit_posts.py    From causal-text-embeddings with MIT License
def load_term_counts(path='../dat/', force_redo=False):
    count_filename = path + 'reddit_term_counts'
    authors_counts_filename = path + 'reddit_author_term_counts'
    vocab_filename = path + 'vocab'

    if os.path.exists(count_filename + '.npz') and not force_redo:
        return sparse.load_npz(count_filename + '.npz'), sparse.load_npz(authors_counts_filename + '.npz'), np.load(
            vocab_filename + '.npy')

    reddit = load_reddit()
    post_docs = reddit['post_text'].values
    author_grouped = reddit.groupby('author')['post_text'].apply(lambda x: ' '.join(x)).reset_index()
    author_docs = author_grouped['post_text'].values
    counts, vocab, vec = tokenize_documents(post_docs)
    author_counts = vec.transform(author_docs)
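    # note: save_npz appends the '.npz' extension when the given filename lacks it,
    # which is why the loading branch above adds '.npz' explicitly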
    sparse.save_npz(count_filename, counts)
    sparse.save_npz(authors_counts_filename, author_counts)
    np.save(vocab_filename, vocab)
    return counts, author_counts, vocab 
Example #7
Source File: data_utils.py    From gcnn-survey-paper with Apache License 2.0
def add_top_k_edges(data, edge_mask_path, gae_scores_path, topk, nb_nodes,
                    norm_adj):
  """Loads GAE scores and adds topK edges to train adjacency."""
  test_mask = sp.load_npz(os.path.join(edge_mask_path, 'test_mask.npz'))
  train_mask = 1. - test_mask.todense()
  # remove val and test edges from true A
  adj_train_curr = np.multiply(data['adj_true'], train_mask)
  # Predict test edges using precomputed scores
  scores = np.load(os.path.join(gae_scores_path, 'gae_scores.npy'))
  # scores_mask = 1 - np.eye(nb_nodes)
  scores_mask = np.zeros((nb_nodes, nb_nodes))
  scores_mask[:140, 140:] = 1.
  scores_mask[140:, :140] = 1.
  scores = np.multiply(scores, scores_mask).reshape((-1,))
  threshold = scores[np.argsort(-scores)[topk]]
  adj_train_curr += 1 * (scores > threshold).reshape((nb_nodes, nb_nodes))
  adj_train_curr = 1 * (adj_train_curr > 0)
  if norm_adj:
    adj_train_norm = normalize_adj(data['adj_train'])
  else:
    adj_train_norm = sp.coo_matrix(data['adj_train'])
  return adj_train_curr, sparse_to_tuple(adj_train_norm) 
Example #8
Source File: data_utils.py    From gcnn-survey-paper with Apache License 2.0
def load_ppi_data(data_path):
  """Load PPI dataset."""
  with tf.gfile.Open(os.path.join(data_path, 'ppi.edges.npz')) as f:
    adj = sp.load_npz(f)

  with tf.gfile.Open(os.path.join(data_path, 'ppi.features.norm.npy')) as f:
    features = np.load(f)

  with tf.gfile.Open(os.path.join(data_path, 'ppi.labels.npz')) as f:
    labels = sp.load_npz(f).todense()

  train_mask = np.load(
      tf.gfile.Open(os.path.join(data_path, 'ppi.train_mask.npy'))) > 0
  val_mask = np.load(
      tf.gfile.Open(os.path.join(data_path, 'ppi.val_mask.npy'))) > 0
  test_mask = np.load(
      tf.gfile.Open(os.path.join(data_path, 'ppi.test_mask.npy'))) > 0

  return adj, features, labels, train_mask, val_mask, test_mask 
Example #9
Source File: reddit.py    From dgl with Apache License 2.0
def _load(self):
        # graph
        coo_adj = sp.load_npz(os.path.join(
            self._extract_dir, "reddit{}_graph.npz".format(self._self_loop_str)))
        self.graph = DGLGraph(coo_adj, readonly=True)
        # features and labels
        reddit_data = np.load(os.path.join(self._extract_dir, "reddit_data.npz"))
        self.features = reddit_data["feature"]
        self.labels = reddit_data["label"]
        self.num_labels = 41
        # train/val/test indices
        node_ids = reddit_data["node_ids"]
        node_types = reddit_data["node_types"]
        self.train_mask = (node_types == 1)
        self.val_mask = (node_types == 2)
        self.test_mask = (node_types == 3)

        print('Finished data loading.')
        print('  NumNodes: {}'.format(self.graph.number_of_nodes()))
        print('  NumEdges: {}'.format(self.graph.number_of_edges()))
        print('  NumFeats: {}'.format(self.features.shape[1]))
        print('  NumClasses: {}'.format(self.num_labels))
        print('  NumTrainingSamples: {}'.format(len(np.nonzero(self.train_mask)[0])))
        print('  NumValidationSamples: {}'.format(len(np.nonzero(self.val_mask)[0])))
        print('  NumTestSamples: {}'.format(len(np.nonzero(self.test_mask)[0]))) 
Example #10
Source File: reverse_transform.py    From multi-categorical-gans with BSD 3-Clause "New" or "Revised" License
def uscensus_reverse_transform(input_path, output_path, metadata_path):
    with open(metadata_path, "r") as metadata_file:
        metadata = json.load(metadata_file)

    features = load_npz(input_path)

    csv_file = open(output_path, "w")
    output = csv.DictWriter(csv_file, ["caseid"] + metadata["variables"])
    output.writeheader()

    for row_value_indices in features:
        _, selected_value_indices = row_value_indices.nonzero()
        # there should be one value per variable
        assert len(selected_value_indices) == len(metadata["variables"])

        row_dict = dict()

        for selected_value_index in selected_value_indices:
            variable, value = metadata["index_to_value"][selected_value_index]
            row_dict[variable] = value

        output.writerow(row_dict)

    csv_file.close() 
Example #11
Source File: reddit.py    From pytorch_geometric with MIT License
def process(self):
        data = np.load(osp.join(self.raw_dir, 'reddit_data.npz'))
        x = torch.from_numpy(data['feature']).to(torch.float)
        y = torch.from_numpy(data['label']).to(torch.long)
        split = torch.from_numpy(data['node_types'])

        adj = sp.load_npz(osp.join(self.raw_dir, 'reddit_graph.npz'))
        row = torch.from_numpy(adj.row).to(torch.long)
        col = torch.from_numpy(adj.col).to(torch.long)
        edge_index = torch.stack([row, col], dim=0)
        edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

        data = Data(x=x, edge_index=edge_index, y=y)
        data.train_mask = split == 1
        data.val_mask = split == 2
        data.test_mask = split == 3

        data = data if self.pre_transform is None else self.pre_transform(data)

        torch.save(self.collate([data]), self.processed_paths[0]) 
Example #12
Source File: dict.py    From asreview with Apache License 2.0
def get_feature_matrix(self, data_hash):
        my_data = self._state_dict["data_properties"][data_hash]
        encoded_X = my_data["feature_matrix"]
        matrix_type = my_data["matrix_type"]
        if matrix_type == "ndarray":
            return np.array(encoded_X)
        elif matrix_type == "csr_matrix":
            with BytesIO(b64decode(encoded_X)) as f:
                return load_npz(f)
        return encoded_X 
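Example #12 above decodes a base64 string back into a CSR matrix. The following is a rough sketch, not taken from the asreview project, of how such a string could be produced in the first place by writing the matrix into an in-memory buffer with save_npz.

from base64 import b64decode, b64encode
from io import BytesIO

import numpy as np
from scipy.sparse import csr_matrix, load_npz, save_npz

# serialize a CSR matrix into an in-memory .npz archive and base64-encode it
matrix = csr_matrix(np.eye(3))
with BytesIO() as f:
    save_npz(f, matrix)
    encoded = b64encode(f.getvalue()).decode('ascii')

# decoding mirrors the get_feature_matrix() branch above
with BytesIO(b64decode(encoded)) as f:
    restored = load_npz(f)
print((matrix != restored).nnz == 0)  # True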
Example #13
Source File: wikidatagraph.py    From opentapioca with Apache License 2.0
def load_from_matrix(self, fname):
        self.mat = sparse.load_npz(fname)
        self.shape = self.mat.shape[1] 
Example #14
Source File: utils.py    From SGC with MIT License
def loadRedditFromNPZ(dataset_dir):
    adj = sp.load_npz(dataset_dir+"reddit_adj.npz")
    data = np.load(dataset_dir+"reddit.npz")

    return adj, data['feats'], data['y_train'], data['y_val'], data['y_test'], data['train_index'], data['val_index'], data['test_index'] 
Example #15
Source File: formats.py    From multi-categorical-gans with BSD 3-Clause "New" or "Revised" License
def load_sparse(features_path, transform=True):
    features = load_npz(features_path)
    if transform:
        features = np.asarray(features.todense()).astype(np.float32)
    return features 
Example #16
Source File: test_hicAverageRegions.py    From HiCExplorer with GNU General Public License v3.0
def test_average_regions_range_in_bins_end():

    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    matrix = ROOT + 'small_test_matrix.cool'
    bed_file = ROOT + 'hicAverageRegions/regions_multi.bed'
    args = "--matrix {} --regions  {} -o {} --rangeInBins 100 100 -cb {}".format(matrix, bed_file, outfile.name, 'end').split()
    hicAverageRegions.main(args)

    test_file = load_npz(ROOT + 'hicAverageRegions/regions_multi_range_in_bins_end.npz')
    new_file = load_npz(outfile.name)

    nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)

    os.remove(outfile.name) 
Example #17
Source File: data_utils.py    From gcnn-survey-paper with Apache License 2.0
def load_test_edge_mask(dataset_str, data_path, drop_edge_prop):
  """Remove test edges by loading edge masks."""
  edge_mask_path = os.path.join(
      data_path, 'emask.{}.remove{}.npz'.format(dataset_str, drop_edge_prop))
  with tf.gfile.Open(edge_mask_path) as f:
    mask = sp.load_npz(f)
  return mask 
Example #18
Source File: load_data.py    From neural_graph_collaborative_filtering with MIT License
def get_adj_mat(self):
        try:
            t1 = time()
            adj_mat = sp.load_npz(self.path + '/s_adj_mat.npz')
            norm_adj_mat = sp.load_npz(self.path + '/s_norm_adj_mat.npz')
            mean_adj_mat = sp.load_npz(self.path + '/s_mean_adj_mat.npz')
            print('already load adj matrix', adj_mat.shape, time() - t1)

        except Exception:
            adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat()
            sp.save_npz(self.path + '/s_adj_mat.npz', adj_mat)
            sp.save_npz(self.path + '/s_norm_adj_mat.npz', norm_adj_mat)
            sp.save_npz(self.path + '/s_mean_adj_mat.npz', mean_adj_mat)
        return adj_mat, norm_adj_mat, mean_adj_mat 
Example #19
Source File: loader_nfm.py    From knowledge_graph_attention_network with MIT License
def get_kg_feature(self, kg_feat_file):
        try:
            kg_feat_mat = sp.load_npz(kg_feat_file)
            print('already load item kg feature mat', kg_feat_mat.shape)
        except Exception:
            kg_feat_mat = self._create_kg_feat_mat()
            sp.save_npz(kg_feat_file, kg_feat_mat)
            print('already save item kg feature mat:', kg_feat_file)
        return kg_feat_mat 
Example #20
Source File: cooc.py    From ALaCarte with MIT License
def alacache(nameroot, feature='ngram'):
  ''' function to return output of this script
  Args:
    nameroot: root of files (without extensions); the input argument 'outputroot'
    feature: string name of feature that was computed
  Returns:
    if the file stores a word-by-word cooccurrence matrix: returns the cooc matrix, word vocab, and word counts; otherwise also returns the feature vocab and feature counts
  '''

  matrix = sp.load_npz(nameroot+'.npz')
  with open(nameroot+'.pkl', 'rb') as f:
    data = pickle.load(f)
  if len(data) == 2:
    return matrix, data['words'], data['counts']
  return matrix, data['words'], data['wordcounts'], data[feature+'s'], data[feature+'counts'] 
Example #21
Source File: make-trie.py    From isdi with MIT License
def join_mats(fnames, s, e):
    ofname = "mat_{}_{}".format(s, e)
    print(ofname, fnames)
    M = [sps.load_npz(f) for f in fnames]
    print("Done reading..")
    sps.save_npz(
        ofname,
        sps.vstack(M)
    ) 
Example #22
Source File: make-trie.py    From isdi with MIT License
def join_smart_mat(fnames):
    """Join arrays in Mlist inplace"""
    # M.indptr M.indices
    indptr = np.zeros(num_devices+1, dtype=np.int32)
    indices = np.zeros(Msize, dtype=np.int32)    
    i_indptr, i_indices = 0, 0
    ofname = 'joined_mat.npz'
    M = [None for _ in fnames]
    for i, mf in enumerate(fnames) :
        M[i] = sps.load_npz(mf)
        print("Loaded matrix={}. shape={}. nnz={}".format(mf, M[i].shape, M[i].nnz))
        # Mindptr = M.indptr
        # Mindices = M.indices
        # indptr[i_indptr+1:i_indptr+len(Mindptr)] = Mindptr[1:] + indptr[i_indptr]
        # i_indptr += len(Mindptr)-1
        # indices[i_indices:i_indices+len(Mindices)] = Mindices
        # i_indices += i_indices
        # del M
    print("Saving the file...")
    M = sps.csr_matrix(
        (np.ones(len(indices)), indices, indptr),
        shape=(len(indptr)-1, num_apps),
        dtype=bool
    )
    print(M.nnz)
    sps.save_npz(ofname, M) 
Example #23
Source File: datasets.py    From interpret-community with MIT License
def retrieve_dataset(dataset, **kwargs):
    # if data not extracted, download zip and extract
    outdirname = 'datasets.12.18.2019'
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + '.zip'
        urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)
        with zipfile.ZipFile(zipfilename, 'r') as unzip:
            unzip.extractall('.')
    extension = os.path.splitext(dataset)[1]
    filepath = os.path.join(outdirname, dataset)
    if extension == '.npz':
        # sparse format file
        from scipy.sparse import load_npz
        return load_npz(filepath)
    elif extension == '.svmlight':
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)
    elif extension == '.json':
        import json
        with open(filepath, encoding='utf-8') as f:
            dataset = json.load(f)
        return dataset
    elif extension == '.csv':
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)
    else:
        raise Exception('Unrecognized file extension: ' + extension) 
Example #24
Source File: features.py    From SecuML with GNU General Public License v2.0
def get_matrix(features_files, num_instances, sparse=False):
        if not sparse:
            iterator = FeaturesFromExp.get_matrix_iterator(features_files,
                                                           num_instances)
            features = np.vstack(tuple(r for r in iterator))
        else:
            features = None
            for _, f_path, f_mask in features_files:
                indices = np.where(f_mask)[0]
                matrix = load_npz(f_path)[:, indices]
                if features is None:
                    features = matrix
                else:
                    features = hstack([features, matrix])
        return features 
Example #25
Source File: peerread_output_ate.py    From causal-text-embeddings with MIT License
def load_term_counts(path='../dat/reddit/'):
	return load_npz(path + 'term_counts.npz').toarray() 
Example #26
Source File: hashing.py    From deep_architect with MIT License
def load_state(self, folderpath):
        state = ut.read_jsonfile(
            ut.join_paths([folderpath, 'hash_model_state.json']))
        self.vals_lst = state['vals_lst']
        num_evals = state['num_evals']
        for i in range(num_evals):
            self.vecs_lst.append(
                sp.load_npz(ut.join_paths([folderpath,
                                           str(i) + '.npz'])))
        if num_evals > 0:
            self._refit() 
Example #27
Source File: utils.py    From DropEdge with MIT License
def loadRedditFromNPZ(dataset_dir=datadir):
    adj = sp.load_npz(dataset_dir+"reddit_adj.npz")
    data = np.load(dataset_dir +"reddit.npz")

    return adj, data['feats'], data['y_train'], data['y_val'], data['y_test'], data['train_index'], data['val_index'], data['test_index'] 
Example #28
Source File: attacked_data.py    From DeepRobust with MIT License
def load_data(self):
        if not osp.exists(self.data_filename):
            self.download_npz()
        print('Loading {} dataset perturbed by 0.05 mettack...'.format(self.name))
        adj = sp.load_npz(self.data_filename)
        warnings.warn('''the adjacency matrix is perturbed, using the data splits under seed 15(default seed for deeprobust.graph.data.Dataset), so if you are going to verify the attacking performance, you should use the same data splits''')
        return adj 
Example #29
Source File: attacked_data.py    From DeepRobust with MIT License
def load_data(self):
        if not osp.exists(self.data_filename):
            self.download_npz()
        print('Loading {} dataset perturbed by {} {}...'.format(self.name, self.ptb_rate, self.attack_method))

        if self.attack_method == 'meta':
            warnings.warn('''the pre-attacked graph is perturbed, using the data splits under seed 15 (default seed), so if you are going to verify the attacking performance, you should use the same data splits.''')
            adj = sp.load_npz(self.data_filename)

        if self.attack_method == 'nettack':
            assert True, "Will update pre-attacked data by nettack soon"
            adj = sp.load_npz(self.data_filename)

        return adj 
Example #30
Source File: plot_adjustment.py    From causal-text-embeddings with MIT License
def load_terms(data):
	termfile = '../dat/' + data + '/term_counts.npz'
	if data == 'reddit':
		termfile = '../dat/' + data + '_term_counts.npz'
	term_counts = load_npz(termfile).toarray()
	if drop_terms:
		term_indices = np.arange(term_counts.shape[1])
		random_indices = np.random.choice(term_indices, 1000)
		term_counts = term_counts[:,random_indices]
	return term_counts