Python scipy.sparse.load_npz() Examples
The following are 30 code examples of scipy.sparse.load_npz().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module scipy.sparse, or try the search function.
Example #1
Source File: data_utils.py From gcnn-survey-paper with Apache License 2.0 | 6 votes |
def load_edge_masks(dataset_str, data_path, adj_true, drop_edge_prop):
    """Load the val/test edge masks and build the training adjacency.

    The masks are read from
    ``<data_path>/emask.<dataset_str>.remove<drop_edge_prop>.val.npz`` and
    ``...test.npz``.

    Args:
        dataset_str: dataset name used in the mask file names.
        data_path: path to the data folder.
        adj_true: true adjacency matrix in dense format.
        drop_edge_prop: proportion of removed edges, used in the file names.

    Returns:
        A 4-tuple ``(adj_train, val, val, test)`` where ``adj_train`` is
        ``adj_true`` with the val and test edges zeroed out and the other
        entries are ``sparse_to_tuple`` of the corresponding mask.

    NOTE(review): the second element is built from the *validation* mask,
    not from the train mask computed here -- confirm whether callers expect
    a train mask in that position.
    """
    mask_prefix = os.path.join(
        data_path, 'emask.{}.remove{}.'.format(dataset_str, drop_edge_prop))
    val_mask = sp.load_npz(mask_prefix + 'val.npz')
    test_mask = sp.load_npz(mask_prefix + 'test.npz')

    # Everything that is neither a val nor a test edge is a train edge.
    train_mask = 1. - val_mask.todense() - test_mask.todense()

    # Remove val and test edges from the true adjacency.
    adj_train = np.multiply(adj_true, train_mask)

    # Drop the diagonal from the (local) train mask.
    train_mask -= np.eye(train_mask.shape[0])

    return (
        adj_train,
        sparse_to_tuple(val_mask),
        sparse_to_tuple(val_mask),
        sparse_to_tuple(test_mask),
    )
Example #2
Source File: vectorizers.py From recsys2019 with Apache License 2.0 | 6 votes |
def save_to_one_flie_csrs(self, fns):
    """Merge chunk matrices into the single ``Xcsr.h5`` file.

    Every file name in ``fns`` is loaded from
    ``<self.output_folder>/chunks``, cast to ``float32`` and written to the
    ``"matrix"`` dataset of ``<self.output_folder>/Xcsr.h5``: the first
    chunk creates the dataset, later chunks are appended to it.

    Args:
        fns: iterable of chunk file names, in the order they are stored.
    """
    save_as = os.path.join(self.output_folder, "Xcsr.h5")
    try:
        os.unlink(save_as)
    except OSError:
        # Best effort: a stale output file is removed if there is one.
        pass

    h5f = h5sparse.File(save_as)
    try:
        for position, fn in enumerate(fns):
            logger.info(f"Saving {fn}")
            mat = load_npz(os.path.join(self.output_folder, "chunks", fn)).astype(np.float32)
            if position == 0:
                h5f.create_dataset("matrix", data=mat, chunks=(10_000_000,), maxshape=(None,))
            else:
                h5f["matrix"].append(mat)
            gc.collect()
    finally:
        # Close the file even when a chunk fails to load or append.
        h5f.close()
Example #3
Source File: test_hicAverageRegions.py From HiCExplorer with GNU General Public License v3.0 | 6 votes |
def test_average_regions_single():
    """Run hicAverageRegions on ``regions.bed`` and compare with the reference."""
    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    # Only the path is needed; release the handle so the tool can write to it.
    outfile.close()
    try:
        matrix = ROOT + 'small_test_matrix.cool'
        bed_file = ROOT + 'hicAverageRegions/regions.bed'
        args = "--matrix {} --regions {} -o {} --range 100000 100000".format(matrix, bed_file, outfile.name).split()
        log.debug('path: {}'.format(matrix))
        hicAverageRegions.main(args)

        test_file = load_npz(ROOT + 'hicAverageRegions/result_range_100000.npz')
        new_file = load_npz(outfile.name)
        nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)
    finally:
        # Remove the temporary output even when the comparison fails.
        os.remove(outfile.name)
Example #4
Source File: test_hicAverageRegions.py From HiCExplorer with GNU General Public License v3.0 | 6 votes |
def test_average_regions_center():
    """Run hicAverageRegions with ``-cb center`` and compare with the reference."""
    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    # Only the path is needed; release the handle so the tool can write to it.
    outfile.close()
    try:
        matrix = ROOT + 'small_test_matrix.cool'
        bed_file = ROOT + 'hicAverageRegions/regions_multi.bed'
        args = "--matrix {} --regions {} -o {} --range 100000 100000 -cb {}".format(matrix, bed_file, outfile.name, 'center').split()
        log.debug('path: {}'.format(matrix))
        hicAverageRegions.main(args)

        test_file = load_npz(ROOT + 'hicAverageRegions/regions_multi_center.npz')
        new_file = load_npz(outfile.name)
        nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)
    finally:
        # Remove the temporary output even when the comparison fails.
        os.remove(outfile.name)
Example #5
Source File: test_hicAverageRegions.py From HiCExplorer with GNU General Public License v3.0 | 6 votes |
def test_average_regions_start():
    """Run hicAverageRegions with ``-cb start`` and compare with the reference."""
    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    # Only the path is needed; release the handle so the tool can write to it.
    outfile.close()
    try:
        matrix = ROOT + 'small_test_matrix.cool'
        bed_file = ROOT + 'hicAverageRegions/regions_multi.bed'
        args = "--matrix {} --regions {} -o {} --range 100000 100000 -cb {}".format(matrix, bed_file, outfile.name, 'start').split()
        log.debug('path: {}'.format(matrix))
        hicAverageRegions.main(args)

        test_file = load_npz(ROOT + 'hicAverageRegions/regions_multi_start.npz')
        new_file = load_npz(outfile.name)
        nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)
    finally:
        # Remove the temporary output even when the comparison fails.
        os.remove(outfile.name)
Example #6
Source File: reddit_posts.py From causal-text-embeddings with MIT License | 6 votes |
def load_term_counts(path='../dat/', force_redo=False):
    """Return post term counts, author term counts and the vocabulary.

    The three artefacts are cached next to each other under ``path``. When
    the post-count file exists and ``force_redo`` is false they are loaded
    from disk; otherwise they are recomputed from the reddit data and saved.

    Args:
        path: prefix (usually a directory ending in a separator) for the
            cache files.
        force_redo: recompute even when the cache is present.

    Returns:
        ``(counts, author_counts, vocab)``.
    """
    posts_stem = path + 'reddit_term_counts'
    authors_stem = path + 'reddit_author_term_counts'
    vocab_stem = path + 'vocab'

    # Only the post-count file is probed to decide whether the cache exists.
    if os.path.exists(posts_stem + '.npz') and not force_redo:
        cached_counts = sparse.load_npz(posts_stem + '.npz')
        cached_author_counts = sparse.load_npz(authors_stem + '.npz')
        cached_vocab = np.load(vocab_stem + '.npy')
        return cached_counts, cached_author_counts, cached_vocab

    reddit = load_reddit()
    post_docs = reddit['post_text'].values

    # One document per author: all of that author's posts joined by spaces.
    per_author = reddit.groupby('author')['post_text'].apply(lambda x: ' '.join(x)).reset_index()
    author_docs = per_author['post_text'].values

    counts, vocab, vec = tokenize_documents(post_docs)
    author_counts = vec.transform(author_docs)

    sparse.save_npz(posts_stem, counts)
    sparse.save_npz(authors_stem, author_counts)
    np.save(vocab_stem, vocab)
    return counts, author_counts, vocab
Example #7
Source File: data_utils.py From gcnn-survey-paper with Apache License 2.0 | 6 votes |
def add_top_k_edges(data, edge_mask_path, gae_scores_path, topk, nb_nodes, norm_adj):
    """Load GAE scores and add the top-k scored edges to the train adjacency.

    Args:
        data: mapping providing ``'adj_true'`` and ``'adj_train'``.
        edge_mask_path: folder containing ``test_mask.npz``.
        gae_scores_path: folder containing ``gae_scores.npy``.
        topk: rank (in descending score order) of the score used as the
            strict threshold for adding edges.
        nb_nodes: number of nodes; scores are reshaped to
            ``(nb_nodes, nb_nodes)``.
        norm_adj: when true, ``data['adj_train']`` is passed through
            ``normalize_adj``; otherwise it is wrapped in a COO matrix.

    Returns:
        ``(adj_train_curr, sparse_to_tuple(adj_train_norm))``.

    NOTE(review): the second return value is derived from
    ``data['adj_train']``, not from the augmented adjacency built here, and
    the score mask hard-codes a split at index 140 -- confirm both.
    """
    test_mask = sp.load_npz(os.path.join(edge_mask_path, 'test_mask.npz'))
    keep_mask = 1. - test_mask.todense()

    # Remove the test edges from the true adjacency.
    adj_train_curr = np.multiply(data['adj_true'], keep_mask)

    # Predict test edges using precomputed scores.
    raw_scores = np.load(os.path.join(gae_scores_path, 'gae_scores.npy'))

    # Only the two off-diagonal blocks around index 140 keep their score.
    block_mask = np.zeros((nb_nodes, nb_nodes))
    block_mask[:140, 140:] = 1.
    block_mask[140:, :140] = 1.
    flat_scores = np.multiply(raw_scores, block_mask).reshape((-1,))

    descending = np.argsort(-flat_scores)
    threshold = flat_scores[descending[topk]]

    # Add every edge scoring strictly above the threshold, then binarise.
    adj_train_curr += 1 * (flat_scores > threshold).reshape((nb_nodes, nb_nodes))
    adj_train_curr = 1 * (adj_train_curr > 0)

    adj_train_norm = (
        normalize_adj(data['adj_train']) if norm_adj
        else sp.coo_matrix(data['adj_train'])
    )
    return adj_train_curr, sparse_to_tuple(adj_train_norm)
Example #8
Source File: data_utils.py From gcnn-survey-paper with Apache License 2.0 | 6 votes |
def load_ppi_data(data_path):
    """Load the PPI dataset files found in ``data_path``.

    Args:
        data_path: folder containing the ``ppi.*`` files.

    Returns:
        ``(adj, features, labels, train_mask, val_mask, test_mask)`` where
        ``labels`` is dense and the masks are boolean arrays (``> 0``).

    NOTE(review): ``val_mask`` is read from ``ppi.test_mask.npy``, the same
    file as ``test_mask`` -- confirm whether a separate validation mask
    file is expected.
    """
    with tf.gfile.Open(os.path.join(data_path, 'ppi.edges.npz')) as f:
        adj = sp.load_npz(f)
    with tf.gfile.Open(os.path.join(data_path, 'ppi.features.norm.npy')) as f:
        features = np.load(f)
    with tf.gfile.Open(os.path.join(data_path, 'ppi.labels.npz')) as f:
        labels = sp.load_npz(f).todense()

    # The mask files are opened in ``with`` blocks as well so that their
    # handles are closed instead of being left to the garbage collector.
    with tf.gfile.Open(os.path.join(data_path, 'ppi.train_mask.npy')) as f:
        train_mask = np.load(f) > 0
    with tf.gfile.Open(os.path.join(data_path, 'ppi.test_mask.npy')) as f:
        val_mask = np.load(f) > 0
    with tf.gfile.Open(os.path.join(data_path, 'ppi.test_mask.npy')) as f:
        test_mask = np.load(f) > 0
    return adj, features, labels, train_mask, val_mask, test_mask
Example #9
Source File: reddit.py From dgl with Apache License 2.0 | 6 votes |
def _load(self):
    """Populate graph, features, labels and split masks from the extracted files.

    Reads ``reddit<self_loop_str>_graph.npz`` and ``reddit_data.npz`` from
    ``self._extract_dir`` and prints a short summary of what was loaded.
    """
    # graph
    graph_file = "reddit{}_graph.npz".format(self._self_loop_str)
    coo_adj = sp.load_npz(os.path.join(self._extract_dir, graph_file))
    self.graph = DGLGraph(coo_adj, readonly=True)

    # features and labels
    data_file = os.path.join(self._extract_dir, "reddit_data.npz")
    reddit_data = np.load(data_file)
    self.features = reddit_data["feature"]
    self.labels = reddit_data["label"]
    self.num_labels = 41

    # train/val/test indices
    node_ids = reddit_data["node_ids"]  # read but not used afterwards
    node_types = reddit_data["node_types"]
    # node_types encodes the split: 1 = train, 2 = validation, 3 = test.
    self.train_mask = (node_types == 1)
    self.val_mask = (node_types == 2)
    self.test_mask = (node_types == 3)

    print('Finished data loading.')
    print(' NumNodes: {}'.format(self.graph.number_of_nodes()))
    print(' NumEdges: {}'.format(self.graph.number_of_edges()))
    print(' NumFeats: {}'.format(self.features.shape[1]))
    print(' NumClasses: {}'.format(self.num_labels))
    train_count = len(np.nonzero(self.train_mask)[0])
    print(' NumTrainingSamples: {}'.format(train_count))
    val_count = len(np.nonzero(self.val_mask)[0])
    print(' NumValidationSamples: {}'.format(val_count))
    test_count = len(np.nonzero(self.test_mask)[0])
    print(' NumTestSamples: {}'.format(test_count))
Example #10
Source File: reverse_transform.py From multi-categorical-gans with BSD 3-Clause "New" or "Revised" License | 6 votes |
def uscensus_reverse_transform(input_path, output_path, metadata_path):
    """Convert a sparse one-hot feature matrix back into a CSV file.

    Args:
        input_path: ``.npz`` file holding the sparse feature matrix.
        output_path: CSV file to write; its header is ``caseid`` followed
            by ``metadata["variables"]``.
        metadata_path: JSON file providing ``"variables"`` and
            ``"index_to_value"`` (column index -> ``(variable, value)``).

    Raises:
        AssertionError: if a row does not have exactly one non-zero column
            per variable.
    """
    with open(metadata_path, "r") as metadata_file:
        metadata = json.load(metadata_file)

    features = load_npz(input_path)

    # The context manager closes the CSV even when a row fails the check.
    with open(output_path, "w") as csv_file:
        output = csv.DictWriter(csv_file, ["caseid"] + metadata["variables"])
        output.writeheader()

        for row_value_indices in features:
            _, selected_value_indices = row_value_indices.nonzero()
            # there should be one value per variable
            assert len(selected_value_indices) == len(metadata["variables"])

            row_dict = dict()
            for selected_value_index in selected_value_indices:
                variable, value = metadata["index_to_value"][selected_value_index]
                row_dict[variable] = value

            # "caseid" is never set here, so that column is left empty.
            output.writerow(row_dict)
Example #11
Source File: reddit.py From pytorch_geometric with MIT License | 6 votes |
def process(self):
    """Build the processed dataset from the raw Reddit files and save it.

    Reads ``reddit_data.npz`` and ``reddit_graph.npz`` from ``self.raw_dir``,
    assembles one ``Data`` object with features, labels, edges and split
    masks, applies ``self.pre_transform`` when set, and stores the collated
    result at ``self.processed_paths[0]``.
    """
    raw = np.load(osp.join(self.raw_dir, 'reddit_data.npz'))
    x = torch.from_numpy(raw['feature']).to(torch.float)
    y = torch.from_numpy(raw['label']).to(torch.long)
    split = torch.from_numpy(raw['node_types'])

    # The ``row``/``col`` attributes are read directly from the loaded
    # matrix -- presumably it is stored in COO format; verify.
    adj = sp.load_npz(osp.join(self.raw_dir, 'reddit_graph.npz'))
    sources = torch.from_numpy(adj.row).to(torch.long)
    targets = torch.from_numpy(adj.col).to(torch.long)
    num_nodes = x.size(0)
    edge_index, _ = coalesce(
        torch.stack([sources, targets], dim=0), None, num_nodes, num_nodes)

    data = Data(x=x, edge_index=edge_index, y=y)
    # node_types encodes the split: 1 = train, 2 = validation, 3 = test.
    data.train_mask = split == 1
    data.val_mask = split == 2
    data.test_mask = split == 3

    if self.pre_transform is not None:
        data = self.pre_transform(data)

    torch.save(self.collate([data]), self.processed_paths[0])
Example #12
Source File: dict.py From asreview with Apache License 2.0 | 5 votes |
def get_feature_matrix(self, data_hash):
    """Return the feature matrix stored in the state for ``data_hash``.

    Args:
        data_hash: key into ``self._state_dict["data_properties"]``.

    Returns:
        A numpy array when the stored type is ``"ndarray"``, a sparse matrix
        decoded from its base64 ``.npz`` payload when it is ``"csr_matrix"``,
        and the stored value unchanged for any other type.
    """
    entry = self._state_dict["data_properties"][data_hash]
    payload = entry["feature_matrix"]
    kind = entry["matrix_type"]

    if kind == "ndarray":
        return np.array(payload)

    if kind == "csr_matrix":
        # The sparse matrix is stored as a base64-encoded .npz archive.
        with BytesIO(b64decode(payload)) as buffer:
            return load_npz(buffer)

    return payload
Example #13
Source File: wikidatagraph.py From opentapioca with Apache License 2.0 | 5 votes |
def load_from_matrix(self, fname):
    """Load the sparse matrix stored in ``fname`` into ``self.mat``.

    ``self.shape`` is set to the number of columns of the loaded matrix
    (a single integer, not the full shape tuple).
    """
    self.mat = sparse.load_npz(fname)
    column_count = self.mat.shape[1]
    self.shape = column_count
Example #14
Source File: utils.py From SGC with MIT License | 5 votes |
def loadRedditFromNPZ(dataset_dir):
    """Load the Reddit adjacency matrix and the arrays stored next to it.

    Args:
        dataset_dir: path prefix; the file names are appended to it
            directly, so it should end with a path separator.

    Returns:
        ``(adj, feats, y_train, y_val, y_test, train_index, val_index,
        test_index)``.
    """
    adj = sp.load_npz(dataset_dir+"reddit_adj.npz")
    data = np.load(dataset_dir+"reddit.npz")
    array_keys = ('feats', 'y_train', 'y_val', 'y_test',
                  'train_index', 'val_index', 'test_index')
    return (adj,) + tuple(data[key] for key in array_keys)
Example #15
Source File: formats.py From multi-categorical-gans with BSD 3-Clause "New" or "Revised" License | 5 votes |
def load_sparse(features_path, transform=True):
    """Load a sparse feature matrix from an ``.npz`` file.

    Args:
        features_path: path of the ``.npz`` file.
        transform: when true, return a dense ``float32`` numpy array instead
            of the sparse matrix.

    Returns:
        The sparse matrix as loaded, or its dense ``float32`` conversion.
    """
    sparse_features = load_npz(features_path)
    if not transform:
        return sparse_features
    dense_features = np.asarray(sparse_features.todense())
    return dense_features.astype(np.float32)
Example #16
Source File: test_hicAverageRegions.py From HiCExplorer with GNU General Public License v3.0 | 5 votes |
def test_average_regions_range_in_bins_end():
    """Run hicAverageRegions with ``--rangeInBins`` / ``-cb end`` and compare."""
    outfile = NamedTemporaryFile(suffix='.npz', prefix='average_region', delete=False)
    # Only the path is needed; release the handle so the tool can write to it.
    outfile.close()
    try:
        matrix = ROOT + 'small_test_matrix.cool'
        bed_file = ROOT + 'hicAverageRegions/regions_multi.bed'
        args = "--matrix {} --regions {} -o {} --rangeInBins 100 100 -cb {}".format(matrix, bed_file, outfile.name, 'end').split()
        hicAverageRegions.main(args)

        test_file = load_npz(ROOT + 'hicAverageRegions/regions_multi_range_in_bins_end.npz')
        new_file = load_npz(outfile.name)
        nt.assert_almost_equal(test_file.data, new_file.data, decimal=0)
    finally:
        # Remove the temporary output even when the comparison fails.
        os.remove(outfile.name)
Example #17
Source File: data_utils.py From gcnn-survey-paper with Apache License 2.0 | 5 votes |
def load_test_edge_mask(dataset_str, data_path, drop_edge_prop):
    """Load the edge mask stored for a dataset / drop proportion pair.

    Args:
        dataset_str: dataset name used in the mask file name.
        data_path: folder containing the mask file.
        drop_edge_prop: drop proportion used in the mask file name.

    Returns:
        The sparse mask read from
        ``<data_path>/emask.<dataset_str>.remove<drop_edge_prop>.npz``.
    """
    mask_file = 'emask.{}.remove{}.npz'.format(dataset_str, drop_edge_prop)
    with tf.gfile.Open(os.path.join(data_path, mask_file)) as handle:
        return sp.load_npz(handle)
Example #18
Source File: load_data.py From neural_graph_collaborative_filtering with MIT License | 5 votes |
def get_adj_mat(self):
    """Return the plain, normalised and mean adjacency matrices.

    The three matrices are loaded from ``.npz`` files under ``self.path``.
    If loading fails for any reason they are rebuilt with
    ``self.create_adj_mat()`` and written to those files.

    Returns:
        ``(adj_mat, norm_adj_mat, mean_adj_mat)``.
    """
    cache_suffixes = ('/s_adj_mat.npz', '/s_norm_adj_mat.npz', '/s_mean_adj_mat.npz')
    try:
        started = time()
        matrices = tuple(sp.load_npz(self.path + suffix) for suffix in cache_suffixes)
        print('already load adj matrix', matrices[0].shape, time() - started)
    except Exception:
        # Any failure while loading is treated as a missing cache.
        plain, normalised, mean = self.create_adj_mat()
        matrices = (plain, normalised, mean)
        for suffix, matrix in zip(cache_suffixes, matrices):
            sp.save_npz(self.path + suffix, matrix)
    return matrices
Example #19
Source File: loader_nfm.py From knowledge_graph_attention_network with MIT License | 5 votes |
def get_kg_feature(self, kg_feat_file):
    """Return the item KG feature matrix, using ``kg_feat_file`` as a cache.

    Args:
        kg_feat_file: path of the ``.npz`` cache file.

    Returns:
        The matrix loaded from ``kg_feat_file``; if loading fails for any
        reason, the matrix built by ``self._create_kg_feat_mat()`` (which is
        then saved to ``kg_feat_file``).
    """
    try:
        feature_matrix = sp.load_npz(kg_feat_file)
        print('already load item kg feature mat', feature_matrix.shape)
    except Exception:
        # Any failure while loading is treated as a missing cache.
        feature_matrix = self._create_kg_feat_mat()
        sp.save_npz(kg_feat_file, feature_matrix)
        print('already save item kg feature mat:', kg_feat_file)

    return feature_matrix
Example #20
Source File: cooc.py From ALaCarte with MIT License | 5 votes |
def alacache(nameroot, feature='ngram'):
    '''Return the cached output of this script.

    Args:
      nameroot: root of files (without extensions); the input argument 'outputroot'
      feature: string name of feature that was computed
    Returns:
      for a word x word cooccurrence file (a pickle with exactly two
      entries): cooc matrix, word vocab, word counts;
      otherwise: cooc matrix, word vocab, word counts, feature vocab and
      feature counts
    '''
    cooc = sp.load_npz(nameroot+'.npz')

    # NOTE: pickle.load executes arbitrary code from the file; only use
    # this on cache files produced by a trusted run.
    with open(nameroot+'.pkl', 'rb') as handle:
        payload = pickle.load(handle)

    if len(payload) != 2:
        return (cooc, payload['words'], payload['wordcounts'],
                payload[feature+'s'], payload[feature+'counts'])
    return cooc, payload['words'], payload['counts']
Example #21
Source File: make-trie.py From isdi with MIT License | 5 votes |
def join_mats(fnames, s, e):
    """Stack the matrices in ``fnames`` vertically and save the result.

    Args:
        fnames: paths of the ``.npz`` matrices, in stacking order.
        s: first value embedded in the output name ``mat_<s>_<e>``.
        e: second value embedded in the output name.
    """
    ofname = "mat_{}_{}".format(s, e)
    print(ofname, fnames)

    matrices = list(map(sps.load_npz, fnames))
    print("Done reading..")

    stacked = sps.vstack(matrices)
    sps.save_npz(ofname, stacked)
Example #22
Source File: make-trie.py From isdi with MIT License | 5 votes |
def join_smart_mat(fnames):
    """Load every matrix in ``fnames`` and write ``joined_mat.npz``.

    NOTE(review): the statements that would copy each loaded matrix into
    the joined ``indptr`` / ``indices`` buffers are commented out, so the
    saved matrix is built from the zero-initialised buffers only. Confirm
    whether this function is still meant to be used.
    """
    # Buffers for the joined CSR structure (M.indptr / M.indices).
    indptr = np.zeros(num_devices+1, dtype=np.int32)
    indices = np.zeros(Msize, dtype=np.int32)
    i_indptr, i_indices = 0, 0
    ofname = 'joined_mat.npz'

    loaded = [None for _ in fnames]
    for position, matrix_file in enumerate(fnames):
        current = sps.load_npz(matrix_file)
        loaded[position] = current
        print("Loaded matrix={}. shape={}. nnz={}".format(matrix_file, current.shape, current.nnz))
        # Disabled merge step, kept for reference:
        # Mindptr = M.indptr
        # Mindices = M.indices
        # indptr[i_indptr+1:i_indptr+len(Mindptr)] = Mindptr[1:] + indptr[i_indptr]
        # i_indptr += len(Mindptr)-1
        # indices[i_indices:i_indices+len(Mindices)] = Mindices
        # i_indices += i_indices
        # del M

    print("Saving the file...")
    joined = sps.csr_matrix(
        (np.ones(len(indices)), indices, indptr),
        shape=(len(indptr)-1, num_apps),
        dtype=bool,
    )
    print(joined.nnz)
    sps.save_npz(ofname, joined)
Example #23
Source File: datasets.py From interpret-community with MIT License | 5 votes |
def retrieve_dataset(dataset, **kwargs):
    """Load a test dataset, downloading the dataset archive when needed.

    Args:
        dataset: file name inside the ``datasets.12.18.2019`` folder; its
            extension selects the loader.
        **kwargs: forwarded to ``pandas.read_csv`` for ``.csv`` files.

    Returns:
        The loaded object: a sparse matrix (``.npz``), the result of
        ``load_svmlight_file`` (``.svmlight``), parsed JSON (``.json``) or a
        DataFrame (``.csv``).

    Raises:
        Exception: if the file extension is not one of the above.
    """
    outdirname = 'datasets.12.18.2019'

    # if data not extracted, download zip and extract
    if not os.path.exists(outdirname):
        try:
            from urllib import urlretrieve
        except ImportError:
            from urllib.request import urlretrieve
        import zipfile
        zipfilename = outdirname + '.zip'
        urlretrieve('https://publictestdatasets.blob.core.windows.net/data/' + zipfilename, zipfilename)
        with zipfile.ZipFile(zipfilename, 'r') as archive:
            archive.extractall('.')

    _, extension = os.path.splitext(dataset)
    filepath = os.path.join(outdirname, dataset)

    if extension == '.npz':
        # sparse format file
        from scipy.sparse import load_npz
        return load_npz(filepath)

    if extension == '.svmlight':
        from sklearn import datasets
        return datasets.load_svmlight_file(filepath)

    if extension == '.json':
        import json
        with open(filepath, encoding='utf-8') as handle:
            return json.load(handle)

    if extension == '.csv':
        import pandas as pd
        return pd.read_csv(filepath, **kwargs)

    raise Exception('Unrecognized file extension: ' + extension)
Example #24
Source File: features.py From SecuML with GNU General Public License v2.0 | 5 votes |
def get_matrix(features_files, num_instances, sparse=False):
    """Assemble the feature matrix from a list of feature files.

    Args:
        features_files: iterable of 3-tuples whose second item is a file
            path and whose third item is a column mask.
        num_instances: forwarded to
            ``FeaturesFromExp.get_matrix_iterator`` in the dense case.
        sparse: when true, load each file as a sparse ``.npz`` matrix, keep
            the masked columns and stack the results horizontally.

    Returns:
        The stacked matrix; ``None`` in the sparse case when
        ``features_files`` is empty.
    """
    if not sparse:
        row_blocks = FeaturesFromExp.get_matrix_iterator(features_files, num_instances)
        return np.vstack(tuple(row_blocks))

    stacked = None
    for _, f_path, f_mask in features_files:
        kept_columns = np.where(f_mask)[0]
        selected = load_npz(f_path)[:, kept_columns]
        # The first file seeds the result; later ones are appended to it.
        stacked = selected if stacked is None else hstack([stacked, selected])
    return stacked
Example #25
Source File: peerread_output_ate.py From causal-text-embeddings with MIT License | 5 votes |
def load_term_counts(path='../dat/reddit/'):
    """Return the term-count matrix under ``path`` as a dense array.

    Args:
        path: prefix to which ``term_counts.npz`` is appended directly, so
            it should end with a path separator.
    """
    counts_file = path + 'term_counts.npz'
    sparse_counts = load_npz(counts_file)
    return sparse_counts.toarray()
Example #26
Source File: hashing.py From deep_architect with MIT License | 5 votes |
def load_state(self, folderpath):
    """Restore the model state saved in ``folderpath``.

    Reads ``hash_model_state.json`` for ``vals_lst`` and ``num_evals``,
    appends the matrices ``0.npz`` .. ``<num_evals - 1>.npz`` to
    ``self.vecs_lst`` and refits when at least one evaluation was loaded.

    Args:
        folderpath: folder containing the JSON state file and the matrices.
    """
    state_file = ut.join_paths([folderpath, 'hash_model_state.json'])
    state = ut.read_jsonfile(state_file)
    self.vals_lst = state['vals_lst']
    num_evals = state['num_evals']

    for eval_index in range(num_evals):
        vec_file = ut.join_paths([folderpath, str(eval_index) + '.npz'])
        self.vecs_lst.append(sp.load_npz(vec_file))

    if num_evals > 0:
        self._refit()
Example #27
Source File: utils.py From DropEdge with MIT License | 5 votes |
def loadRedditFromNPZ(dataset_dir=datadir):
    """Load the Reddit adjacency matrix and the arrays stored next to it.

    Args:
        dataset_dir: path prefix; the file names are appended to it
            directly, so it should end with a path separator.

    Returns:
        ``(adj, feats, y_train, y_val, y_test, train_index, val_index,
        test_index)``.
    """
    adj = sp.load_npz(dataset_dir+"reddit_adj.npz")
    data = np.load(dataset_dir +"reddit.npz")
    array_keys = ('feats', 'y_train', 'y_val', 'y_test',
                  'train_index', 'val_index', 'test_index')
    return (adj,) + tuple(data[key] for key in array_keys)
Example #28
Source File: attacked_data.py From DeepRobust with MIT License | 5 votes |
def load_data(self):
    """Load the pre-perturbed adjacency matrix for this dataset.

    Downloads the ``.npz`` file first when ``self.data_filename`` does not
    exist, and warns that the perturbation is tied to the seed-15 splits.

    Returns:
        The sparse adjacency matrix stored in ``self.data_filename``.
    """
    already_downloaded = osp.exists(self.data_filename)
    if not already_downloaded:
        self.download_npz()

    print('Loading {} dataset perturbed by 0.05 mettack...'.format(self.name))
    perturbed_adj = sp.load_npz(self.data_filename)

    warnings.warn('''the adjacency matrix is perturbed, using the data splits under seed 15(default seed for deeprobust.graph.data.Dataset), so if you are going to verify the attacking performance, you should use the same data splits''')
    return perturbed_adj
Example #29
Source File: attacked_data.py From DeepRobust with MIT License | 5 votes |
def load_data(self):
    """Load the pre-attacked adjacency matrix for this dataset.

    Downloads the ``.npz`` file first when ``self.data_filename`` does not
    exist. For the ``'meta'`` attack a warning reminds the caller that the
    perturbation is tied to the seed-15 data splits.

    Returns:
        The sparse adjacency matrix stored in ``self.data_filename``.
    """
    if not osp.exists(self.data_filename):
        self.download_npz()
    print('Loading {} dataset perturbed by {} {}...'.format(self.name, self.ptb_rate, self.attack_method))

    if self.attack_method == 'meta':
        warnings.warn('''the pre-attacked graph is perturbed, using the data splits under seed 15 (default seed), so if you are going to verify the attacking performance, you should use the same data splits.''')

    # Every attack method reads the same file, so it is loaded once here.
    # This replaces two identical per-method loads (one guarded by a no-op
    # ``assert True``) and guarantees the return value is always bound.
    return sp.load_npz(self.data_filename)
Example #30
Source File: plot_adjustment.py From causal-text-embeddings with MIT License | 5 votes |
def load_terms(data):
    """Load the dense term-count matrix for the dataset named ``data``.

    The file is ``../dat/reddit_term_counts.npz`` for ``'reddit'`` and
    ``../dat/<data>/term_counts.npz`` otherwise. When the module-level
    ``drop_terms`` flag is truthy, 1000 columns are drawn with
    ``np.random.choice`` and only those columns are returned.

    Args:
        data: dataset name.

    Returns:
        The term counts as a dense array.
    """
    if data == 'reddit':
        termfile = '../dat/' + data + '_term_counts.npz'
    else:
        termfile = '../dat/' + data + '/term_counts.npz'

    term_counts = load_npz(termfile).toarray()
    if not drop_terms:
        return term_counts

    all_columns = np.arange(term_counts.shape[1])
    sampled_columns = np.random.choice(all_columns, 1000)
    return term_counts[:, sampled_columns]