Python gzip.open() Examples
The following are 29 code examples of gzip.open(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the gzip module, or try the search function.
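Before the examples, here is a minimal standalone sketch of the two common calling patterns for gzip.open(): binary mode (the default, 'rb'/'wb', which yields bytes) and text mode ('rt'/'wt', which yields str and accepts an encoding). The file name data.txt.gz is only a placeholder.

import gzip

# Text mode: gzip.open handles encoding and newlines itself.
with gzip.open('data.txt.gz', 'wt', encoding='utf-8') as f:
    f.write('hello, gzip\n')

with gzip.open('data.txt.gz', 'rt', encoding='utf-8') as f:
    print(f.read())        # 'hello, gzip'

# Binary mode (the default): read and write raw bytes.
with gzip.open('data.txt.gz', 'rb') as f:
    print(f.read()[:5])    # b'hello'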
Example #1
Source File: utils.py From deep-learning-note with MIT License | 9 votes |
def parse_data(path, dataset, flatten):
    if dataset != 'train' and dataset != 't10k':
        raise NameError('dataset must be train or t10k')

    label_file = os.path.join(path, dataset + '-labels-idx1-ubyte')
    with open(label_file, 'rb') as file:
        _, num = struct.unpack(">II", file.read(8))
        labels = np.fromfile(file, dtype=np.int8)  # int8
        new_labels = np.zeros((num, 10))
        new_labels[np.arange(num), labels] = 1

    img_file = os.path.join(path, dataset + '-images-idx3-ubyte')
    with open(img_file, 'rb') as file:
        _, num, rows, cols = struct.unpack(">IIII", file.read(16))
        imgs = np.fromfile(file, dtype=np.uint8).reshape(num, rows, cols)  # uint8
        imgs = imgs.astype(np.float32) / 255.0

    if flatten:
        imgs = imgs.reshape([num, -1])
    return imgs, new_labels
Example #2
Source File: core.py From neuropythy with GNU Affero General Public License v3.0 | 6 votes |
def save_csv(filename, dat, index=False, **kw):
    '''
    save_csv(filename, d) writes a pandas dataframe d to a CSV file with the given name. If pandas
      cannot be loaded, then an error is raised. If d is not a dataframe, to_dataframe() is called
      on it. All optional arguments are passed along to the pandas.DataFrame.to_csv function.
    '''
    import pandas
    from neuropythy.util import to_dataframe
    d = to_dataframe(dat)
    # Note: gzip.open is used for all three suffixes, so files named .bz2 or
    # .lzma are still gzip-compressed here.
    if any(filename.endswith(s) for s in ('.gz', '.bz2', '.lzma')):
        with gzip.open(filename, 'wt', newline='') as fl:
            d.to_csv(fl, index=index, **kw)
    else:
        with open(filename, 'wt') as fl:
            d.to_csv(fl, index=index, **kw)
    return dat
Example #3
Source File: ggtnn_train.py From gated-graph-transformer-network with MIT License | 6 votes |
def assemble_batch(story_fns, num_answer_words, format_spec):
    stories = []
    for sfn in story_fns:
        with gzip.open(sfn, 'rb') as f:
            cvtd_story, _, _, _ = pickle.load(f)
            stories.append(cvtd_story)
    sents, graphs, queries, answers = zip(*stories)
    cvtd_sents = np.array(sents, np.int32)
    cvtd_queries = np.array(queries, np.int32)
    max_ans_len = max(len(a) for a in answers)
    cvtd_answers = np.stack([convert_answer(answer, num_answer_words, format_spec, max_ans_len)
                             for answer in answers])
    num_new_nodes, new_node_strengths, new_node_ids, next_edges = zip(*graphs)
    num_new_nodes = np.stack(num_new_nodes)
    new_node_strengths = np.stack(new_node_strengths)
    new_node_ids = np.stack(new_node_ids)
    next_edges = np.stack(next_edges)
    return cvtd_sents, cvtd_queries, cvtd_answers, num_new_nodes, new_node_strengths, new_node_ids, next_edges
Example #4
Source File: input.py From DOTA_models with Apache License 2.0 | 6 votes |
def extract_mnist_data(filename, num_images, image_size, pixel_depth):
    """
    Extract the images into a 4D tensor [image index, y, x, channels].

    Values are rescaled from [0, 255] down to [-0.5, 0.5].
    """
    # if not os.path.exists(file):
    if not tf.gfile.Exists(filename + ".npy"):
        with gzip.open(filename) as bytestream:
            bytestream.read(16)
            buf = bytestream.read(image_size * image_size * num_images)
            data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
            data = (data - (pixel_depth / 2.0)) / pixel_depth
            data = data.reshape(num_images, image_size, image_size, 1)
            np.save(filename, data)
            return data
    else:
        with tf.gfile.Open(filename + ".npy", mode='r') as file_obj:
            return np.load(file_obj)
Example #5
Source File: input_data.py From IntroToDeepLearning with MIT License | 6 votes |
def extract_images(filename):
    """Extract the images into a 4D uint8 numpy array [index, y, x, depth]."""
    print('Extracting', filename)
    with gzip.open(filename) as bytestream:
        magic = _read32(bytestream)
        if magic != 2051:
            raise ValueError(
                'Invalid magic number %d in MNIST image file: %s' %
                (magic, filename))
        num_images = _read32(bytestream)
        rows = _read32(bytestream)
        cols = _read32(bytestream)
        buf = bytestream.read(rows * cols * num_images)
        data = numpy.frombuffer(buf, dtype=numpy.uint8)
        data = data.reshape(num_images, rows, cols, 1)
        return data
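The _read32 helper used above is defined elsewhere in the same source file and is not shown here. In the TensorFlow MNIST tutorial code this file appears to be derived from, it reads one big-endian unsigned 32-bit integer from the stream; a sketch along those lines (an assumption, not quoted from this project):

import numpy

def _read32(bytestream):
    # MNIST files store header fields as big-endian uint32 values.
    dt = numpy.dtype(numpy.uint32).newbyteorder('>')
    return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]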
Example #6
Source File: simplify_nq_data.py From natural-questions with Apache License 2.0 | 6 votes |
def main(_):
    """Runs `text_utils.simplify_nq_example` over all shards of a split.

    Prints simplified examples to a single gzipped file in the same directory
    as the input shards.
    """
    split = os.path.basename(FLAGS.data_dir)
    outpath = os.path.join(FLAGS.data_dir,
                           "simplified-nq-{}.jsonl.gz".format(split))
    with gzip.open(outpath, "wb") as fout:
        num_processed = 0
        start = time.time()
        for inpath in glob.glob(os.path.join(FLAGS.data_dir, "nq-*-??.jsonl.gz")):
            print("Processing {}".format(inpath))
            with gzip.open(inpath, "rb") as fin:
                for l in fin:
                    utf8_in = l.decode("utf8", "strict")
                    utf8_out = json.dumps(
                        text_utils.simplify_nq_example(json.loads(utf8_in))) + u"\n"
                    fout.write(utf8_out.encode("utf8"))
                    num_processed += 1
                    if not num_processed % 100:
                        print("Processed {} examples in {}.".format(
                            num_processed, time.time() - start))
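A side note on modes in this example: the output file is opened with "wb" and each line is encoded to UTF-8 by hand. Since gzip.open also accepts text modes, an equivalent write (a sketch with a stand-in record, not the project's code) can let the file object do the encoding:

import gzip
import json

record = {"example_id": 0}  # stand-in for one simplified example
with gzip.open("simplified.jsonl.gz", "wt", encoding="utf8") as fout:
    fout.write(json.dumps(record) + "\n")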
Example #7
Source File: input_helpers.py From deep-siamese-text-similarity with MIT License | 6 votes |
def loadW2V(self, emb_path, type="bin"):
    print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # this seems faster than gensim non-binary load
        for line in gzip.open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            self.pre_emb[st] = np.asarray(l[1:])
        num_keys = len(self.pre_emb)
    elif type == "text":  # elif, so "textgz" does not fall through to the binary loader below
        # this seems faster than gensim non-binary load
        for line in open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            self.pre_emb[st] = np.asarray(l[1:])
        num_keys = len(self.pre_emb)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    print("loaded word2vec len ", num_keys)
    gc.collect()
Example #8
Source File: log-parser.py From aws-waf-security-automations with Apache License 2.0 | 6 votes |
def write_output(bucket_name, key_name, output_key_name, outstanding_requesters):
    logging.getLogger().debug('[write_output] Start')
    try:
        current_data = '/tmp/' + key_name.split('/')[-1] + '_LOCAL.json'
        with open(current_data, 'w') as outfile:
            json.dump(outstanding_requesters, outfile)
        s3 = boto3.client('s3')
        s3.upload_file(current_data, bucket_name, output_key_name,
                       ExtraArgs={'ContentType': "application/json"})
        remove(current_data)
    except Exception as e:
        logging.getLogger().error("[write_output] \tError to write output file")
        logging.getLogger().error(e)
    logging.getLogger().debug('[write_output] End')
Example #9
Source File: input_helpers.py From deep-siamese-text-similarity with MIT License | 6 votes |
def dumpValidation(self, x1_text, x2_text, y, shuffled_index, dev_idx, i):
    print("dumping validation " + str(i))
    x1_shuffled = x1_text[shuffled_index]
    x2_shuffled = x2_text[shuffled_index]
    y_shuffled = y[shuffled_index]
    x1_dev = x1_shuffled[dev_idx:]
    x2_dev = x2_shuffled[dev_idx:]
    y_dev = y_shuffled[dev_idx:]
    del x1_shuffled
    del y_shuffled
    with open('validation.txt' + str(i), 'w') as f:
        for text1, text2, label in zip(x1_dev, x2_dev, y_dev):
            f.write(str(label) + "\t" + text1 + "\t" + text2 + "\n")
    del x1_dev
    del y_dev

# Data Preparation
# ==================================================
Example #10
Source File: input_helpers.py From deep-siamese-text-similarity with MIT License | 6 votes |
def getTsvData(self, filepath):
    print("Loading training data from " + filepath)
    x1 = []
    x2 = []
    y = []
    # positive samples from file
    for line in open(filepath):
        l = line.strip().split("\t")
        if len(l) < 2:
            continue
        if random() > 0.5:
            x1.append(l[0].lower())
            x2.append(l[1].lower())
        else:
            x1.append(l[1].lower())
            x2.append(l[0].lower())
        y.append(int(l[2]))
    return np.asarray(x1), np.asarray(x2), np.asarray(y)
Example #11
Source File: wmt_utils.py From DOTA_models with Apache License 2.0 | 6 votes |
def get_wmt_enfr_dev_set(directory):
    """Download the WMT en-fr training corpus to directory unless it's there."""
    dev_name = "newstest2013"
    dev_path = os.path.join(directory, dev_name)
    if not (tf.gfile.Exists(dev_path + ".fr") and tf.gfile.Exists(dev_path + ".en")):
        dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
        print("Extracting tgz file %s" % dev_file)
        with tarfile.open(dev_file, "r:gz") as dev_tar:
            fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
            en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
            fr_dev_file.name = dev_name + ".fr"  # Extract without "dev/" prefix.
            en_dev_file.name = dev_name + ".en"
            dev_tar.extract(fr_dev_file, directory)
            dev_tar.extract(en_dev_file, directory)
    return dev_path
Example #12
Source File: dataset_tool.py From disentangling_conditional_gans with MIT License | 6 votes |
def create_mnist(tfrecord_dir, mnist_dir):
    print('Loading MNIST from "%s"' % mnist_dir)
    import gzip
    # Each IDX file starts with a header (16 bytes for images: magic, count,
    # rows, cols; 8 bytes for labels: magic, count), skipped via offset=.
    with gzip.open(os.path.join(mnist_dir, 'train-images-idx3-ubyte.gz'), 'rb') as file:
        images = np.frombuffer(file.read(), np.uint8, offset=16)
    with gzip.open(os.path.join(mnist_dir, 'train-labels-idx1-ubyte.gz'), 'rb') as file:
        labels = np.frombuffer(file.read(), np.uint8, offset=8)
    images = images.reshape(-1, 1, 28, 28)
    images = np.pad(images, [(0,0), (0,0), (2,2), (2,2)], 'constant', constant_values=0)
    assert images.shape == (60000, 1, 32, 32) and images.dtype == np.uint8
    assert labels.shape == (60000,) and labels.dtype == np.uint8
    assert np.min(images) == 0 and np.max(images) == 255
    assert np.min(labels) == 0 and np.max(labels) == 9
    onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32)
    onehot[np.arange(labels.size), labels] = 1.0

    with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr:
        order = tfr.choose_shuffled_order()
        for idx in range(order.size):
            tfr.add_image(images[order[idx]])
        tfr.add_labels(onehot[order])

#----------------------------------------------------------------------------
Example #13
Source File: dataset_tool.py From disentangling_conditional_gans with MIT License | 6 votes |
def create_mnistrgb(tfrecord_dir, mnist_dir, num_images=1000000, random_seed=123):
    print('Loading MNIST from "%s"' % mnist_dir)
    import gzip
    with gzip.open(os.path.join(mnist_dir, 'train-images-idx3-ubyte.gz'), 'rb') as file:
        images = np.frombuffer(file.read(), np.uint8, offset=16)
    images = images.reshape(-1, 28, 28)
    images = np.pad(images, [(0,0), (2,2), (2,2)], 'constant', constant_values=0)
    assert images.shape == (60000, 32, 32) and images.dtype == np.uint8
    assert np.min(images) == 0 and np.max(images) == 255

    with TFRecordExporter(tfrecord_dir, num_images) as tfr:
        rnd = np.random.RandomState(random_seed)
        for idx in range(num_images):
            tfr.add_image(images[rnd.randint(images.shape[0], size=3)])

#----------------------------------------------------------------------------
Example #14
Source File: dataset_tool.py From disentangling_conditional_gans with MIT License | 6 votes |
def create_cifar100(tfrecord_dir, cifar100_dir):
    print('Loading CIFAR-100 from "%s"' % cifar100_dir)
    import pickle
    with open(os.path.join(cifar100_dir, 'train'), 'rb') as file:
        data = pickle.load(file, encoding='latin1')
    images = data['data'].reshape(-1, 3, 32, 32)
    labels = np.array(data['fine_labels'])
    assert images.shape == (50000, 3, 32, 32) and images.dtype == np.uint8
    assert labels.shape == (50000,) and labels.dtype == np.int32
    assert np.min(images) == 0 and np.max(images) == 255
    assert np.min(labels) == 0 and np.max(labels) == 99
    onehot = np.zeros((labels.size, np.max(labels) + 1), dtype=np.float32)
    onehot[np.arange(labels.size), labels] = 1.0

    with TFRecordExporter(tfrecord_dir, images.shape[0]) as tfr:
        order = tfr.choose_shuffled_order()
        for idx in range(order.size):
            tfr.add_image(images[order[idx]])
        tfr.add_labels(onehot[order])

#----------------------------------------------------------------------------
Example #15
Source File: dataset_tool.py From disentangling_conditional_gans with MIT License | 6 votes |
def create_celeba(tfrecord_dir, celeba_dir, cx=89, cy=121):
    print('Loading CelebA from "%s"' % celeba_dir)
    glob_pattern = os.path.join(celeba_dir, 'img_align_celeba_png', '*.png')
    image_filenames = sorted(glob.glob(glob_pattern))
    expected_images = 202599
    if len(image_filenames) != expected_images:
        error('Expected to find %d images' % expected_images)

    with TFRecordExporter(tfrecord_dir, len(image_filenames)) as tfr:
        order = tfr.choose_shuffled_order()
        for idx in range(order.size):
            img = np.asarray(PIL.Image.open(image_filenames[order[idx]]))
            assert img.shape == (218, 178, 3)
            img = img[cy - 64 : cy + 64, cx - 64 : cx + 64]
            img = img.transpose(2, 0, 1)  # HWC => CHW
            tfr.add_image(img)

#----------------------------------------------------------------------------
Example #16
Source File: download_and_convert_mnist.py From DOTA_models with Apache License 2.0 | 6 votes |
def _extract_labels(filename, num_labels):
    """Extract the labels into a vector of int64 label IDs.

    Args:
      filename: The path to an MNIST labels file.
      num_labels: The number of labels in the file.

    Returns:
      A numpy array of shape [number_of_labels]
    """
    print('Extracting labels from: ', filename)
    with gzip.open(filename) as bytestream:
        bytestream.read(8)
        buf = bytestream.read(1 * num_labels)
        labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
    return labels
Example #17
Source File: download_and_convert_mnist.py From DOTA_models with Apache License 2.0 | 6 votes |
def _extract_images(filename, num_images):
    """Extract the images into a numpy array.

    Args:
      filename: The path to an MNIST images file.
      num_images: The number of images in the file.

    Returns:
      A numpy array of shape [number_of_images, height, width, channels].
    """
    print('Extracting images from: ', filename)
    with gzip.open(filename) as bytestream:
        bytestream.read(16)
        buf = bytestream.read(
            _IMAGE_SIZE * _IMAGE_SIZE * num_images * _NUM_CHANNELS)
        data = np.frombuffer(buf, dtype=np.uint8)
        data = data.reshape(num_images, _IMAGE_SIZE, _IMAGE_SIZE, _NUM_CHANNELS)
    return data
Example #18
Source File: test_utils.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
@contextmanager  # from contextlib; needed for the "with" usage shown in the docstring
def discard_stderr():
    """
    Discards error output of a routine if invoked as:

        with discard_stderr():
            ...
    """
    with open(os.devnull, 'w') as bit_bucket:
        try:
            stderr_fileno = sys.stderr.fileno()
            old_stderr = os.dup(stderr_fileno)
            try:
                os.dup2(bit_bucket.fileno(), stderr_fileno)
                yield
            finally:
                os.dup2(old_stderr, stderr_fileno)
        except AttributeError:
            # On some systems stderr is not a real file descriptor but a
            # virtual pipeline that cannot be duplicated.
            yield
Example #19
Source File: datasets.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def _get_data(self):
    if any(not os.path.exists(path) or not check_sha1(path, sha1)
           for path, sha1 in ((os.path.join(self._root, name), sha1)
                              for name, sha1 in self._train_data + self._test_data)):
        namespace = 'gluon/dataset/' + self._namespace
        filename = download(_get_repo_file_url(namespace, self._archive_file[0]),
                            path=self._root,
                            sha1_hash=self._archive_file[1])
        with tarfile.open(filename) as tar:
            tar.extractall(self._root)

    if self._train:
        data_files = self._train_data
    else:
        data_files = self._test_data
    data, label = zip(*(self._read_batch(os.path.join(self._root, name))
                        for name, _ in data_files))
    data = np.concatenate(data)
    label = np.concatenate(label)

    self._data = nd.array(data, dtype=data.dtype)
    self._label = label
Example #20
Source File: create_joint_gs.py From CAMISIM with Apache License 2.0 | 6 votes |
def create_gsa_mapping(path, metadata, sample_name, shuffle):
    """
    Creates the binning gold standard/gsa mapping
    """
    to_genome = name_to_genome(metadata)
    gsa_path = os.path.join(path, "anonymous_gsa.fasta")
    count = 0
    if not os.path.exists(gsa_path):
        gsa_path = os.path.join(path, "anonymous_gsa.fasta.gz")  # if zipped
        with gzip.open(gsa_path, 'r') as gsa:
            for line in gsa:
                if line.startswith('>'):
                    count += 1
        with gzip.open(gsa_path, 'r') as gsa:
            gsa_temp = shuffle_anonymize(gsa, path, to_genome, metadata,
                                         sample_name, count, shuffle)
    else:
        with open(gsa_path, 'r') as gsa:
            for line in gsa:
                if line.startswith('>'):
                    count += 1
        with open(gsa_path, 'r') as gsa:
            gsa_temp = shuffle_anonymize(gsa, path, to_genome, metadata,
                                         sample_name, count, shuffle)
    os.rename(gsa_temp, gsa_path)
Example #21
Source File: get_genomes.py From CAMISIM with Apache License 2.0 | 6 votes |
def read_genomes_list(genomes_path, additional_file=None):
    genomes_map = {}
    total_genomes = 0
    if additional_file is not None:
        with open(additional_file, 'r') as add:
            for line in add:
                ncbi_id, sci_name, path, novelty = line.strip().split('\t')
                if ncbi_id in genomes_map:
                    genomes_map[ncbi_id][1].append(path)
                else:
                    genomes_map[ncbi_id] = (sci_name, [path], novelty)  # this might not be a http path
                total_genomes += 1
    with open(genomes_path, 'r') as genomes:
        for line in genomes:
            ncbi_id, sci_name, ftp = line.strip().split('\t')
            http = ftp.replace("ftp://", "http://")  # not using ftp address but http (proxies)
            if ncbi_id in genomes_map:
                genomes_map[ncbi_id][1].append(http)
            else:
                genomes_map[ncbi_id] = (sci_name, [http], 'known_strain')  # sci_name is always the same for same taxid (?)
            total_genomes += 1
    return genomes_map, total_genomes
Example #22
Source File: get_genomes.py From CAMISIM with Apache License 2.0 | 6 votes |
def download_genome(genome, out_path):
    genome_path = os.path.join(out_path, "genomes")
    out_name = genome.rstrip().split('/')[-1]
    http_address = os.path.join(genome, out_name + "_genomic.fna.gz")
    opened = urllib2.urlopen(http_address)
    out = os.path.join(genome_path, out_name + ".fa")
    tmp_out = os.path.join(genome_path, out_name + "tmp.fa")
    out_gz = out + ".gz"
    with open(out_gz, 'wb') as outF:
        outF.write(opened.read())
    gf = gzip.open(out_gz)
    new_out = open(tmp_out, 'wb')
    new_out.write(gf.read())
    gf.close()
    os.remove(out_gz)
    new_out.close()
    split_by_N(tmp_out, out)
    return out
Example #23
Source File: datasets.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def _get_data(self):
    if self._train:
        data, label = self._train_data, self._train_label
    else:
        data, label = self._test_data, self._test_label

    namespace = 'gluon/dataset/' + self._namespace
    data_file = download(_get_repo_file_url(namespace, data[0]),
                         path=self._root,
                         sha1_hash=data[1])
    label_file = download(_get_repo_file_url(namespace, label[0]),
                          path=self._root,
                          sha1_hash=label[1])

    with gzip.open(label_file, 'rb') as fin:
        struct.unpack(">II", fin.read(8))
        label = np.frombuffer(fin.read(), dtype=np.uint8).astype(np.int32)

    with gzip.open(data_file, 'rb') as fin:
        struct.unpack(">IIII", fin.read(16))
        data = np.frombuffer(fin.read(), dtype=np.uint8)
        data = data.reshape(len(label), 28, 28, 1)

    self._data = nd.array(data, dtype=data.dtype)
    self._label = label
Example #24
Source File: core.py From neuropythy with GNU Affero General Public License v3.0 | 6 votes |
def load_string(filename, to=None):
    '''
    load_string(filename) loads the given file as a string. The optional argument to can be set to
      'lines' to load a list of lines or to 'bytes' to load the data as a byte-string.
    '''
    if to is None: to = 'string'
    to = to.lower()
    if to in ['string', 'str', 'text', 'txt', 't', 's']:
        with open(filename, 'r') as fl:
            return fl.read()
    elif to in ['binary', 'bytes', 'b', 'raw']:
        with open(filename, 'rb') as fl:
            return fl.read()
    elif to in ['lines', 'l']:
        with open(filename, 'r') as fl:
            return fl.read().splitlines()
Example #25
Source File: core.py From neuropythy with GNU Affero General Public License v3.0 | 6 votes |
def load_json(filename, to='auto'):
    '''
    load_json(filename) yields the object represented by the json file or stream object filename.

    The optional argument to may be set to None to indicate that the JSON data should be returned
      verbatim rather than parsed by neuropythy's denormalize system.
    '''
    from neuropythy.util import denormalize as denorm
    if pimms.is_str(filename):
        try:
            with gzip.open(filename, 'rt') as fl:
                dat = json.load(fl)
        except Exception:
            with open(filename, 'rt') as fl:
                dat = json.load(fl)
    else:
        dat = json.load(filename)
        filename = '<stream>'
    if to is None:
        return dat
    elif to == 'auto':
        return denorm(dat)
    else:
        raise ValueError('unrecognized to option: %s' % to)
Example #26
Source File: core.py From neuropythy with GNU Affero General Public License v3.0 | 6 votes |
def save_json(filename, obj, normalize=True):
    '''
    save_json(filename, obj) writes the given object to the given filename (or stream) in a
      normalized JSON format.

    The optional argument normalize (default True) may be set to False to prevent the object from
      being run through neuropythy's normalize system.
    '''
    from neuropythy.util import normalize as norm
    dat = norm(obj) if normalize else obj
    if pimms.is_str(filename):
        jsonstr = json.dumps(dat)
        if any(filename.endswith(s) for s in ('.gz', '.bz2', '.lzma')):
            with gzip.open(filename, 'wt') as fl:
                fl.write(jsonstr)
        else:
            with open(filename, 'wt') as fl:
                fl.write(jsonstr)
    else:
        json.dump(dat, filename)
    return filename
Example #27
Source File: core.py From neuropythy with GNU Affero General Public License v3.0 | 6 votes |
def save_tsv(filename, dat, sep='\t', index=False, **kw):
    '''
    save_tsv(filename, d) writes a pandas dataframe d to a TSV file with the given name. If pandas
      cannot be loaded, then an error is raised. If d is not a dataframe, to_dataframe() is called
      on it. All optional arguments are passed along to the pandas.DataFrame.to_csv function.

    Note that this function is identical to save_csv() except that it has a default sep value of
      '\t' instead of ','.
    '''
    import pandas
    from neuropythy.util import to_dataframe
    d = to_dataframe(dat)
    if any(filename.endswith(s) for s in ('.gz', '.bz2', '.lzma')):
        with gzip.open(filename, 'wt', newline='') as fl:
            d.to_csv(fl, sep=sep, index=index, **kw)
    else:
        with open(filename, 'wt') as fl:
            d.to_csv(fl, sep=sep, index=index, **kw)
    return dat

# Nifti!
Example #28
Source File: app.py From svviz with MIT License | 6 votes |
def saveState(dataHub):
    import pickle as pickle
    import gzip
    pickle.dump(dataHub, gzip.open(dataHub.args.save_state, "wb"))
    logging.warn("^"*20 + " saving state to pickle and exiting " + "^"*20)
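svviz presumably restores this pickle elsewhere; as a generic counterpart (load_state and its argument are hypothetical names, not part of the project above), reading it back is a small wrapper around gzip.open and pickle.load:

import gzip
import pickle

def load_state(path):
    # Reverse of saveState above: read a gzip-compressed pickle back in.
    # Only unpickle files you trust; pickle can execute arbitrary code.
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)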
Example #29
Source File: capsulenet.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 5 votes |
def read_data(label_url, image_url):
    with gzip.open(download_data(label_url)) as flbl:
        magic, num = struct.unpack(">II", flbl.read(8))
        label = np.fromstring(flbl.read(), dtype=np.int8)
    with gzip.open(download_data(image_url), 'rb') as fimg:
        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols)
    return label, image
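One caveat about this last example: np.fromstring is deprecated for binary input in modern NumPy and emits a DeprecationWarning; np.frombuffer is the drop-in replacement for this read-then-parse pattern, as several of the examples above already demonstrate. The same function with that substitution (a sketch; download_data is the project's helper and is not shown here):

import gzip
import struct
import numpy as np

def read_data(label_url, image_url):
    # Identical to the example above, with np.frombuffer replacing np.fromstring.
    with gzip.open(download_data(label_url)) as flbl:
        magic, num = struct.unpack(">II", flbl.read(8))
        label = np.frombuffer(flbl.read(), dtype=np.int8)
    with gzip.open(download_data(image_url), 'rb') as fimg:
        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
        image = np.frombuffer(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols)
    return label, image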