Python h5py.special_dtype() Examples

The following are 30 code examples of h5py.special_dtype(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module h5py , or try the search function

Example #1

Source File: ilsvrc2010.py From attention-lvcsr with MIT License

6 votes

def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_total, 1), dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')

Example #2

Source File: million_song_dataset.py From implicit with MIT License

6 votes

def _hfd5_from_dataframe(data, track_info, outputfilename):
    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(np.float32),
                       (data['track'].cat.codes.copy(),
                        data['user'].cat.codes.copy()))).tocsr()

    with h5py.File(outputfilename, "w") as f:
        g = f.create_group('track_user_plays')
        g.create_dataset("data", data=plays.data)
        g.create_dataset("indptr", data=plays.indptr)
        g.create_dataset("indices", data=plays.indices)

        dt = h5py.special_dtype(vlen=str)
        dset = f.create_dataset('track', track_info.shape, dtype=dt)
        dset[:] = track_info

        user = list(data['user'].cat.categories)
        dset = f.create_dataset('user', (len(user),), dtype=dt)
        dset[:] = user

Example #3

Source File: sketchfab.py From implicit with MIT License

6 votes

def _hfd5_from_dataframe(data, outputfilename):
    items = data['mid'].cat.codes.copy()
    users = data['uid'].cat.codes.copy()
    values = np.ones(len(items)).astype(np.float32)

    # create a sparse matrix of all the item/users/likes
    likes = coo_matrix((values, (items, users))).astype(np.float32).tocsr()

    with h5py.File(outputfilename, "w") as f:
        g = f.create_group('item_user_likes')
        g.create_dataset("data", data=likes.data)
        g.create_dataset("indptr", data=likes.indptr)
        g.create_dataset("indices", data=likes.indices)

        dt = h5py.special_dtype(vlen=str)
        item = list(data['mid'].cat.categories)
        dset = f.create_dataset('item', (len(item),), dtype=dt)
        dset[:] = item

        user = list(data['uid'].cat.categories)
        dset = f.create_dataset('user', (len(user),), dtype=dt)
        dset[:] = user

Example #4

Source File: lastfm.py From implicit with MIT License

6 votes

def _hfd5_from_dataframe(data, outputfilename):
    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(np.float32),
                       (data['artist'].cat.codes.copy(),
                        data['user'].cat.codes.copy()))).tocsr()

    with h5py.File(outputfilename, "w") as f:
        g = f.create_group('artist_user_plays')
        g.create_dataset("data", data=plays.data)
        g.create_dataset("indptr", data=plays.indptr)
        g.create_dataset("indices", data=plays.indices)

        dt = h5py.special_dtype(vlen=str)
        artist = list(data['artist'].cat.categories)
        dset = f.create_dataset('artist', (len(artist),), dtype=dt)
        dset[:] = artist

        user = list(data['user'].cat.categories)
        dset = f.create_dataset('user', (len(user),), dtype=dt)
        dset[:] = user

Example #5

Source File: h5_test.py From keras-image-segmentation with MIT License

6 votes

def write_data(h5py_file, mode, x_paths, y_paths):
    num_data = len(x_paths)

    uint8_dt = h5py.special_dtype(vlen=np.uint8)
    string_dt = h5py.special_dtype(vlen=str)

    group = h5py_file.create_group(mode)
    h5_name = group.create_dataset('name', shape=(num_data,), dtype=string_dt)
    h5_image = group.create_dataset('image', shape=(num_data,), dtype=uint8_dt)
    h5_label = group.create_dataset('label', shape=(num_data,), dtype=uint8_dt)

    h5_image.attrs['size'] = [256,512,3]
    h5_label.attrs['size'] = [256,512,1]

    for i in range(num_data):
        x_img = cv2.imread(x_paths[i], 1)
        y_img = cv2.imread(y_paths[i], 0)
        x_img = cv2.resize(x_img, None, fx=0.25, fy=0.25, interpolation=cv2.INTER_LINEAR)
        y_img = cv2.resize(y_img, None, fx=0.25, fy=0.25, interpolation=cv2.INTER_NEAREST)

        h5_image[i] = x_img.flatten()
        h5_label[i] = y_img.flatten()
        h5_name[i] = os.path.basename(x_paths[i])

        # break

Example #6

Source File: pyMtsf.py From PySimulator with GNU Lesser General Public License v3.0

6 votes

def WriteUnits(self):
        if len(self.units) == 0:
            return

        #maxLenTypeName = self._getMaxLength([x.name for x in self.units])
        numpyDataType = numpy.dtype({'names': ['name', 'factor',
                                              'offset', 'mode'],
                               'formats': [h5py.special_dtype(vlen=unicode),
                                          'double',
                                          'double',
                                          h5py.special_dtype(enum=(numpy.uint8, {'BaseUnit':0, 'Unit':1, 'DefaultDisplayUnit':2}))]})  # 'uint8']})

        dataset = self.description.create_dataset('Units', (len(self.units), 1), dtype=numpyDataType, maxshape=(len(self.units), 1), compression='gzip')
        allData = []
        for unit in self.units:
            allData.append((unit.name, unit.factor, unit.offset, unit.mode))
        dataset[:, 0] = allData

Example #7

Source File: movielens.py From implicit with MIT License

6 votes

def _hfd5_from_dataframe(ratings, movies, outputfilename):
    # transform ratings dataframe into a sparse matrix
    m = coo_matrix((ratings['rating'].astype(np.float32),
                   (ratings['movieId'], ratings['userId']))).tocsr()

    with h5py.File(outputfilename, "w") as f:
        # write out the ratings matrix
        g = f.create_group('movie_user_ratings')
        g.create_dataset("data", data=m.data)
        g.create_dataset("indptr", data=m.indptr)
        g.create_dataset("indices", data=m.indices)

        # write out the titles as a numpy array
        titles = np.empty(shape=(movies.movieId.max()+1,), dtype=np.object)
        titles[movies.movieId] = movies.title
        dt = h5py.special_dtype(vlen=str)
        dset = f.create_dataset('movie', (len(titles),), dtype=dt)
        dset[:] = titles

Example #8

Source File: data_generator.py From GroundedTranslation with BSD 3-Clause "New" or "Revised" License

6 votes

def set_predicted_description(self, split, data_key, sentence):
        '''
        Set the predicted sentence tokens in the data_key group,
        creating the group if necessary, or erasing the current value if
        necessary.
        '''

        if self.openmode != "r+":
            # forcefully quit when trying to write to a read-only file
            raise RuntimeError("Dataset is read-only, try again with --h5_writable")

        dataset_key = 'predicted_description'

        try:
            predicted_text = self.dataset[split][data_key].create_dataset(dataset_key, (1,), dtype=h5py.special_dtype(vlen=unicode))
        except RuntimeError:
            # the dataset already exists, erase it and create an empty space
            del self.dataset[split][data_key][dataset_key]
            predicted_text = self.dataset[split][data_key].create_dataset(dataset_key, (1,), dtype=h5py.special_dtype(vlen=unicode))

        predicted_text[0] = " ".join([x for x in sentence])

Example #9

Source File: vectors.py From text_embedding with MIT License

6 votes

def text2hdf5(textfile, hdf5file, **kwargs):
  '''converts word embeddings file from text to HDF5 format
  Args:
      textfile: word embeddings file in format "word float ... float\n"
      hdf5file: output file ; will have keys 'words' and 'vectors'
      kwargs: passed to load
  Returns:
      None
  '''

  words, vectors = zip(*load(textfile, **kwargs))
  f = h5py.File(hdf5file)
  f.create_dataset('words', (len(words),), dtype=h5py.special_dtype(vlen=str))
  for i, word in enumerate(words):
      f['words'][i] = word
  f.create_dataset('vectors', data=np.vstack(vectors))
  f.close()

Example #10

Source File: hdf5.py From pyPESTO with BSD 3-Clause "New" or "Revised" License

6 votes

def write_string_array(f: h5py.Group,
                       path: str,
                       strings: Collection) -> None:
    """
    Write string array to hdf5

    Parameters
    -------------
    f:
        h5py.Group where dataset should be created
    path:
        path of the dataset to create
    strings:
        list of strings to be written to f
    """
    dt = h5py.special_dtype(vlen=str)
    dset = f.create_dataset(path, (len(strings),), dtype=dt)
    dset[:] = [s.encode('utf8') for s in strings]

Example #11

Source File: ilsvrc2012.py From fuel with MIT License

6 votes

def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    n_labeled = n_train + n_valid
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_labeled, 1),
                             dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')

Example #12

Source File: ilsvrc2010.py From fuel with MIT License

6 votes

def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_total, 1), dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')

Example #13

Source File: hdf5_dataset_writer.py From calamari with Apache License 2.0

6 votes

def finish_chunck(self):
        if len(self.text) == 0:
            return

        codec = self.compute_codec()

        filename = "{}_{:03d}{}".format(self.output_filename, self.current_chunk, DataSetType.gt_extension(DataSetType.HDF5))
        self.files.append(filename)
        file = h5py.File(filename, 'w')
        dti32 = h5py.special_dtype(vlen=np.dtype('int32'))
        dtui8 = h5py.special_dtype(vlen=np.dtype('uint8'))
        file.create_dataset('transcripts', (len(self.text),), dtype=dti32, compression='gzip')
        file.create_dataset('images_dims', data=[d.shape for d in self.data], dtype=int)
        file.create_dataset('images', (len(self.text),), dtype=dtui8, compression='gzip')
        file.create_dataset('codec', data=list(map(ord, codec)))
        file['transcripts'][...] = [list(map(codec.index, d)) for d in self.text]
        file['images'][...] = [d.reshape(-1) for d in self.data]
        file.close()

        self.current_chunk += 1
        self.data = []
        self.text = []

Example #14

Source File: test_datatype.py From GraphicDesignPatternByPython with MIT License

6 votes

def test_compound_vlen(self):
        vidt = h5py.special_dtype(vlen=np.uint8)
        eidt = h5py.special_dtype(enum=(np.uint8, {'OFF': 0, 'ON': 1}))

        for np_align in (False, True):
            dt = np.dtype([
                ('a', eidt),
                ('foo', vidt),
                ('bar', vidt),
                ('switch', eidt)], align=np_align)
            np_offsets = [dt.fields[i][1] for i in dt.names]

            for logical in (False, True):
                if logical and np_align:
                    # Vlen types have different size in the numpy struct
                    self.assertRaises(TypeError, h5py.h5t.py_create, dt,
                            logical=logical)
                else:
                    ht = h5py.h5t.py_create(dt, logical=logical)
                    offsets = [ht.get_member_offset(i)
                               for i in range(ht.get_nmembers())]
                    if np_align:
                        self.assertEqual(np_offsets, offsets)

Example #15

Source File: test_datatype.py From GraphicDesignPatternByPython with MIT License

6 votes

def test_vlen_enum(self):
        fname = self.mktemp()
        arr1 = [[1],[1,2]]
        dt1 = h5py.special_dtype(vlen=h5py.special_dtype(
            enum=('i', dict(foo=1, bar=2))))

        with h5py.File(fname,'w') as f:
            df1 = f.create_dataset('test', (len(arr1),), dtype=dt1)
            df1[:] = np.array(arr1)

        with h5py.File(fname,'r') as f:
            df2  = f['test']
            dt2  = df2.dtype
            arr2 = [e.tolist() for e in df2[:]]

        self.assertEqual(arr1, arr2)
        self.assertEqual(h5py.check_dtype(enum=h5py.check_dtype(vlen=dt1)),
                         h5py.check_dtype(enum=h5py.check_dtype(vlen=dt2)))

Example #16

Source File: test_datatype.py From GraphicDesignPatternByPython with MIT License

6 votes

def test_compound_vlen_enum(self):
        eidt = h5py.special_dtype(enum=(np.uint8, {'OFF': 0, 'ON': 1}))
        vidt = h5py.special_dtype(vlen=np.uint8)
        def a(items):
            return np.array(items, dtype=np.uint8)

        f = self.f

        dt_vve = np.dtype([
            ('foo', vidt),
            ('bar', vidt),
            ('switch', eidt)])
        vve = f.create_dataset('dt_vve', shape=(2,), dtype=dt_vve)
        data = np.array([(a([1,2,3]), a([1,2]),   1),
                         (a([]),      a([2,4,6]), 0),],
                         dtype=dt_vve)
        vve[:] = data
        actual = vve[:]
        self.assertVlenArrayEqual(data['foo'], actual['foo'])
        self.assertVlenArrayEqual(data['bar'], actual['bar'])
        self.assertArrayEqual(data['switch'], actual['switch'])

Example #17

Source File: _utils.py From PyINT with GNU General Public License v3.0

6 votes

def write_h5(datasetDict, out_file, metadata=None, ref_file=None, compression=None):

    if os.path.isfile(out_file):
        print('delete exsited file: {}'.format(out_file))
        os.remove(out_file)

    print('create HDF5 file: {} with w mode'.format(out_file))
    dt = h5py.special_dtype(vlen=np.dtype('float64'))

    
    with h5py.File(out_file, 'w') as f:
        for dsName in datasetDict.keys():
            data = datasetDict[dsName]
            ds = f.create_dataset(dsName,
                              data=data,
                              compression=compression)
        
        for key, value in metadata.items():
            f.attrs[key] = str(value)
            #print(key + ': ' +  value)
    print('finished writing to {}'.format(out_file))
        
    return out_file 
    
######################################################################

Example #18

Source File: h5f.py From costar_plan with Apache License 2.0

6 votes

def write(self, example, filename, image_types=[]):
        '''
        Write an example out to disk.

        status: success, failure or error.failure
        '''
        filename = os.path.join(self.name, filename)
        f = h5f.File(filename, 'w')
        if image_types != []:
            dt = h5f.special_dtype(vlen=bytes)
            for (img_type_str, img_format_str) in image_types:
                f.create_dataset("type_" + img_type_str, data=[img_format_str])
        for key, value in example.items():
            if self.verbose > 0:
                print('H5fDataset writing key: ' + str(key))
            f.create_dataset(key, data=value)
        f.close()

Example #19

Source File: h5ad.py From anndata with BSD 3-Clause "New" or "Revised" License

6 votes

def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
    # group here is an h5py type, otherwise categoricals won’t write
    if series.dtype == object:  # Assuming it’s string
        group.create_dataset(
            key,
            data=series.values,
            dtype=h5py.special_dtype(vlen=str),
            **dataset_kwargs,
        )
    elif is_categorical_dtype(series):
        # This should work for categorical Index and Series
        categorical: pd.Categorical = series.values
        categories: np.ndarray = categorical.categories.values
        codes: np.ndarray = categorical.codes
        category_key = f"__categories/{key}"

        write_array(group, category_key, categories, dataset_kwargs=dataset_kwargs)
        write_array(group, key, codes, dataset_kwargs=dataset_kwargs)

        group[key].attrs["categories"] = group[category_key].ref
        group[category_key].attrs["ordered"] = categorical.ordered
    else:
        group[key] = series.values

Example #20

Source File: legacyapi.py From h5netcdf with BSD 3-Clause "New" or "Revised" License

6 votes

def createVariable(self, varname, datatype, dimensions=(), zlib=False,
                       complevel=4, shuffle=True, fletcher32=False,
                       chunksizes=None, fill_value=None):
        if len(dimensions) == 0:  # it's a scalar
            # rip off chunk and filter options for consistency with netCDF4-python

            chunksizes = None
            zlib = False
            fletcher32 = False
            shuffle = False

        if datatype is str:
            datatype = h5py.special_dtype(vlen=unicode)

        kwds = {}
        if zlib:
            # only add compression related keyword arguments if relevant (h5py
            # chokes otherwise)
            kwds['compression'] = 'gzip'
            kwds['compression_opts'] = complevel
            kwds['shuffle'] = shuffle

        return super(Group, self).create_variable(
            varname, dimensions, dtype=datatype, fletcher32=fletcher32,
            chunks=chunksizes, fillvalue=fill_value, **kwds)

Example #21

Source File: test_dataset.py From GraphicDesignPatternByPython with MIT License

6 votes

def test_convert(self):
        dt = h5py.special_dtype(vlen=int)
        ds = self.f.create_dataset('vlen', (3,), dtype=dt)
        ds[0] = np.array([1.4, 1.2])
        ds[1] = np.array([1.2])
        ds[2] = [1.2, 2, 3]
        self.assertArrayEqual(ds[0], np.array([1, 1]))
        self.assertArrayEqual(ds[1], np.array([1]))
        self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
        ds[0:2] = np.array([[0.1, 1.1, 2.1, 3.1, 4], np.arange(4)])
        self.assertArrayEqual(ds[0], np.arange(5))
        self.assertArrayEqual(ds[1], np.arange(4))
        ds[0:2] = np.array([np.array([0.1, 1.2, 2.2]),
                            np.array([0.2, 1.2, 2.2])])
        self.assertArrayEqual(ds[0], np.arange(3))
        self.assertArrayEqual(ds[1], np.arange(3))

Example #22

Source File: test_dataset.py From GraphicDesignPatternByPython with MIT License

6 votes

def test_int(self):
        dt = h5py.special_dtype(vlen=int)
        ds = self.f.create_dataset('vlen', (4,), dtype=dt)
        ds[0] = np.arange(3)
        ds[1] = np.arange(0)
        ds[2] = [1, 2, 3]
        ds[3] = np.arange(1)
        self.assertArrayEqual(ds[0], np.arange(3))
        self.assertArrayEqual(ds[1], np.arange(0))
        self.assertArrayEqual(ds[2], np.array([1, 2, 3]))
        self.assertArrayEqual(ds[1], np.arange(0))
        ds[0:2] = np.array([np.arange(5), np.arange(4)])
        self.assertArrayEqual(ds[0], np.arange(5))
        self.assertArrayEqual(ds[1], np.arange(4))
        ds[0:2] = np.array([np.arange(3), np.arange(3)])
        self.assertArrayEqual(ds[0], np.arange(3))
        self.assertArrayEqual(ds[1], np.arange(3))

Example #23

Source File: utils.py From acerta-abide with GNU General Public License v2.0

5 votes

def hdf5_handler(filename, mode="r"):
    h5py.File(filename, "a").close()
    propfaid = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
    settings = list(propfaid.get_cache())
    settings[1] = 0
    settings[2] = 0
    propfaid.set_cache(*settings)
    with contextlib.closing(h5py.h5f.open(filename, fapl=propfaid)) as fid:
        f = h5py.File(fid, mode)
        # f.attrs.create(dtype=h5py.special_dtype(vlen=str)) 
        return f

Example #24

Source File: test_dataset.py From keras-lambda with MIT License

5 votes

def test_vlen_bytes(self):
        """ Vlen bytes dataset maps to vlen ascii in the file """
        dt = h5py.special_dtype(vlen=bytes)
        ds = self.f.create_dataset('x', (100,), dtype=dt)
        tid = ds.id.get_type()
        self.assertEqual(type(tid), h5py.h5t.TypeStringID)
        self.assertEqual(tid.get_cset(), h5py.h5t.CSET_ASCII)

Example #25

Source File: global_attribute_manager.py From loompy with BSD 2-Clause "Simplified" License

5 votes

def __setattr__(self, name: str, val: Any) -> None:
		if name.startswith("!"):
			super(GlobalAttributeManager, self).__setattr__(name[1:], val)
		elif "/" in name:
			raise KeyError("Attribute name cannot contain slash (/)")
		else:
			if self.f is not None:
				if loompy.compare_loom_spec_version(self.f, "3.0.0") < 0 and "attrs" not in self.f["/"]:
					normalized = loompy.normalize_attr_values(val, False)
					self.f.attrs[name] = normalized
					self.f.flush()
					val = self.f.attrs[name]
					# Read it back in to ensure it's synced and normalized
					normalized = loompy.materialize_attr_values(val)
					self.__dict__["storage"][name] = normalized
				else:
					normalized = loompy.normalize_attr_values(val, True)
					if name in self.f["attrs"]:
						del self.f["attrs"][name]
					if not np.isscalar(normalized) and normalized.dtype == np.object_:
						self.ds._file.create_dataset("attrs/" + name, data=normalized, dtype=h5py.special_dtype(vlen=str))
					else:
						self.f["attrs"][name] = normalized
					self.f.flush()
					val = self.f["attrs"][name][()]
					# Read it back in to ensure it's synced and normalized
					normalized = loompy.materialize_attr_values(val)
					self.__dict__["storage"][name] = normalized

Example #26

Source File: datasets.py From pysaliency with MIT License

5 votes

def to_hdf5(self, target):
        """ Write FileStimuli to hdf5 file or hdf5 group
        """

        target.attrs['type'] = np.string_('FileStimuli')
        target.attrs['version'] = np.string_('2.1')

        import h5py
        # make sure everything is unicode

        hdf5_filename = target.file.filename
        hdf5_directory = os.path.dirname(hdf5_filename)

        relative_filenames = [os.path.relpath(filename, hdf5_directory) for filename in self.filenames]
        decoded_filenames = [decode_string(filename) for filename in relative_filenames]
        encoded_filenames = [filename.encode('utf8') for filename in decoded_filenames]

        target.create_dataset(
            'filenames',
            data=np.array(encoded_filenames),
            dtype=h5py.special_dtype(vlen=str)
        )

        shape_dataset = target.create_dataset(
            'shapes',
            (len(self), ),
            dtype=h5py.special_dtype(vlen=np.dtype('int64'))
        )

        for n, shape in enumerate(self.shapes):
            shape_dataset[n] = np.array(shape)

        for attribute_name, attribute_value in self.attributes.items():
            target.create_dataset(attribute_name, data=attribute_value)
        target.attrs['__attributes__'] = np.string_(json.dumps(self.__attributes__))

        target.attrs['size'] = len(self)

Example #27

Source File: hdf5_dataset_writer.py From aiexamples with Apache License 2.0

5 votes

def store_class_labels(self, class_labels):
    dt = h5py.special_dtype(vlen=str)
    labelset = self.db.create_dataset("label_names", (len(class_labels),), dtype=dt)
    labelset[:] = class_labels

Example #28

Source File: lm_utils.py From espnet with Apache License 2.0

5 votes

def load_dataset(path, label_dict, outdir=None):
    """Load and save HDF5 that contains a dataset and stats for LM

    Args:
        path (str): The path of an input text dataset file
        label_dict (dict[str, int]):
            dictionary that maps token label string to its ID number
        outdir (str): The path of an output dir

    Returns:
        tuple[list[np.ndarray], int, int]: Tuple of
            token IDs in np.int32 converted by `read_tokens`
            the number of tokens by `count_tokens`,
            and the number of OOVs by `count_tokens`
    """
    if outdir is not None:
        os.makedirs(outdir, exist_ok=True)
        filename = outdir + "/" + os.path.basename(path) + ".h5"
        if os.path.exists(filename):
            logging.info(f"loading binary dataset: {filename}")
            f = h5py.File(filename, "r")
            return f["data"][:], f["n_tokens"][()], f["n_oovs"][()]
    else:
        logging.info("skip dump/load HDF5 because the output dir is not specified")
    logging.info(f"reading text dataset: {path}")
    ret = read_tokens(path, label_dict)
    n_tokens, n_oovs = count_tokens(ret, label_dict["<unk>"])
    if outdir is not None:
        logging.info(f"saving binary dataset: {filename}")
        with h5py.File(filename, "w") as f:
            # http://docs.h5py.org/en/stable/special.html#arbitrary-vlen-data
            data = f.create_dataset(
                "data", (len(ret),), dtype=h5py.special_dtype(vlen=np.int32)
            )
            data[:] = ret
            f["n_tokens"] = n_tokens
            f["n_oovs"] = n_oovs
    return ret, n_tokens, n_oovs

Example #29

Source File: dbfun_lookuptable.py From ABXpy with MIT License

5 votes

def get_dtype(data):
    str_dtype = h5py.special_dtype(vlen=unicode)
    # allow for the use of strings
    if isinstance(data[0], str):
        dtype = str_dtype
    # could add some checks that the dtype is one of those supported by h5 ?
    elif hasattr(data, 'dtype'):
        dtype = data.dtype
    else:
        dtype = numpy.array(data).dtype
    return dtype


# item_size given in bytes, size_in_mem given in kilobytes

Example #30

Source File: embed.py From triplet-reid-pytorch with MIT License

5 votes

def write_to_h5(output_file, model, endpoints, num_augmentations, dataloader, dataset, keys=["emb"]):
    """
    Writes model to h5
    """
    print(len(dataloader), len(dataset))

    print("Model dimension is {}".format(model.module.dim))
    if len(keys) == 0:
        raise RuntimeError("Plase specify at least one key that should be written to file.")

    with h5py.File(output_file) as f_out:
        # Dataparallel class!
        datasets = {}
        for key in keys:
            datasets[key] = f_out.create_dataset(key, shape=(len(dataset), num_augmentations)                                      + model.module.dimensions[key], dtype=np.float32)
        for key in dataset.header:
            datasets[key] = f_out.create_dataset(
                    key,
                    shape=(len(dataset),),
                    dtype=h5py.special_dtype(vlen=str))
        start_idx = 0

        for endpoints, rows in run_forward_pass(dataloader, model, endpoints):
            # TODO this will not work if for some reason some endpoints are shorter than others
            for key in keys:
                end_idx = start_idx + len(endpoints[key])
                datasets[key][start_idx:end_idx] = endpoints[key]
            for key, values in rows.items():
                end_idx = start_idx + len(values)
                datasets[key][start_idx:end_idx] = np.asarray(values)
            start_idx = end_idx
    return output_file