# Python source code of data

import h5py
import numpy as np
from pkg_resources import parse_version


def check_version_compatibility():
    """Reject a known-bad combination of installed h5py and numpy versions.

    The combination rejected is h5py older than 2.7 together with
    numpy 1.13 or newer.

    Raises:
        EnvironmentError: if that combination of versions is installed.
    """
    h5py_too_old = parse_version(h5py.__version__) < parse_version("2.7")
    if h5py_too_old and parse_version(np.__version__) >= parse_version("1.13"):
        raise EnvironmentError("Incompatible h5py=={} and numpy=={} versions detected. \n"
                               "Array reading/decoding may not proceed as expected. \n"
                               "Please upgrade to the latest compatible versions"
                               "".format(h5py.__version__, np.__version__))


def _clean(value):
    """ Convert numpy numeric types to their python equivalents. """
    if isinstance(value, np.ndarray):
        if value.dtype.kind == 'S':
            return np.char.decode(value).tolist()
        else:
            return value.tolist()
    elif type(value).__module__ == np.__name__:
        # h5py==2.8.0 on windows sometimes fails to cast this from an np.float64 to a python.float
        # We have to let the user do this themselves, since casting here could be dangerous
        # https://github.com/h5py/h5py/issues/1051
        conversion = value.item()  # np.asscalar(value) was deprecated in v1.16
        if isinstance(conversion, bytes):
            conversion = conversion.decode()
        return conversion
    elif isinstance(value, bytes):
        return value.decode()
    else:
        return value


def _sanitize_data_for_writing(data):
    """Encode unicode strings as byte-strings so the data can be written to a dataset.

    Args:
        data: a python ``str``, a numpy array, or any other value.

    Returns:
        * ``str``: the encoded ``bytes``
        * numpy array of unicode strings: a copy cast to a byte-string dtype
        * structured numpy array with more than one field: a copy in which every
          unicode field has been cast to a byte-string field of the same length
        * anything else: ``data`` itself, unchanged

    Raises:
        EnvironmentError: from ``check_version_compatibility``.
        UnicodeError: if a structured array cannot be encoded and h5py is older than 2.7.
        ValueError, UnicodeEncodeError: re-raised as-is on newer h5py versions.
    """
    # To make the interface more user friendly we encode python strings as  byte-strings when writing datasets
    check_version_compatibility()
    if isinstance(data, str):
        # Plain python-strings can be encoded trivially
        return data.encode()
    if not isinstance(data, np.ndarray):
        return data

    if data.dtype.kind == 'U':
        # If the array is all of one type, unicode-string, we can encode with numpy.
        # The dtype kind code is compared directly (as the reading path does with 'S')
        # instead of going through the removed ``np.unicode`` alias.
        return data.astype('S')

    # NOTE(review): structured arrays with exactly one field are not converted by this
    # check - confirm whether that is intended.
    if len(data.dtype) > 1:
        # If the array is of mixed types we have to set the encoding column by column
        unicode_char_size = np.dtype('U1').itemsize
        encoded_dtypes = []
        for field_name in data.dtype.names:
            # fields[name] is (dtype, offset) or (dtype, offset, title); only the dtype is needed
            field_dtype = data.dtype.fields[field_name][0]
            if field_dtype.kind == 'U':
                str_len = field_dtype.itemsize // unicode_char_size
                field_dtype = np.dtype("|S{}".format(str_len))
            encoded_dtypes.append((field_name, field_dtype))
        try:
            return data.astype(encoded_dtypes)
        except (ValueError, UnicodeEncodeError):
            if parse_version(h5py.__version__) < parse_version("2.7"):
                raise UnicodeError("Cannot encode array with types: {}.\n"
                                   "There are known bugs in h5py<2.7 which yield non-deterministic results when decoding "
                                   "arrays with empty strings and additional bugs with compatibility between "
                                   "h5py<2.7 and numpy>=1.13 when decoding arrays with  mixed/padded data types.\n"
                                   "Please try upgrading to the latest h5py and numpy versions"
                                   "".format(encoded_dtypes))
            else:
                raise
    return data


def _sanitize_data_for_reading(data):
    """Decode byte-strings into unicode strings after reading a dataset.

    Args:
        data: an ``h5py.Dataset`` (its full contents are read with ``data[()]`` first),
            ``bytes``, a numpy array, or any other value.

    Returns:
        * ``bytes``: the decoded ``str``
        * numpy array of byte-strings: the result of ``np.char.decode``
        * structured numpy array with more than one field: a copy in which every
          byte-string field has been cast to a unicode field
        * anything else: the (possibly just-read) data, unchanged

    Raises:
        EnvironmentError: from ``check_version_compatibility``.
        UnicodeError: if a structured array cannot be decoded and h5py is older than 2.7.
        UnicodeDecodeError, SystemError: re-raised as-is on newer h5py versions.
    """
    # To make the interface more user friendly we decode byte-strings into unicode strings when reading datasets
    check_version_compatibility()
    if isinstance(data, h5py.Dataset):
        data = data[()]

    if isinstance(data, bytes):
        # Plain byte-strings can be decoded trivially
        return data.decode()
    if not isinstance(data, np.ndarray):
        return data

    if data.dtype.kind == 'S':
        # If the array is all of one type, byte-string, we can decode with numpy
        return np.char.decode(data)

    # NOTE(review): structured arrays with exactly one field are not converted by this
    # check - confirm whether that is intended.
    if len(data.dtype) > 1:
        # If the array is of mixed types we have to decode column by column
        decoded_dtypes = []
        for field_name in data.dtype.names:
            # fields[name] is (dtype, offset) or (dtype, offset, title); only the dtype is needed
            field_dtype = data.dtype.fields[field_name][0]
            if field_dtype.kind == 'S':
                # A byte-string field of n bytes becomes a unicode field of n characters
                field_dtype = np.dtype("<U{}".format(field_dtype.itemsize))
            decoded_dtypes.append((field_name, field_dtype))
        try:
            return data.astype(decoded_dtypes)
        except (UnicodeDecodeError, SystemError):
            # On h5py==2.6 we can't decode padded string-arrays properly - we should advise users to upgrade
            if parse_version(h5py.__version__) < parse_version("2.7"):
                raise UnicodeError("Cannot decode array with types: {}.\n"
                                   "There are known bugs in h5py<2.7 which yield non-deterministic results when decoding "
                                   "arrays with empty strings and additional bugs with compatibility between "
                                   "h5py<2.7 and numpy>=1.13 when decoding arrays with  mixed/padded data types.\n"
                                   "Please try upgrading to the latest h5py and numpy versions".format(decoded_dtypes))
            else:
                raise
    return data