####################################################################################################
# neuropythy/datasets/core.py
# Implementation of builtin datasets for Neuropythy
# by Noah C. Benson

import os, six, shutil, tempfile, atexit, pimms
import numpy as np

from ..util       import (config, to_credentials, ObjectWithMetaData)
from ..freesurfer import subject as freesurfer_subject

# We declare a configuration variable, data_cache_root -- where to put the data that is downloaded.
# If this is None / unset, then we'll use a temporary directory and auto-delete it on exit.
config.declare_dir('data_cache_root')

@pimms.immutable
class Dataset(ObjectWithMetaData):
    '''
    The Dataset class is a simple immutable class that should be implemented by all neuropythy
    datasets. The design is such that neuropythy.data[name] should always (lazily) yield a Dataset
    object specific to the dataset given by name, if it exists and can be loaded.

    One reason to require (by convention) that all datasets are distinct classes is that it should
    thus be easy to evaluate help(ny.data[name]) to see help on the given dataset. If you overload
    this class, be sure to overload the documentation.
    '''
    def __init__(self, name, meta_data=None, custom_directory=None, cache_required=True,
                 create_directories=True, create_mode=0o755):
        ObjectWithMetaData.__init__(self, meta_data)
        self.custom_directory = custom_directory
        self.name = name
        self.create_directories = create_directories
        self.create_mode = create_mode
        self.cache_required = cache_required
    def __repr__(self):
        return self.repr
    @pimms.value
    def repr(name):
        '''
        dataset.repr is the representation string used for the given dataset.
        '''
        return ("Dataset('%s')" % name) if pimms.is_str(name) else ("Dataset%s" % (name,))
    @staticmethod
    def to_name(nm):
        '''
        Dataset.to_name(name) yields a valid dataset name equivalent to the given name or raises
        an error if name is not valid. In order to be valid, a name must be either a string or a
        tuple of strings and numbers whose first element is a string.
        '''
        if pimms.is_str(nm): return nm
        if not pimms.is_vector(nm): raise ValueError('name must be a string or tuple')
        if len(nm) < 1: raise ValueError('names that are tuples must have at least one element')
        # note that it is the first element of the tuple, not the tuple itself, that must be a str
        if not pimms.is_str(nm[0]):
            raise ValueError('names that are tuples must begin with a string')
        if not all(pimms.is_str(x) or pimms.is_number(x) for x in nm):
            raise ValueError('dataset names that are tuples must contain only strings and numbers')
        return tuple(nm)
    @pimms.param
    def custom_directory(d):
        '''
        dataset.custom_directory is None if no custom directory was provided for the given
        dataset; otherwise it is the provided custom directory.
        '''
        if d is None: return None
        if not pimms.is_str(d): raise ValueError('custom_directory must be a string')
        else: return d
    @pimms.param
    def create_directories(c):
        '''
        dataset.create_directories is True if the dataset was instructed to create its cache
        directory, should it be found to not exist, and is otherwise False.
        '''
        return bool(c)
    @pimms.param
    def create_mode(c):
        '''
        dataset.create_mode is the octal permission mode used to create the cache directory for
        the given dataset, if the dataset had to create its directory at all.
        '''
        return c
    @pimms.param
    def name(nm):
        '''
        dataset.name is either a string or a tuple of strings and numbers that identifies the
        given dataset. If dataset.name is a tuple, then the first element must be a string.
        '''
        return Dataset.to_name(nm)
    @pimms.param
    def cache_required(cr):
        '''
        dataset.cache_required is True if the dataset requires a cache directory and False
        otherwise.
        '''
        return cr
    @pimms.value
    def cache_root(custom_directory):
        '''
        dataset.cache_root is the root directory in which the given dataset has been cached.
        '''
        if custom_directory is not None: return None
        elif config['data_cache_root'] is None:
            # we create a data-cache in a temporary directory
            path = tempfile.mkdtemp(prefix='npythy_data_cache_')
            if not os.path.isdir(path):
                raise ValueError('Could not find or create cache directory')
            config['data_cache_root'] = path
            atexit.register(shutil.rmtree, path)
        return config['data_cache_root']
    @pimms.value
    def cache_directory(cache_root, name, custom_directory):
        '''
        dataset.cache_directory is the directory in which the given dataset is cached.
        '''
        if custom_directory is not None: return custom_directory
        return os.path.join(cache_root,
                            (name    if pimms.is_str(name) else
                             name[0] if len(name) == 1     else
                             '%s_%x' % (name[0], hash(name[1:]))))
    @pimms.require
    def ensure_cache_directory(cache_directory, create_directories, create_mode, cache_required):
        '''
        ensure_cache_directory requires that a dataset's cache directory exists and raises an
        error if it cannot be found.
        '''
        if not cache_required: return True
        if os.path.isdir(cache_directory): return True
        if not create_directories:
            raise ValueError('dataset cache directory not found: %s' % (cache_directory,))
        os.makedirs(os.path.abspath(cache_directory), create_mode)
        return True
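
# Example (sketch): a particular dataset is typically declared by subclassing Dataset and adding
# lazily-computed values; the ExampleDataset class and its example_path value below are
# hypothetical and included for illustration only -- they are not part of this module.
#
#     @pimms.immutable
#     class ExampleDataset(Dataset):
#         '''ExampleDataset is a hypothetical dataset illustrating the Dataset interface.'''
#         def __init__(self, meta_data=None, create_directories=True):
#             Dataset.__init__(self, 'example', meta_data=meta_data,
#                              create_directories=create_directories)
#         @pimms.value
#         def example_path(cache_directory):
#             # lazily-computed values may depend on the dataset's cache directory
#             return os.path.join(cache_directory, 'example_file.txt')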

# We create the dataset repository: this is just a lazy map; to add a dataset to it, use the
# function add_dataset(), immediately below.
data = pimms.lazy_map({})
def add_dataset(dset, fn=None):
    '''
    add_dataset(dset) adds the given dataset to the neuropythy.data map.
    add_dataset(name, fn) adds a dataset with the given name; fn must be a function of zero
      arguments that yields the dataset.

    add_dataset always yields None or raises an error.
    '''
    global data
    if fn is None:
        if not isinstance(dset, Dataset):
            raise ValueError('Cannot add non-Dataset object to neuropythy datasets')
        nm = dset.name
        data = data.set(nm, dset)
    else:
        nm = Dataset.to_name(dset)
        def _load_dset():
            x = fn()
            if not isinstance(x, Dataset):
                raise ValueError('Loader for dataset %s failed to return a dataset' % (nm,))
            return x
        # in a pimms lazy map, a value that is a function of zero arguments is computed (once) on
        # first access, so _load_dset does not run until the dataset is actually requested
        data = data.set(nm, _load_dset)
    # we want to update neuropythy.data also; this is a bit of a hack, but should work fine
    import neuropythy
    neuropythy.data = data
    return None
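
# Example (sketch): a dataset module would typically register its dataset with a zero-argument
# loader so that nothing is constructed or downloaded until the dataset is first requested; the
# name and loader below are hypothetical.
#
#     def _load_example():
#         return ExampleDataset()
#     add_dataset('example', _load_example)
#
# After registration, neuropythy.data['example'] yields the ExampleDataset object, invoking
# _load_example on first access.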