"""Helpers for fetching sample datasets.

Useful for:

* Testing
* Building tutorials in the documentation.
"""
import hashlib
import os
from urllib.request import urlretrieve

import xarray as xr

from esmlab import config

_default_cache_dir = config.get('esmlab.sample_data_dir')


def file_md5_checksum(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is read in fixed-size chunks so that large data files do not
    have to be held in memory all at once.

    Parameters
    ----------
    fname : str
        Path of the file to hash.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the file contents.
    """
    hash_md5 = hashlib.md5()
    with open(fname, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def open_dataset(
    name,
    cache=True,
    cache_dir=_default_cache_dir,
    github_url='https://github.com/NCAR/esmlab-data',
    branch='master',
    **kwargs
):
    """Load a dataset from the online repository (requires access to internet).

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name : str
        Name of the netcdf file containing the dataset
        ie. 'air_temperature'
    cache : boolean, optional
        If True, then cache data locally for use on subsequent calls.
        If False, the dataset is loaded into memory and the local data
        file is removed before returning.
    cache_dir : string, optional
        The directory in which to search for and write cached data.
    github_url : string
        Github repository where the data is stored
    branch : string
        The git branch to download from
    kwargs : dict, optional
        Passed to xarray.open_dataset

    Returns
    -------
    The object returned by ``xarray.open_dataset`` for the local file
    (loaded into memory when ``cache`` is False).

    Raises
    ------
    IOError
        If the MD5 checksum of a freshly downloaded file does not match the
        checksum published alongside it in the repository.
    """
    longdir = os.path.expanduser(cache_dir)
    fullname = name + '.nc'
    localfile = os.path.join(longdir, fullname)
    md5name = name + '.md5'
    md5file = os.path.join(longdir, md5name)

    if not os.path.exists(localfile):

        # This will always leave this directory on disk.
        # makedirs also creates missing parent directories, which a plain
        # mkdir would fail on.
        os.makedirs(longdir, exist_ok=True)

        data_url = '/'.join((github_url, 'raw', branch, fullname))
        md5_url = '/'.join((github_url, 'raw', branch, md5name))
        try:
            urlretrieve(data_url, localfile)
            urlretrieve(md5_url, md5file)
        except Exception:
            # Do not leave a partial or unverified data file behind: later
            # calls would find it and use it without any checksum check.
            if os.path.exists(localfile):
                os.remove(localfile)
            raise

        localmd5 = file_md5_checksum(localfile)
        with open(md5file, 'r') as f:
            # Tolerate trailing newlines and "<hash>  <filename>" content by
            # comparing only the first whitespace-delimited token.
            tokens = f.read().split()
        remotemd5 = tokens[0].lower() if tokens else ''
        if localmd5 != remotemd5:
            os.remove(localfile)
            msg = """ MD5 checksum does not match, try downloading dataset again. """
            raise IOError(msg)

    ds = xr.open_dataset(localfile, **kwargs)

    if not cache:
        # Pull the data into memory before deleting the backing file.
        ds = ds.load()
        os.remove(localfile)

    return ds