import numpy as np import xarray as xr from scipy.signal import filtfilt, gaussian from scipy import stats from dask.array import coarsen, ones_like from dask.array.core import Array import warnings from xarrayutils.utilities import detect_dtype from xarrayutils.filtering import filter_1D as filter_1D_refactored """ Collection of several useful routines for xarray """ # Needs testing def shift_lon(ds, londim, shift=360, crit=0, smaller=True, sort=True): ds = ds.copy() lon = ds[londim].data if smaller: lon[lon < crit] = lon[lon < crit] + shift else: lon[lon > crit] = lon[lon > crit] + shift ds[londim].data = lon if sort: ds = ds.sortby(londim) return ds def xr_linregress(x, y, dim="time"): """Calculates linear regression along dimension `dim`. Results are equivalent to `scipy.stats.linregress`. Parameters ---------- x : {xr.DataArray} Independent variable for linear regression. E.g. time. y : {xr.DataArray, xr.Dataset} Dependent variable. dim : str Dimension over which to perform linear regression. Must be present in both `a` and `b` (the default is 'time'). Returns ------- type(b) Returns a dataarray containing the parameter values for each data_variable in `b`. The naming convention follows `scipy.stats.linregress` """ # align the nan Values before... x = x.where(~np.isnan(y)) y = y.where(~np.isnan(x)) # TODO: think about making this optional? Right now I err on the side of caution # Inspired by this post https://stackoverflow.com/a/60352716 but adjusted, so that # results are exactly as with scipy.stats.linregress for 1d vectors. n = y.notnull().sum(dim) nanmask = np.isnan(y).all(dim) xmean = x.mean(dim) ymean = y.mean(dim) xstd = x.std(dim) ystd = y.std(dim) cov = ((x - xmean) * (y - ymean)).sum(dim) / (n) cor = cov / (xstd * ystd) slope = cov / (xstd ** 2) intercept = ymean - xmean * slope df = n - 2 TINY = 1.0e-20 tstats = cor * np.sqrt(df / ((1.0 - cor + TINY) * (1.0 + cor + TINY))) stderr = slope / tstats pval = ( xr.apply_ufunc( stats.distributions.t.sf, abs(tstats), df, dask="parallelized", output_dtypes=[y.dtype], ) * 2 ) return xr.Dataset( { "slope": slope, "intercept": intercept, "r_value": cor.fillna(0).where(~nanmask), "p_value": pval, "std_err": stderr.where(~np.isinf(stderr), 0), } ) def linear_trend(obj, dim): """Convenience wrapper for 'xr_linregress'. Calculates the trend per given timestep. E.g. if the data is passed as yearly values, the trend is in units/yr. """ x = xr.DataArray( np.arange(len(obj[dim])).astype(np.float), dims=dim, coords={dim: obj[dim]} ) trend = xr_linregress(x, obj, dim=dim) return trend def _lin_trend_legacy(y): """ufunc to be used by linear_trend""" x = np.arange(len(y)) return np.polyfit(x, y, 1) def aggregate_w_nanmean(da, weights, blocks, **kwargs): """ weighted nanmean for xarrays """ # make sure that the missing values are exactly equal to each otherwise weights = weights.where(~np.isnan(da)) if not np.all(np.isnan(da) == np.isnan(weights)): raise RuntimeError( "weights cannot have more missing values \ then the data array" ) weights_sum = aggregate(weights, blocks, func=np.nansum, **kwargs) da_sum = aggregate(da * weights, blocks, func=np.nansum, **kwargs) return da_sum / weights_sum def aggregate(da, blocks, func=np.nanmean, debug=False): """ Performs efficient block averaging in one or multiple dimensions. Only works on regular grid dimensions. Parameters ---------- da : xarray DataArray (must be a dask array!) blocks : list List of tuples containing the dimension and interval to aggregate over func : function Aggregation function.Defaults to numpy.nanmean Returns ------- da_agg : xarray Data Aggregated array Examples -------- >>> from xarrayutils import aggregate >>> import numpy as np >>> import xarray as xr >>> import matplotlib.pyplot as plt >>> %matplotlib inline >>> import dask.array as da >>> x = np.arange(-10,10) >>> y = np.arange(-10,10) >>> xx,yy = np.meshgrid(x,y) >>> z = xx**2-yy**2 >>> a = xr.DataArray(da.from_array(z, chunks=(20, 20)), coords={'x':x,'y':y}, dims=['y','x']) >>> print a <xarray.DataArray 'array-7e422c91624f207a5f7ebac426c01769' (y: 20, x: 20)> dask.array<array-7..., shape=(20, 20), dtype=int64, chunksize=(20, 20)> Coordinates: * y (y) int64 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 * x (x) int64 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 >>> blocks = [('x',2),('y',5)] >>> a_coarse = aggregate(a,blocks,func=np.mean) >>> print a_coarse <xarray.DataArray 'array-7e422c91624f207a5f7ebac426c01769' (y: 2, x: 10)> dask.array<coarsen..., shape=(2, 10), dtype=float64, chunksize=(2, 10)> Coordinates: * y (y) int64 -10 0 * x (x) int64 -10 -8 -6 -4 -2 0 2 4 6 8 Attributes: Coarsened with: <function mean at 0x111754230> Coarsenblocks: [('x', 2), ('y', 10)] """ # Check if the input is a dask array (I might want to convert this # automaticlaly in the future) if not isinstance(da.data, Array): raise RuntimeError("data array data must be a dask array") # Check data type of blocks # TODO write test if not all(isinstance(n[0], str) for n in blocks) or not all( isinstance(n[1], int) for n in blocks ): print("blocks input", str(blocks)) raise RuntimeError( "block dimension must be dtype(str), \ e.g. ('lon',4)" ) # Check if the given array has the dimension specified in blocks try: block_dict = dict((da.get_axis_num(x), y) for x, y in blocks) except ValueError: raise RuntimeError("'blocks' contains non matching dimension") # Check the size of the excess in each aggregated axis blocks = [(a[0], a[1], da.shape[da.get_axis_num(a[0])] % a[1]) for a in blocks] # for now default to trimming the excess da_coarse = coarsen(func, da.data, block_dict, trim_excess=True) # for now default to only the dims new_coords = dict([]) # for cc in da.coords.keys(): warnings.warn("WARNING: only dimensions are carried over as coordinates") for cc in list(da.dims): new_coords[cc] = da.coords[cc] for dd in blocks: if dd[0] in list(da.coords[cc].dims): new_coords[cc] = new_coords[cc].isel( **{dd[0]: slice(0, -(1 + dd[2]), dd[1])} ) attrs = {"Coarsened with": str(func), "Coarsenblocks": str(blocks)} da_coarse = xr.DataArray( da_coarse, dims=da.dims, coords=new_coords, name=da.name, attrs=attrs ) return da_coarse def fancymean(raw, dim=None, axis=None, method="arithmetic", weights=None, debug=False): """ extenden mean function for xarray Applies various methods to estimate mean values {arithmetic,geometric,harmonic} along specified dimension with optional weigthing values, which can be a coordinate in the passed xarray structure """ if not isinstance(raw, xr.Dataset) and not isinstance(raw, xr.DataArray): raise RuntimeError("input needs to be xarray structure") # map dim to axis so this works on ndarrays and DataArray/Dataset # Below is the preferred way when passing a LOT of optional values # and when this is implemented as a class function # dim = kwargs.pop('dim', None)s # if dim is not None: # if 'axis' in kwargs: # raise ValueError('cannot set both `dim` and `axis`') # kwargs['axis'] = self.get_axis_num(dim) # For now I will add this in a simple way if dim is not None: if axis is not None: raise ValueError("cannot set both `dim` and `axis`") if isinstance(raw, xr.Dataset): axis = raw[raw.data_vars.keys()[0]].get_axis_num(dim) if debug: print("dim ", dim, " changed to axis ", axis) elif isinstance(raw, xr.DataArray): axis = raw.get_axis_num(dim) if debug: print("dim ", dim, " changed to axis ", axis) if debug: print("axis", axis) if weights is None: w = 1 elif isinstance(weights, str): w = raw[weights] elif isinstance(weights, np.ndarray): w = xr.DataArray(np.ones_like(raw.data), coords=raw.coords, dims=raw.dims) # make sure the w array is the same size as the raw array # This way also nans will be propagated correctly in a bidirectional way ones = raw.copy() ones = (ones * 0) + 1 w = w * ones # now transpose the w array to the same axisorder as raw order = raw.dims w = w.transpose(*order) if method == "arithmetic": up = raw * w down = w out = up.sum(axis=axis) / down.sum(axis=axis) elif method == "geometric": w = w.where(raw > 0) raw = raw.where(raw > 0) up = np.log10(raw) * w down = w out = 10 ** (up.sum(axis=axis) / down.sum(axis=axis)) elif method == "harmonic": w = w.where(raw != 0) raw = raw.where(raw != 0) up = w / raw down = w out = down.sum(axis=axis) / up.sum(axis=axis) if debug: print("w", w.shape) print("raw", raw.shape) print("up", up.shape) print("down", down.shape) print("out", out.shape) return out def timefilter( xr_in, steps, step_spec, timename="time", filtertype="gaussian", stdev=0.1 ): timedim = xr_in.dims.index(timename) dt = np.diff(xr_in.time.data[0:2])[0] cut_dt = np.timedelta64(steps, step_spec) if filtertype == "gaussian": win_length = (cut_dt / dt).astype(int) a = [1.0] win = gaussian(win_length, std=(float(win_length) * stdev)) b = win / win.sum() if np.nansum(win) == 0: raise RuntimeError("window to short for time interval") print("win_length", str(win_length)) print("stddev", str(stdev)) print("win", str(win)) filtered = filtfilt(b, a, xr_in.data, axis=timedim, padtype=None, padlen=0) out = xr.DataArray( filtered, dims=xr_in.dims, coords=xr_in.coords, attrs=xr_in.attrs ) out.attrs.update({"filterlength": (steps, step_spec), "filtertype": filtertype}) if xr_in.name: out.name = xr_in.name + "_lowpassed" return out def extractBox(da, box, xdim="lon", ydim="lat"): print("This is deprecated. Use extractBox_dict") box_dict = {xdim: box[0, 1], ydim: box[2, 3]} return extractBox_dict(da, box_dict, concat_wrap=True) # box_dict = {xdim: slice(box[0], box[1]), # ydim: slice(box[2], box[3])} # return da.loc[box_dict] def extractBox_dict(ds, box, concat_wrap=True): """Advanced box extraction from xarray Dataset""" if not isinstance(concat_wrap, dict): concat_wrap_dict = dict() for kk in box.keys(): concat_wrap_dict[kk] = concat_wrap concat_wrap = concat_wrap_dict ds = ds.copy() for dim, ind in box.items(): wrap = concat_wrap[dim] if np.diff(ind) < 0: # This would trigger a python 2 error # if (ind[1] - ind[0]) < 0: # box is defined over a discontinuity dim_data = ds[dim].data split_a = dim_data[dim_data > ind[0]].max() split_b = dim_data[dim_data < ind[1]].min() a = ds.loc[{dim: slice(ind[0], split_a)}] b = ds.loc[{dim: slice(split_b, ind[1])}] if wrap: c = (a, b) else: c = (b, a) ds = xr.concat(c, dim) else: ds = ds.loc[{dim: slice(ind[0], ind[1])}] return ds # This will be deprecated def extractBoxes(da, bo, xname=None, yname=None, xdim="lon", ydim="lat"): raise RuntimeWarning("Hard deprecated. Please use extractBox_dict instead") # def extractBoxes(da, bo, xname=None, yname=None, xdim='lon', ydim='lat'): # """ Extracts boxes from DataArray # # # Keyword arguments: # da -- xarray dataarray # bo -- dict with box name as keys and box corner # values as numpy array ([x0,x1,y0,y1]) # xdim -- dimension name for x (default: 'lon') # ydim -- dimension name for y (default: 'lat') # # # xname -- coordinate name for x (default: 'None') # yname -- coordinate name for y (default: 'None') # xname and yname have to be specified if coordinates are of differnt shape # """ # raise RuntimeError("this function is hellla slow! DO NOT use on \ # large datasets") # # if not type(xname) == type(yname): # raise RuntimeError('xname and yname need to be the same type') # # timeseries = [] # for ii, bb in enumerate(bo.keys()): # box = bo[bb] # if xname is None: # box_dict = {xdim: slice(box[0], box[1]), # ydim: slice(box[2], box[3])} # temp = da.loc[box_dict] # else: # mask = np.logical_and(np.logical_and(da[xname] > box[0], # da[xname] < box[1]), # np.logical_and(da[yname] > box[2], # da[yname] < box[3])) # temp = da.where(mask) # # timeseries.append(temp) # boxname_dim = concat_dim_da(list(bo.keys()), 'boxname') # out = xr.concat(timeseries, boxname_dim) # return out # Mapping related stuff def dll_dist(dlon, dlat, lon, lat): """Converts lat/lon differentials into distances PARAMETERS ---------- dlon : xarray.DataArray longitude differentials dlat : xarray.DataArray latitude differentials lon : xarray.DataArray longitude values lat : xarray.DataArray latitude values RETURNS ------- dx : xarray.DataArray distance inferred from dlon dy : xarray.DataArray distance inferred from dlat """ dll_factor = 111000.0 dx = dlon * xr.ufuncs.cos(xr.ufuncs.deg2rad(lat)) * dll_factor dy = ((lon * 0) + 1) * dlat * dll_factor return dx, dy # TODO: This needs a test and perhaps I can refactor it into a 'budget tools # Module' def convert_flux_array(da, da_full, dim, top=True, fillval=0): dummy = xr.DataArray( ones_like(da_full.data) * fillval, coords=da_full.coords, dims=da_full.dims ) if top: da.coords[dim] = da_full[dim][0] dummy_cut = dummy[{dim: slice(1, None)}] out = xr.concat([da, dummy_cut], dim=dim) else: da.coords[dim] = da_full[dim][-1] dummy_cut = dummy[{dim: slice(0, -1)}] out = xr.concat([dummy_cut, da], dim=dim) return out def composite(data, index, bounds): """ Composites Dataarray according to index Parameters ---------- data : xarray.Dataarray index : xarray.Dataarray Timeseries matching one dimension of 'data'. Values lower(higher) then 'bounds' are composited in additional coordinate bounds : int or array_like Values determining the values of 'index' composited into ['low','neutral','high']. If given as int, bounds will be computed as [-std(index) std(index)]*bounds. Returns ------- composited_array : array_like xarray like data with additional composite-coordinate ['low','neutral','high'] based on 'bounds' Examples -------- TODO """ if isinstance(bounds, int): bounds = float(bounds) if isinstance(bounds, float): bounds = [-bounds * np.std(index), bounds * np.std(index)] if len(bounds) != 2: raise RuntimeError("bounds can only have 1 or two elements") comp_name = "composite" zones = [ index >= bounds[1], np.logical_and(index < bounds[1], index >= bounds[0]), index < bounds[0], ] zones_coords = ["high", "neutral", "low"] out = xr.concat([data.where(z) for z in zones], comp_name) out[comp_name] = zones_coords counts = np.array([a.sum().data for a in zones]) out.coords["counts"] = xr.DataArray(counts, coords=[out[comp_name]]) out.attrs["IndexName"] = index.name out.attrs["CompositeBounds"] = bounds return out def corrmap( a, b, shifts=0, a_x_dim="i", a_y_dim="j", a_x_coord=None, a_y_coord=None, b_x_dim="i", b_y_dim="j", b_x_coord=None, b_y_coord=None, t_dim="time", debug=True, ): """ a -- input b -- target () TODO This thing is slow. I can most likely rewrite this with \ numpy.apply_along_axis """ from scipy.stats import linregress if not type(a_x_coord) == type(a_y_coord): raise RuntimeError("a_x_coord and a_y_coord need to be the same type") if not type(b_x_coord) == type(b_y_coord): raise RuntimeError("a_x_coord and a_y_coord need to be the same type") if isinstance(shifts, int): shifts = [shifts] # determine if the timseries is a timeseries or a 3d array if len(b.shape) == 3: arrayswitch = True elif len(b.shape) == 1: arrayswitch = False else: raise RuntimeWarning( "this only works with a timseries \ or map of timeseries" ) # shift timeseries slope = [] corr = [] p_value = [] for sh, shift in enumerate(shifts): shifted_b = b.shift(time=shift) s = a.mean(dim=t_dim).copy() s[:] = np.nan s.name = a.name + " regressed onto " + b.name c = a.mean(dim=t_dim).copy() c[:] = np.nan c.name = "Corr coeff " + a.name + "/" + b.name p = a.mean(dim=t_dim).copy() p[:] = np.nan p.name = "p value " + a.name + "/" + b.name for ii in range(len(a[a_x_dim])): for jj in range(len(a[a_y_dim])): # Define the 'input' (position in a) correctly, accounting for # the possibility that the # lat/lon position can be defined in the coordinates # or dimensions # interp timeseries onto the data.time in_a = a[{a_x_dim: ii, a_y_dim: jj}] if arrayswitch: if not a_x_coord: in_x = in_a[a_x_dim].data in_y = in_a[a_y_dim].data else: in_x = in_a[a_x_coord].data in_y = in_a[a_y_coord].data # rename the dimensions so it can be reindexed if not b_x_coord: in_b = xr.DataArray( shifted_b.data, coords={ "xdim": shifted_b[b_x_dim].data, "ydim": shifted_b[b_y_dim].data, "time": shifted_b.time.data, }, dims=["time", "ydim", "xdim"], ) else: raise RuntimeError("Not implemented yet") # This would have to be acomplished by a mask of some # sort # (with some tolerance around the input position) # extract the matching timeseries in_b = in_b.sel(xdim=in_x, ydim=in_y, method="nearest") reindexed_b = in_b.reindex_like(in_a.time, method="nearest") else: reindexed_b = shifted_b.reindex_like(in_a.time, method="nearest") x = reindexed_b.data y = in_a.data idx = np.logical_and(~np.isnan(y), ~np.isnan(x)) if y[idx].size: ( s[{a_x_dim: ii, a_y_dim: jj}], _, c[{a_x_dim: ii, a_y_dim: jj}], p[{a_x_dim: ii, a_y_dim: jj}], _, ) = linregress(x[idx], y[idx]) slope.append(s) corr.append(c) p_value.append(p) out_s = xr.concat(slope, "timeshifts") out_s["timeshifts"] = shifts # !!! I think this is a bug...this should be # possible with out_c = xr.concat(corr, "timeshifts") out_c["timeshifts"] = shifts out_p = xr.concat(p_value, "timeshifts") out_p["timeshifts"] = shifts return out_c, out_p, out_s def concat_dim_da(data, name): """creates an xarray.Dataarray to label the concat dim in xarray.concat. data is the dimension array and name is the name (DuHHHHH)""" return xr.DataArray(data, dims=[name], coords={name: (name, data)}, name=name) def xr_detrend(b, dim="time", trend_params=None, convert_datetime=True): """Removes linear trend along dimension `dim` from dataarray `b`. If no `trend_params` are passed (default), the linear trend is calculated using `xr_linregress`. Parameters ---------- b : {xr.DataArray, xr.Dataset} Data source to be detrended. dim : str Dimension along which to remove linear trend trend_params: {xr.DataArray, xr.Dataset, None} Precomputed output of xr_linregress. This can be usefull for large datasets where intermediate results are saved already. Defaults to None, meaning the linear trend is computed within the function. convert_datetime: bool If true (default), the dimension `dim` is converted from a datetime to float. """ if convert_datetime: t_data = b[dim].astype(np.float64) else: t_data = b[dim] if trend_params is None: out = xr_linregress(t_data, b) else: out = trend_params # Create new time dataarray trend_full = t_data * out.slope + out.intercept trend_full = trend_full.assign_coords({dim: b[dim].data}) return b - trend_full def lag_and_combine(ds, lags, dim="time"): """Creates lagged versions of the input object, combined along new `lag` dimension. NOTE: Lagging produces missing values at boundary. Use `.fillna(...)` to avoid problems with e.g. xr_linregress. Parameters ---------- ds : {xr.DataArray, xr.Dataset} Input object lags : np.Array Lags to be computed and combined. Values denote number of timesteps. Negative lag indicates a shift backwards (to the left of the axis). dim : str dimension of `ds` to be lagged Returns ------- {xr.DataArray, xr.Dataset} Lagged version of `ds` with additional dimension `lag` """ datasets = [] for ll in lags: datasets.append(ds.shift(**{dim: ll})) return xr.concat(datasets, dim=concat_dim_da(lags, "lag")) ################## # Refactored stuff def filter_1D(data, std, dim="time", dtype=None): warnings.warn( "This version of 1D filter is outdated. \ Please import from xarrayutils.filtering", DeprecationWarning, ) return filter_1D_refactored(data, std, dim="time", dtype=None)