python source code of run

# -*- coding: utf-8 -*-
"""
Run length algorithms submodule
===============================

Computation of statistics on runs of True values in boolean arrays.
"""
from datetime import datetime
from functools import partial
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from warnings import warn

import dask.array as dsk
import numpy as np
import xarray as xr

npts_opt = 9000


def get_npts(da: xr.DataArray) -> int:
    """Return the number of gridpoints in a DataArray.

    Parameters
    ----------
    da : xarray.DataArray
      N-dimensional input array

    Returns
    -------
    int
      Product of input DataArray coordinate sizes excluding the dimension 'time'
    """
    coords = list(da.coords)
    coords.remove("time")
    npts = 1
    for c in coords:
        npts *= da[c].size
    return npts


def rle(da: xr.DataArray, dim: str = "time", max_chunk: int = 1_000_000):
    n = len(da[dim])
    i = xr.DataArray(np.arange(da[dim].size), dims=dim).chunk({"time": 1})
    ind = xr.broadcast(i, da)[0].chunk(da.chunks)
    b = ind.where(~da)  # find indexes where false
    end1 = (
        da.where(b[dim] == b[dim][-1], drop=True) * 0 + n
    )  # add additional end value index (deal with end cases)
    start1 = (
        da.where(b[dim] == b[dim][0], drop=True) * 0 - 1
    )  # add additional start index (deal with end cases)
    b = xr.concat([start1, b, end1], dim)

    # Ensure bfill operates on entire (unchunked) time dimension
    # Determine appropraite chunk size for other dims - do not exceed 'max_chunk' total size per chunk (default 1000000)
    ndims = len(b.shape)
    chunk_dim = b[dim].size
    # divide extra dims into equal size
    # Note : even if calculated chunksize > dim.size result will have chunk==dim.size
    chunksize_ex_dims = None
    if ndims > 1:
        chunksize_ex_dims = np.round(np.power(max_chunk / chunk_dim, 1 / (ndims - 1)))
    chunks = dict()
    chunks[dim] = -1
    for dd in b.dims:
        if dd != dim:
            chunks[dd] = chunksize_ex_dims
    b = b.chunk(chunks)

    # back fill nans with first position after
    z = b.bfill(dim=dim)
    # calculate lengths
    d = z.diff(dim=dim) - 1
    d = d.where(d >= 0)
    return d


def longest_run(
    da: xr.DataArray, dim: str = "time", ufunc_1dim: Union[str, bool] = "auto"
):
    """Return the length of the longest consecutive run of True values.

    Parameters
    ----------
    da : xr.DataArray
      N-dimensional array (boolean)
    dim : str
      Dimension along which to calculate consecutive run; Default: 'time'.
    ufunc_1dim : Union[str, bool]
      Use the 1d 'ufunc' version of this function : default (auto) will attempt to select optimal
      usage based on number of data points.  Using 1D_ufunc=True is typically more efficient
      for DataArray with a small number of grid points.
    Returns
    -------
    N-dimensional array (int)
      Length of longest run of True values along dimension
    """
    if ufunc_1dim == "auto":
        npts = get_npts(da)
        ufunc_1dim = npts <= npts_opt

    if ufunc_1dim:
        rl_long = longest_run_ufunc(da)
    else:
        d = rle(da, dim=dim)
        rl_long = d.max(dim=dim)

    return rl_long


def windowed_run_events(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    ufunc_1dim: Union[str, bool] = "auto",
) -> xr.DataArray:
    """Return the number of runs of a minimum length.

    Parameters
    ----------
    da: xr.DataArray
      Input N-dimensional DataArray (boolean)
    window : int
      Minimum run length.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').
    ufunc_1dim : Union[str, bool]
      Use the 1d 'ufunc' version of this function : default (auto) will attempt to select optimal
      usage based on number of data points.  Using 1D_ufunc=True is typically more efficient
      for dataarray with a small number of gridpoints.
    Returns
    -------
    xr.DataArray
      Number of distinct runs of a minimum length (int).
    """
    if ufunc_1dim == "auto":
        npts = get_npts(da)
        ufunc_1dim = npts <= npts_opt

    if ufunc_1dim:
        out = windowed_run_events_ufunc(da, window)
    else:
        d = rle(da, dim=dim)
        out = (d >= window).sum(dim=dim)
    return out


def windowed_run_count(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    ufunc_1dim: Union[str, bool] = "auto",
) -> xr.DataArray:
    """Return the number of consecutive true values in array for runs at least as long as given duration.

    Parameters
    ----------
    da: xr.DataArray
      Input N-dimensional DataArray (boolean).
    window : int
      Minimum run length.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').
    ufunc_1dim : Union[str, bool]
      Use the 1d 'ufunc' version of this function : default (auto) will attempt to select optimal
      usage based on number of data points. Using 1D_ufunc=True is typically more efficient
      for dataarray with a small number of gridpoints.

    Returns
    -------
    xr.DataArray
      Total number of true values part of a consecutive runs of at least `window` long.
    """
    if ufunc_1dim == "auto":
        npts = get_npts(da)
        ufunc_1dim = npts <= npts_opt

    if ufunc_1dim:
        out = windowed_run_count_ufunc(da, window)
    else:
        d = rle(da, dim=dim)
        out = d.where(d >= window, 0).sum(dim=dim)
    return out


def first_run(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    coord: Optional[Union[str, bool]] = False,
    ufunc_1dim: Union[str, bool] = "auto",
):
    """Return the index of the first item of the first run of at least a given length.

    Parameters
    ----------
    da : xr.DataArray
      Input N-dimensional DataArray (boolean)
    window : int
      Minimum duration of consecutive run to accumulate values.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[str]
      If not False, the function returns values along `dim` instead of indexes.
      If `dim` has a datetime dtype, `coord` can also be a str of the name of the
      DateTimeAccessor object to use (ex: 'dayofyear').
    ufunc_1dim : Union[str, bool]
      Use the 1d 'ufunc' version of this function : default (auto) will attempt to select optimal
      usage based on number of data points.  Using 1D_ufunc=True is typically more efficient
      for dataarray with a small number of gridpoints.

    Returns
    -------
    out : xr.DataArray
      Index (or coordinate if `coord` is not False) of first item in first valid run. Returns np.nan if there are no valid run.
    """
    if ufunc_1dim == "auto":
        if isinstance(da.data, dsk.Array) and len(da.chunks[da.dims.index(dim)]) > 1:
            ufunc_1dim = False
        else:
            npts = get_npts(da)
            ufunc_1dim = npts <= npts_opt

    da = da.fillna(0)  # We expect a boolean array, but there could be NaNs nonetheless

    if ufunc_1dim:
        out = first_run_ufunc(x=da, window=window, dim=dim)

    else:
        da = da.astype("int")
        i = xr.DataArray(np.arange(da[dim].size), dims=dim)
        ind = xr.broadcast(i, da)[0].transpose(*da.dims)
        if isinstance(da.data, dsk.Array):
            ind = ind.chunk(da.chunks)
        wind_sum = da.rolling(time=window).sum(allow_lazy=True, skipna=False)
        out = ind.where(wind_sum >= window).min(dim=dim) - (
            window - 1
        )  # remove window -1 as rolling result index is last element of the moving window

    if coord:
        crd = da[dim]
        if isinstance(coord, str):
            crd = getattr(crd.dt, coord)

        out = lazy_indexing(crd, out)

    if dim in out.coords:
        out = out.drop_vars(dim)

    return out


def last_run(
    da: xr.DataArray,
    window: int,
    dim: str = "time",
    coord: Optional[Union[str, bool]] = False,
    ufunc_1dim: Union[str, bool] = "auto",
):
    """Return the index of the last item of the last run of at least a given length.

    Parameters
    ----------
    da : xr.DataArray
      Input N-dimensional DataArray (boolean)
    window : int
      Minimum duration of consecutive run to accumulate values.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[str]
      If not False, the function returns values along `dim` instead of indexes.
      If `dim` has a datetime dtype, `coord` can also be a str of the name of the
      DateTimeAccessor object to use (ex: 'dayofyear').
    ufunc_1dim : Union[str, bool]
      Use the 1d 'ufunc' version of this function : default (auto) will attempt to select optimal
      usage based on number of data points.  Using `1D_ufunc=True` is typically more efficient
      for a DataArray with a small number of grid points.

    Returns
    -------
    out : xr.DataArray
      Index (or coordinate if `coord` is not False) of last item in last valid run. Returns np.nan if there are no valid run.
    """
    reversed_da = da.sortby(dim, ascending=False)
    out = first_run(
        reversed_da, window=window, dim=dim, coord=coord, ufunc_1dim=ufunc_1dim
    )
    if not coord:
        return reversed_da[dim].size - out - 1
    return out


def run_length_with_date(
    da: xr.DataArray, window: int, date: str = "07-01", dim: str = "time",
):
    """Return the length of the longest consecutive run of True values found
    to be semi-continuous before and after a given date.

    Parameters
    ----------
    da : xr.DataArray
      Input N-dimensional DataArray (boolean)
    window : int
      Minimum duration of consecutive run to accumulate values.
    date:
      The date that a run must include to be considered valid.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').

    Returns
    -------
    out : xr.DataArray
      Length of longest run of True values along a given dimension inclusive of a given date.

    Notes
    -----
    The run can include holes of False or NaN values, so long as they do not exceed the window size.
    """
    include_date = datetime.strptime(date, "%m-%d").timetuple().tm_yday

    mid_index = np.where(da.time.dt.dayofyear == include_date)[0]
    if mid_index.size == 0:  # The date is not within the group. Happens at boundaries.
        return xr.full_like(da.isel(time=0), np.nan, float).drop_vars("time")

    end = first_run(
        (~da).where(da.time >= da.time[mid_index][0]), window=window, dim=dim,
    )
    beg = first_run(da, window=window, dim=dim)
    sl = end - beg
    sl = xr.where(beg.isnull() & end.notnull(), 0, sl)  # If series is never triggered
    sl = xr.where(
        beg.notnull() & end.isnull(), da.time.size - beg, sl
    )  # If series is not ended by end of resample time frequency
    return sl.where(sl >= 0)


def run_end_after_date(
    da: xr.DataArray,
    window: int,
    date: str = "07-01",
    dim: str = "time",
    coord: Optional[Union[bool, str]] = "dayofyear",
):
    """Return the index of the first item after the end of a run after a given date. The run must begin before the date.

    Parameters
    ----------
    da : xr.DataArray
      Input N-dimensional DataArray (boolean)
    window : int
      Minimum duration of consecutive run to accumulate values.
    date : str
      The date after which to look for the end of a run.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[Union[bool, str]]
      If not False, the function returns values along `dim` instead of indexes.
      If `dim` has a datetime dtype, `coord` can also be a str of the name of the
      DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    out : xr.DataArray
      Index (or coordinate if `coord` is not False) of last item in last valid run. Returns np.nan if there are no valid run.
    """
    after_date = datetime.strptime(date, "%m-%d").timetuple().tm_yday

    mid_idx = np.where(da.time.dt.dayofyear == after_date)[0]
    if mid_idx.size == 0:  # The date is not within the group. Happens at boundaries.
        return xr.full_like(da.isel(time=0), np.nan, float).drop_vars("time")

    end = first_run(
        (~da).where(da.time >= da.time[mid_idx][0]),
        window=window,
        dim=dim,
        coord=coord,
    )
    beg = first_run(da.where(da.time < da.time[mid_idx][0]), window=window, dim=dim)
    end = xr.where(
        end.isnull() & beg.notnull(), da.time.isel(time=-1).dt.dayofyear, end
    )
    return end.where(beg.notnull())


def first_run_after_date(
    da: xr.DataArray,
    window: int,
    date: str = "07-01",
    dim: str = "time",
    coord: Optional[Union[bool, str]] = "dayofyear",
):
    """Return the index of the first item of the first run after a given date.

    Parameters
    ----------
    da : xr.DataArray
      Input N-dimensional DataArray (boolean)
    window : int
      Minimum duration of consecutive run to accumulate values.
    date : str
      The date after which to look for the run.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[Union[bool, str]]
      If not False, the function returns values along `dim` instead of indexes.
      If `dim` has a datetime dtype, `coord` can also be a str of the name of the
      DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    out : xr.DataArray
      Index (or coordinate if `coord` is not False) of first item in the first valid run. Returns np.nan if there are no valid run.
    """
    after_date = datetime.strptime(date, "%m-%d").timetuple().tm_yday

    mid_idx = np.where(da.time.dt.dayofyear == after_date)[0]
    if mid_idx.size == 0:  # The date is not within the group. Happens at boundaries.
        return xr.full_like(da.isel(time=0), np.nan, float).drop_vars("time")

    return first_run(
        da.where(da.time >= da.time[mid_idx][0]), window=window, dim=dim, coord=coord,
    )


def last_run_before_date(
    da: xr.DataArray,
    window: int,
    date: str = "07-01",
    dim: str = "time",
    coord: Optional[Union[bool, str]] = "dayofyear",
):
    """Return the index of the last item of the last run before a given date.

    Parameters
    ----------
    da : xr.DataArray
      Input N-dimensional DataArray (boolean)
    window : int
      Minimum duration of consecutive run to accumulate values.
    date : str
      The date before which to look for the last event.
    dim : str
      Dimension along which to calculate consecutive run (default: 'time').
    coord : Optional[Union[bool, str]]
      If not False, the function returns values along `dim` instead of indexes.
      If `dim` has a datetime dtype, `coord` can also be a str of the name of the
      DateTimeAccessor object to use (ex: 'dayofyear').

    Returns
    -------
    out : xr.DataArray
      Index (or coordinate if `coord` is not False) of last item in last valid run. Returns np.nan if there are no valid run.
    """
    before_date = datetime.strptime(date, "%m-%d").timetuple().tm_yday

    mid_idx = np.where(da.time.dt.dayofyear == before_date)[0]
    if mid_idx.size == 0:  # The date is not within the group. Happens at boundaries.
        return xr.full_like(da.isel(time=0), np.nan, float).drop_vars("time")

    run = da.where(da.time <= da.time[mid_idx][0])
    return last_run(run, window=window, dim=dim, coord=coord)


def rle_1d(
    arr: Union[int, float, bool, Sequence[Union[int, float, bool]]]
) -> Tuple[np.array, np.array, np.array]:
    """Return the length, starting position and value of consecutive identical values.

    Parameters
    ----------
    arr : Sequence[Union[int, float, bool]]
      Array of values to be parsed.

    Returns
    -------
    values : np.array
      The values taken by arr over each run
    run lengths : np.array
      The length of each run
    start position : np.array
      The starting index of each run

    Examples
    --------
    >>> a = [1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3]
    >>> rle_1d(a)
    (array([1, 2, 3]), array([2, 4, 6]), array([0, 2, 6]))
    """
    ia = np.asarray(arr)
    n = len(ia)

    if n == 0:
        e = "run length array empty"
        warn(e)
        # Returning None makes some other 1d func below fail.
        return np.array(np.nan), 0, np.array(np.nan)

    y = np.array(ia[1:] != ia[:-1])  # pairwise unequal (string safe)
    i = np.append(np.where(y), n - 1)  # must include last element position
    rl = np.diff(np.append(-1, i))  # run lengths
    pos = np.cumsum(np.append(0, rl))[:-1]  # positions
    return ia[i], rl, pos


def first_run_1d(arr: Sequence[Union[int, float]], window: int) -> int:
    """Return the index of the first item of a run of at least a given length.

    Parameters
    ----------
    arr : Sequence[Union[int, float]]
      Input array
    window : int
      Minimum duration of consecutive run to accumulate values.

    Returns
    -------
    int
      Index of first item in first valid run. Returns np.nan if there are no valid run.
    """
    v, rl, pos = rle_1d(arr)
    ind = np.where(v * rl >= window, pos, np.inf).min()

    if np.isinf(ind):
        return np.nan
    return ind


def longest_run_1d(arr: Sequence[bool]) -> int:
    """Return the length of the longest consecutive run of identical values.

    Parameters
    ----------
    arr : Sequence[bool]
      Input array (bool)

    Returns
    -------
    int
      Length of longest run.
    """
    v, rl = rle_1d(arr)[:2]
    return np.where(v, rl, 0).max()


def windowed_run_count_1d(arr: Sequence[bool], window: int) -> int:
    """Return the number of consecutive true values in array for runs at least as long as given duration.

    Parameters
    ----------
    arr : Sequence[bool]
      Input array (bool)
    window : int
      Minimum duration of consecutive run to accumulate values.

    Returns
    -------
    int
      Total number of true values part of a consecutive run at least `window` long.
    """
    v, rl = rle_1d(arr)[:2]
    return np.where(v * rl >= window, rl, 0).sum()


def windowed_run_events_1d(arr: Sequence[bool], window: int):
    """Return the number of runs of a minimum length.

    Parameters
    ----------
    arr : Sequence[bool]
      Input array (bool)
    window : int
      Minimum run length.

    Returns
    -------
    out : func
      Number of distinct runs of a minimum length.
    """
    v, rl, pos = rle_1d(arr)
    return (v * rl >= window).sum()


def windowed_run_count_ufunc(x: Sequence[bool], window: int) -> xr.apply_ufunc:
    """Dask-parallel version of windowed_run_count_1d, ie the number of consecutive true values in
    array for runs at least as long as given duration.

    Parameters
    ----------
    x : Sequence[bool]
      Input array (bool)
    window : int
      Minimum duration of consecutive run to accumulate values.

    Returns
    -------
    out : func
      A function operating along the time dimension of a dask-array.
    """
    return xr.apply_ufunc(
        windowed_run_count_1d,
        x,
        input_core_dims=[["time"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.int],
        keep_attrs=True,
        kwargs={"window": window},
    )


def windowed_run_events_ufunc(x: Sequence[bool], window: int) -> xr.apply_ufunc:
    """Dask-parallel version of windowed_run_events_1d, ie the number of runs at least as long as given duration.

    Parameters
    ----------
    x : Sequence[bool]
      Input array (bool)
    window : int
      Minimum run length

    Returns
    -------
    out : func
      A function operating along the time dimension of a dask-array.
    """
    return xr.apply_ufunc(
        windowed_run_events_1d,
        x,
        input_core_dims=[["time"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.int],
        keep_attrs=True,
        kwargs={"window": window},
    )


def longest_run_ufunc(x: Sequence[bool]) -> xr.apply_ufunc:
    """Dask-parallel version of longest_run_1d, ie the maximum number of consecutive true values in
    array.

    Parameters
    ----------
    x : Sequence[bool]
      Input array (bool)

    Returns
    -------
    out : func
      A function operating along the time dimension of a dask-array.
    """
    return xr.apply_ufunc(
        longest_run_1d,
        x,
        input_core_dims=[["time"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.int],
        keep_attrs=True,
    )


def first_run_ufunc(x: xr.DataArray, window: int, dim: str = "time",) -> xr.apply_ufunc:
    """Dask-parallel version of first_run_1d, ie the first entry in array of consecutive true values.

    Parameters
    ----------
    x : xr.DataArray
      Input array (bool)
    window : int
    dim: Optional[str]

    Returns
    -------
    out : func
      A function operating along the time dimension of a dask-array.
    """

    ind = xr.apply_ufunc(
        first_run_1d,
        x,
        input_core_dims=[[dim]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.float],
        keep_attrs=True,
        kwargs={"window": window},
    )

    return ind


def lazy_indexing(da: xr.DataArray, index: xr.DataArray, dim=None):
    """Get values of `da` at indices `index` in a NaN-aware and lazy manner.

    The algorithm differs whether da is 1D or not.

    Parameters
    ----------
    da : xr.DataArray
      Input array. If not 1D, `dim` must be given and must not appear in index.
    index : xr.DataArray
      N-d integer indices, all dimensions of index must be in da
    dim : Dimension along which to index,
          unused if `da` is 1D, should not be present in `index`.

    Returns
    -------
    xr.DataArray
      Values of `da` at indices `index`
    """
    if da.ndim == 1:

        def _index_from_1d_array(array, indices):
            return array[
                indices,
            ]

        invalid = index.isnull()
        index = index.fillna(0).astype(int)
        func = partial(_index_from_1d_array, da)

        out = index.map_blocks(func)
        out = out.where(~invalid)
        if index.shape == ():
            out = out.drop_vars(da.dims[0])
        return out

    if dim is None:
        diff_dims = set(da.dims) - set(index.dims)
        if len(diff_dims) == 0:
            raise ValueError(
                "da must have at least one dimension more than index for lazy_indexing."
            )
        if len(diff_dims) > 1:
            raise ValueError(
                "If da has more than one dimension more than index, the indexing dim must be given through `dim`"
            )
        dim = diff_dims.pop()

    def _index_from_nd_array(array, indices):
        return np.take_along_axis(array, indices[..., np.newaxis], axis=-1)[..., 0]

    return xr.apply_ufunc(
        _index_from_nd_array,
        da,
        index,
        input_core_dims=[[dim], []],
        output_core_dims=[[]],
        dask="parallelized",
        output_dtypes=[da.dtype],
    )