python source code of timescan

# -*- coding: utf-8 -*-
# import stdlib modules
import os
from os.path import join as opj

from datetime import datetime
from datetime import timedelta
from calendar import isleap

import rasterio
import numpy as np
from scipy import stats

from ost.helpers import raster as ras
from ost.helpers import helpers as h


def remove_outliers(arrayin, stddev=3, z_threshold=None):

    if z_threshold:
        z_score = np.abs(stats.zscore(arrayin))
        array_out = np.ma.MaskedArray(
            arrayin,
            mask=z_score > z_threshold)
    else:

        # calculate percentiles
        perc95 = np.percentile(arrayin, 95, axis=0)
        perc5 = np.percentile(arrayin, 5, axis=0)

        # we mask out the percetile outliers for std dev calculation
        masked_array = np.ma.MaskedArray(
            arrayin,
            mask=np.logical_or(
                arrayin > perc95,
                arrayin < perc5
                )
            )

        # we calculate new std and mean
        masked_std = np.std(masked_array, axis=0)
        masked_mean = np.mean(masked_array, axis=0)

        # we mask based on mean +- 3 * stddev
        array_out = np.ma.MaskedArray(
            arrayin,
            mask=np.logical_or(
                arrayin > masked_mean + masked_std * stddev,
                arrayin < masked_mean - masked_std * stddev,
                )
            )

    return array_out


def date_as_float(date):
    size_of_day = 1. / 366.
    size_of_second = size_of_day / (24. * 60. * 60.)
    days_from_jan1 = date - datetime(date.year, 1, 1)
    if not isleap(date.year) and days_from_jan1.days >= 31+28:
        days_from_jan1 += timedelta(1)
    return date.year + days_from_jan1.days * size_of_day + days_from_jan1.seconds * size_of_second


def difference_in_years(start, end):
    return date_as_float(end) - date_as_float(start)


def deseasonalize(stack):
    
    percentiles = np.percentile(stack, 95, axis=[1,2])
    deseasoned = np.subtract(percentiles[:,np.newaxis], stack.reshape(stack.shape[0], -1))
    return deseasoned.reshape(stack.shape)


def _zvalue_from_index(arr, ind):
    """private helper function to work around the limitation of np.choose() by employing np.take()
    arr has to be a 3D array
    ind has to be a 2D array containing values for z-indicies to take from arr
    See: http://stackoverflow.com/a/32091712/4169585
    This is faster and more memory efficient than using the ogrid based solution with fancy indexing.
    """
    # get number of columns and rows
    _, nC, nR = arr.shape

    # get linear indices and extract elements with np.take()
    idx = nC * nR * ind + np.arange(nC*nR).reshape((nC,nR))
    return np.take(arr, idx)


def nan_percentile(arr, q):
    # taken from: https://krstn.eu/np.nanpercentile()-there-has-to-be-a-faster-way/
    # valid (non NaN) observations along the first axis
    valid_obs = np.sum(np.isfinite(arr), axis=0)
    # replace NaN with maximum
    max_val = np.nanmax(arr)
    arr[np.isnan(arr)] = max_val
    # sort - former NaNs will move to the end
    arr = np.sort(arr, axis=0)

    # loop over requested quantiles
    if type(q) is list:
        qs = []
        qs.extend(q)
    else:
        qs = [q]
    if len(qs) < 2:
        quant_arr = np.zeros(shape=(arr.shape[1], arr.shape[2]))
    else:
        quant_arr = np.zeros(shape=(len(qs), arr.shape[1], arr.shape[2]))

    result = []
    for i in range(len(qs)):
        quant = qs[i]
        # desired position as well as floor and ceiling of it
        k_arr = (valid_obs - 1) * (quant / 100.0)
        f_arr = np.floor(k_arr).astype(np.int32)
        c_arr = np.ceil(k_arr).astype(np.int32)
        fc_equal_k_mask = f_arr == c_arr

        # linear interpolation (like numpy percentile) takes the fractional part of desired position
        floor_val = _zvalue_from_index(arr=arr, ind=f_arr) * (c_arr - k_arr)
        ceil_val = _zvalue_from_index(arr=arr, ind=c_arr) * (k_arr - f_arr)

        quant_arr = floor_val + ceil_val
        quant_arr[fc_equal_k_mask] = _zvalue_from_index(arr=arr, ind=k_arr.astype(np.int32))[fc_equal_k_mask]  # if floor == ceiling take floor value

        result.append(quant_arr)

    return result


def mt_metrics(stack, out_prefix, metrics, rescale_to_datatype=False,
               to_power=False, outlier_removal=False, datelist=None):
    if type(rescale_to_datatype) == str:
        if rescale_to_datatype == 'True':
            rescale_to_datatype = True
        elif rescale_to_datatype == 'False':
            rescale_to_datatype = False
    if type(to_power) == str:
        if to_power == 'True':
            to_power = True
        elif to_power == 'False':
            to_power = False
    if type(outlier_removal) == str:
        if outlier_removal == 'True':
            outlier_removal = True
        elif outlier_removal == 'False':
            outlier_removal = False
    if type(metrics) == str:
        metrics = metrics.replace("'", '').strip('][').split(', ')
    if type(datelist) == str:
        datelist = datelist.replace("'", '').strip('][').split(', ')

    # from datetime import datetime
    with rasterio.open(stack) as src:

        harmonics = False
        if 'harmonics' in metrics:
            print(' INFO: Calculating harmonics')
            if not datelist:
                print(' WARNING: Harmonics need the datelist. Harmonics will not be calculated')
            else:
                harmonics = True
                metrics.remove('harmonics')
                metrics.extend(['amplitude', 'phase', 'residuals'])
        
        if 'percentiles' in metrics:
            metrics.remove('percentiles')
            metrics.extend(['p95', 'p5'])
            
        # get metadata
        meta = src.profile

        # update driver and reduced band count
        meta.update({'driver': 'GTiff'})
        meta.update({'count': 1})

        # write all different output files into a dictionary
        metric_dict = {}
        for metric in metrics:
            filename = '{}.{}.tif'.format(out_prefix, metric)
            metric_dict[metric] = rasterio.open(
                filename, 'w', **meta)

        # scaling factors in case we have to rescale to integer
        minimums = {'avg': -30, 'max': -30, 'min': -30,
                    'std': 0.00001, 'cov': 0.00001}
        maximums = {'avg': 5, 'max': 5, 'min': 5, 'std': 15, 'cov': 1}

        
        if harmonics:
            # construct independent variables
            dates, sines, cosines = [], [], []
            two_pi = np.multiply(2, np.pi)
            for date in sorted(datelist):
                
                delta = difference_in_years(datetime.strptime('700101',"%y%m%d"), datetime.strptime(date,"%y%m%d"))
                dates.append(delta)
                sines.append(np.sin(np.multiply(two_pi, delta - 0.5)))
                cosines.append(np.cos(np.multiply(two_pi, delta - 0.5)))
            
            X = np.array([dates, cosines, sines])
    
        # loop through blocks
        for _, window in src.block_windows(1):

            # read array with all bands
            stack = src.read(range(1, src.count + 1), window=window)

            if rescale_to_datatype is True and meta['dtype'] != 'float32':
                stack = ras.rescale_to_float(stack, meta['dtype'])

            # transform to power
            if to_power is True:
                stack = ras.convert_to_power(stack)

            # outlier removal (only applies if there are more than 5 bands)
            if outlier_removal is True and src.count >= 5:
                stack = remove_outliers(stack)

            # get stats
            arr = {}
            arr['p95'], arr['p5'] = (np.nan_to_num(nan_percentile(stack, [95, 5]))
                                     if 'p95' in metrics else (False, False))
            arr['median'] = (np.nan_to_num(np.nanmedian(stack, axis=0))
                          if 'median' in metrics else False)
            arr['avg'] = (np.nan_to_num(np.nanmean(stack, axis=0))
                          if 'avg' in metrics else False)
            arr['max'] = (np.nan_to_num(np.nanmax(stack, axis=0))
                          if 'max' in metrics else False)
            arr['min'] = (np.nan_to_num(np.nanmin(stack, axis=0))
                          if 'min' in metrics else False)
            arr['std'] = (np.nan_to_num(np.nanstd(stack, axis=0))
                          if 'std' in metrics else False)
            arr['cov'] = (np.nan_to_num(stats.variation(stack, axis=0,
                                                        nan_policy='omit'))
                          if 'cov' in metrics else False)
            
            if harmonics:
                
                stack_size = (stack.shape[1], stack.shape[2])
                if to_power is True:
                    y = ras.convert_to_db(stack).reshape(stack.shape[0], -1)
                else:
                    y = stack.reshape(stack.shape[0], -1)
                    
                x, residuals, _, _ = np.linalg.lstsq(X.T, y)
                arr['amplitude'] = np.hypot(x[1], x[2]).reshape(stack_size)
                arr['phase'] = np.arctan2(x[2], x[1]).reshape(stack_size)
                arr['residuals'] = np.sqrt(np.divide(residuals, stack.shape[0])).reshape(stack_size)
                
                
            # the metrics to be re-turned to dB, in case to_power is True
            metrics_to_convert = ['avg', 'min', 'max', 'p95', 'p5', 'median']

            # do the back conversions and write to disk loop
            for metric in metrics:
                
                if to_power is True and metric in metrics_to_convert:
                    arr[metric] = ras.convert_to_db(arr[metric])

                if rescale_to_datatype is True and meta['dtype'] != 'float32':
                    arr[metric] = ras.scale_to_int(arr[metric], meta['dtype'],
                                                   minimums[metric],
                                                   maximums[metric])

                # write to dest
                metric_dict[metric].write(
                    np.float32(arr[metric]), window=window, indexes=1)
                metric_dict[metric].update_tags(1, 
                    BAND_NAME='{}_{}'.format(os.path.basename(out_prefix), metric))
                metric_dict[metric].set_band_description(1, 
                    '{}_{}'.format(os.path.basename(out_prefix), metric))

    # close the output files
    for metric in metrics:
        # close rio opening
        metric_dict[metric].close()
        # construct filename
        filename = '{}.{}.tif'.format(out_prefix, metric)
        return_code = h.check_out_tiff(filename)
        if return_code != 0:
            # remove all files and return
            for metric in metrics:
                filename = '{}.{}.tif'.format(out_prefix, metric)
                os.remove(filename)
            
            return return_code
        
    if return_code == 0:
        dirname = os.path.dirname(out_prefix)
        check_file = opj(dirname, '.{}.processed'.format(os.path.basename(out_prefix)))
        with open(str(check_file), 'w') as file:
            file.write('passed all tests \n')