# Python source code of test_cross

import numpy as np
import pandas as pd
import pandas.util.testing as tm
import unittest

try:
    from timeseriescv.cross_validation import (BaseTimeSeriesCrossValidator, PurgedWalkForwardCV, CombPurgedKFoldCV, purge, embargo,
                                               compute_fold_bounds)
except:
    pass
try:
    from timeseriescv.cross_validation import (BaseTimeSeriesCrossValidator, PurgedWalkForwardCV, CombPurgedKFoldCV, purge, embargo,
                                               compute_fold_bounds)
except:
    pass
from unittest import TestCase


def create_random_sample_set(n_samples, time_shift='120m', randomize_times=False, freq='60T'):
    """
    Create an artificial sample set made of random features, prediction times and evaluation times.

    The prediction times form a regular grid of n_samples timestamps starting on 2000-01-01 and spaced by freq. The
    frame is built directly with numpy and pandas instead of going through the pandas testing helpers, so that
    n_samples is always honoured and no module-level state of pandas is modified as a side effect.

    n_samples is the number of samples to generate. time_shift is a string accepted by pd.Timedelta; it is the fixed
    delay between prediction and evaluation times, or the scale of the random delays when randomize_times is True.
    freq is a pandas frequency string giving the spacing of the prediction time grid.

    If randomize_times is True, each prediction time is moved back by |z| * time_shift and the evaluation time is set
    |z'| * time_shift after the shifted prediction time, where z and z' are independent standard normal draws.

    Returns the tuple (X, pred_times, exit_times): X is a data frame with columns 'A', 'B' and 'C', pred_times and
    exit_times are series named 'index' and 'index2'. All three share the same index and are sorted by prediction time.
    """
    # Parsed once; the same delay is applied to every sample.
    shift = pd.Timedelta(time_shift)
    # Regular grid of prediction times, stored in a column labeled 'index'
    test_df = pd.DataFrame({'index': pd.date_range('2000-01-01', periods=n_samples, freq=freq)})
    # Three columns of standard normal features
    for column in ('A', 'B', 'C'):
        test_df[column] = np.random.randn(n_samples)
    if randomize_times:
        # Subtract random time deltas from the grid to get the prediction times, then add random time deltas to the
        # prediction times to get the evaluation times, so that an evaluation never precedes its prediction.
        test_df['index'] = test_df['index'] - pd.Series(np.abs(np.random.randn(n_samples))) * shift
        test_df['index2'] = test_df['index'] + pd.Series(np.abs(np.random.randn(n_samples))) * shift
    else:
        test_df['index2'] = test_df['index'] + shift
    # Sort the data frame by prediction time
    test_df = test_df.sort_values('index')
    X = test_df[['A', 'B', 'C']]
    pred_times = test_df['index']
    exit_times = test_df['index2']
    return X, pred_times, exit_times


def prepare_cv_object(cv: BaseTimeSeriesCrossValidator, n_samples: int, time_shift: str, randomlize_times: bool):
    """
    Generate a sample set with create_random_sample_set and attach it to the cross-validation object cv.

    The features, prediction times and evaluation times are stored in cv.X, cv.pred_times and cv.eval_times, and
    cv.indices is set to the positions 0, ..., n_rows - 1 of the generated samples. Nothing is returned.
    """
    sample_set = create_random_sample_set(n_samples=n_samples, time_shift=time_shift,
                                          randomize_times=randomlize_times)
    # The sample set comes back as (features, prediction times, evaluation times), in that order.
    for attribute, value in zip(('X', 'pred_times', 'eval_times'), sample_set):
        setattr(cv, attribute, value)
    n_rows = sample_set[0].shape[0]
    cv.indices = np.arange(n_rows)


def prepare_time_inhomogeneous_cv_object(cv: BaseTimeSeriesCrossValidator):
    """
    Attach to cv a sample set whose samples are not evenly spread in time.

    The set merges two groups that both start at the same time: 11 samples at 2h intervals (spanning 20h) and 10
    samples at 59m intervals. Every evaluation time is one hour after the corresponding prediction time. The merged
    samples are sorted by prediction time and renumbered from 0.

    pred_times and eval_times have the following values:
                pred_times          eval_times
    0  2000-01-01 00:00:00 2000-01-01 01:00:00
    1  2000-01-01 00:00:00 2000-01-01 01:00:00
    2  2000-01-01 00:59:00 2000-01-01 01:59:00
    3  2000-01-01 01:58:00 2000-01-01 02:58:00
    4  2000-01-01 02:00:00 2000-01-01 03:00:00
    5  2000-01-01 02:57:00 2000-01-01 03:57:00
    6  2000-01-01 03:56:00 2000-01-01 04:56:00
    7  2000-01-01 04:00:00 2000-01-01 05:00:00
    8  2000-01-01 04:55:00 2000-01-01 05:55:00
    9  2000-01-01 05:54:00 2000-01-01 06:54:00
    10 2000-01-01 06:00:00 2000-01-01 07:00:00
    11 2000-01-01 06:53:00 2000-01-01 07:53:00
    12 2000-01-01 07:52:00 2000-01-01 08:52:00
    13 2000-01-01 08:00:00 2000-01-01 09:00:00
    14 2000-01-01 08:51:00 2000-01-01 09:51:00
    15 2000-01-01 10:00:00 2000-01-01 11:00:00
    16 2000-01-01 12:00:00 2000-01-01 13:00:00
    17 2000-01-01 14:00:00 2000-01-01 15:00:00
    18 2000-01-01 16:00:00 2000-01-01 17:00:00
    19 2000-01-01 18:00:00 2000-01-01 19:00:00
    20 2000-01-01 20:00:00 2000-01-01 21:00:00
    """
    # The sparse (2h) group is generated first, then the dense (59m) group.
    sample_groups = (
        create_random_sample_set(n_samples=11, time_shift='1H', freq='2H'),
        create_random_sample_set(n_samples=10, time_shift='1H', freq='59T'),
    )
    # One frame per group, with the columns laid out as: 3 features, prediction time, evaluation time.
    group_frames = [pd.concat(list(group), axis=1) for group in sample_groups]
    merged = pd.concat(group_frames, axis=0, ignore_index=True)
    # Column 3 holds the prediction times.
    merged = merged.sort_values(by=merged.columns[3]).reset_index(drop=True)

    cv.X = merged.iloc[:, :3]
    cv.pred_times = merged.iloc[:, 3]
    cv.eval_times = merged.iloc[:, 4]
    cv.indices = np.arange(merged.shape[0])


class TestPurgedWalkForwardCV(TestCase):
    def test_split(self):
        """
        Apply split to the sample described in the docstring of prepare_time_inhomogeneous_cv_object with n_splits = 5.
        Inspection shows that the pairs test-train sets should respectively be
        1. Train: [0 : 12], test: [13 : 16] (Sample 12 purged from the train set.)
        2. Train: [0 : 16], test: [16 : 18]
        3. Train: [0 : 18], test: [18 : 21]

        The test also fails if split yields fewer pairs than listed above, so that it cannot pass without having
        compared anything.
        """
        cv = PurgedWalkForwardCV(n_splits=5)
        prepare_time_inhomogeneous_cv_object(cv)
        # Expected (train, test) indices, in the order in which split is expected to yield them.
        expected_pairs = [
            (np.arange(12), np.arange(13, 16)),
            (np.arange(16), np.arange(16, 18)),
            (np.arange(18), np.arange(18, 21)),
        ]
        count = 0
        for train_set, test_set in cv.split(cv.X, pred_times=cv.pred_times, eval_times=cv.eval_times,
                                            split_by_time=True):
            # Pairs yielded beyond the documented ones are not checked.
            if count < len(expected_pairs):
                expected_train, expected_test = expected_pairs[count]
                # assert_array_equal reports the mismatching values, unlike assertTrue(np.array_equal(...)).
                np.testing.assert_array_equal(expected_train, train_set)
                np.testing.assert_array_equal(expected_test, test_set)
            count += 1
        self.assertGreaterEqual(count, len(expected_pairs))


class TestCombPurgedKFoldCV(TestCase):

    def test_split(self):
        """
        Apply split to the sample described in the docstring of prepare_time_inhomogeneous_cv_object, with n_splits = 4
        and n_test_splits = 2. The folds are [0 : 6], [6 : 11], [11 : 16], [16 : 21]. We use an embargo of zero.
        Inspection shows that the pairs test-train sets should respectively be
        [...]
        3. Train: folds 1 and 4, samples [0, 1, 2, 3, 4, 16, 17, 18, 19, 20]. Test: folds 2 and 3, samples [6, 7, 8, 9,
         10, 11, 12, 13, 14, 15]. Sample 5 is purged from the train set.
        4. Train: folds 2 and 3, samples [7, 8, 9, 10, 11, 12, 13, 14, 15]. Test: folds 1 and 4, samples [0, 1, 2, 3, 4,
         5, 16, 17, 18, 19, 20]. Sample 6 is embargoed.
        [...]

        Only the third and fourth pairs are checked. The test fails if split yields fewer than four pairs, so that it
        cannot pass without having compared anything.
        """
        cv = CombPurgedKFoldCV(n_splits=4, n_test_splits=2)
        prepare_time_inhomogeneous_cv_object(cv)
        # Expected (train, test) indices, keyed by the 1-based position of the pair in the output of split.
        expected_pairs = {
            3: (np.array([0, 1, 2, 3, 4, 16, 17, 18, 19, 20]),
                np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15])),
            4: (np.array([7, 8, 9, 10, 11, 12, 13, 14, 15]),
                np.array([0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20])),
        }
        count = 0
        for train_set, test_set in cv.split(cv.X, pred_times=cv.pred_times, eval_times=cv.eval_times):
            count += 1
            if count in expected_pairs:
                expected_train, expected_test = expected_pairs[count]
                # assert_array_equal reports the mismatching values, unlike assertTrue(np.array_equal(...)).
                np.testing.assert_array_equal(expected_train, train_set)
                np.testing.assert_array_equal(expected_test, test_set)
        self.assertGreaterEqual(count, max(expected_pairs))

    def test_compute_test_set(self):
        """
        We consider a sample set of size 10 with test folds [2:4], [4:6] and [8:10]. The function should return the
        aggregated bounds [2:6], [8:10], as well as the corresponding test indices.
        """
        fold_bound_list = [(2, 4), (4, 6), (8, 10)]
        expected_bounds = [(2, 6), (8, 10)]
        expected_indices = np.array([2, 3, 4, 5, 8, 9])

        cv = CombPurgedKFoldCV(n_splits=5)
        prepare_cv_object(cv, n_samples=10, time_shift='120m', randomlize_times=False)
        agg_fold_bound_list, test_indices = cv.compute_test_set(fold_bound_list)
        self.assertEqual(expected_bounds, agg_fold_bound_list)
        np.testing.assert_array_equal(expected_indices, test_indices)


class TestComputeFoldBounds(TestCase):
    def test_by_samples(self):
        """
        Use a 10 sample set, with 5 folds. The fold left bounds are at 0, 2, 4, 6, and 8.
        """
        cv = PurgedWalkForwardCV(n_splits=5)
        prepare_cv_object(cv, n_samples=10, time_shift='120m', randomlize_times=False)
        result = [0, 2, 4, 6, 8]
        self.assertEqual(result, compute_fold_bounds(cv, False))

    def test_by_time(self):
        """
        Create a sample set as described in the docstring of prepare_time_inhomogeneous_cv_object. Inspection shows
        that the fold left bounds are at 0, 7, 13, 16, 18.

        The bounds are computed once and their first five entries are compared with the expected values.
        """
        cv = PurgedWalkForwardCV(n_splits=5)
        prepare_time_inhomogeneous_cv_object(cv)
        result = [0, 7, 13, 16, 18]
        # Computed once rather than once per compared element.
        fold_bounds = np.asarray(compute_fold_bounds(cv, True))
        # Fails with the mismatching values, or with a shape mismatch if fewer than five bounds are returned.
        np.testing.assert_array_equal(result, fold_bounds[:len(result)])


class TestPurge(TestCase):

    def test_traintest(self):
        """
        Work on a set of 2n hourly samples split in two folds: the train fold [0 : n] comes first and the test fold
        starts at sample n + 1.
        With a fixed 119m window between the prediction and the evaluation times, purge is expected to return the
        train indices [0 : n] unchanged.
        With a 120m window, purge is expected to return [0 : n - 1], i.e. sample n - 1 is removed from the train set.
        """
        n = 6
        first_test_sample, test_stop = n + 1, 2 * n
        cv = BaseTimeSeriesCrossValidator(n_splits=2)

        # Each case is (window between prediction and evaluation times, number of train samples expected to remain).
        for window, n_remaining in (('119m', n), ('120m', n - 1)):
            prepare_cv_object(cv, n_samples=2 * n, time_shift=window, randomlize_times=False)
            candidates = cv.indices[:n]
            expected = cv.indices[0:n_remaining]
            purged = purge(cv, candidates, first_test_sample, test_stop)
            self.assertTrue(np.array_equal(expected, purged))

    def test_testtrain(self):
        """
        Work on a similar set, but with the test fold [0 : n] preceding the train fold, which starts at n. No sample
        should be purged.
        """
        n = 6
        cv = BaseTimeSeriesCrossValidator(n_splits=2)
        prepare_cv_object(cv, n_samples=2 * n, time_shift='120m', randomlize_times=False)

        candidates = cv.indices[n:]
        expected = cv.indices[n:]
        purged = purge(cv, candidates, 0, n)
        self.assertTrue(np.array_equal(expected, purged))


class TestEmbargo(TestCase):

    def test_zero_embargo(self):
        """
        Work on a set of 2n hourly samples split in two folds: the test fold [0 : n] comes first and the train fold
        starts at sample n. The embargo delay is zero.
        With a fixed 119m window between the prediction and the evaluation times, sample n is expected to be
        embargoed.
        With a 120m window, samples n and n + 1 are expected to be embargoed.
        """
        n = 6
        cv = CombPurgedKFoldCV(n_splits=2, n_test_splits=1)
        cv.embargo_td = pd.Timedelta(minutes=0)

        # Each case is (window between prediction and evaluation times, first train sample expected to remain).
        for window, first_remaining in (('119m', n + 1), ('120m', n + 2)):
            prepare_cv_object(cv, n_samples=2 * n, time_shift=window, randomlize_times=False)
            candidates = cv.indices[n:]
            test_fold = cv.indices[:n]
            expected = cv.indices[first_remaining:]
            remaining = embargo(cv, candidates, test_fold, n)
            self.assertTrue(np.array_equal(expected, remaining))

    def test_nonzero_embargo(self):
        """
        Same setting with an embargo delay of 2h. Two more samples are expected to be embargoed in each case.
        """
        n = 6
        cv = CombPurgedKFoldCV(n_splits=2, n_test_splits=1)
        cv.embargo_td = pd.Timedelta(minutes=120)

        # Each case is (window between prediction and evaluation times, first train sample expected to remain).
        for window, first_remaining in (('119m', n + 3), ('120m', n + 4)):
            prepare_cv_object(cv, n_samples=2 * n, time_shift=window, randomlize_times=False)
            candidates = cv.indices[n:]
            test_fold = cv.indices[:n]
            expected = cv.indices[first_remaining:]
            remaining = embargo(cv, candidates, test_fold, n)
            self.assertTrue(np.array_equal(expected, remaining))


# Run every test case of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()