
This part of the project documentation focuses on an information-oriented approach. Use it as a reference for the technical implementation of the mlpForecaster project code.

TimeSeriesSplitter

TimeSeriesSplitter(
    df,
    forecast_len=1,
    incremental_len=None,
    n_splits=None,
    min_train_len=None,
    window_type="expanding",
    date_col=None,
)

Cross-validation splitter for time series data.

Parameters

df : pd.DataFrame
    DataFrame object containing time index, response, and other features
forecast_len : int
    forecast length; default is 1
incremental_len : int
    the number of observations between each successive backtest period; default is forecast_len
n_splits : int; default None
    number of splits; when n_splits is specified, min_train_len will be ignored
min_train_len : int
    the minimum number of observations required for the training period
window_type : {'expanding', 'rolling'}; default 'expanding'
    split scheme
date_col : str
    optional for the user to provide a date column; note that it still uses the discrete index
    as the splitting scheme while date_col is used for better visualization only

Attributes

_split_scheme : dict{split_meta}
    meta data of ways to split train and test set

Source code in mlpforecast/evaluation/backtester.py
def __init__(
    self,
    df,
    forecast_len=1,
    incremental_len=None,
    n_splits=None,
    min_train_len=None,
    window_type="expanding",
    date_col=None,
):
    """Initializes object with DataFrame and splits data

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame object containing time index, response, and other features
    forecast_len : int
        forecast length; default is 1
    incremental_len : int
        the number of observations between each successive backtest period; default is forecast_len
    n_splits : int; default None
        number of splits; when n_splits is specified, min_train_len will be ignored
    min_train_len : int
        the minimum number of observations required for the training period
    window_type : {'expanding', 'rolling'}; default 'expanding'
        split scheme
    date_col : str
        optional for the user to provide a date column; note that it still uses the discrete index
        as the splitting scheme while `date_col` is used for better visualization only

    Attributes
    ----------
    _split_scheme : dict{split_meta}
        meta data of ways to split train and test set
    """

    self.df = df.copy()
    self.min_train_len = min_train_len
    self.incremental_len = incremental_len
    self.forecast_len = forecast_len
    self.n_splits = n_splits
    self.window_type = window_type
    self.date_col = None
    self.dt_array = None

    if date_col is not None:
        self.date_col = date_col
        # support cases for multiple observations
        self.dt_array = pd.to_datetime(np.sort(self.df[self.date_col].unique()))

    self._set_defaults()

    # validate
    self._validate_params()

    # init meta data of how to split
    self._split_scheme = {}

    # timeseries cross validation split
    self._set_split_scheme()
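
A minimal usage sketch, assuming the import path implied by the module shown above (mlpforecast/evaluation/backtester.py); the DataFrame, its column names, and the parameter values are purely illustrative:

import numpy as np
import pandas as pd

from mlpforecast.evaluation.backtester import TimeSeriesSplitter

# Hypothetical hourly series used only for illustration.
df = pd.DataFrame(
    {
        "timestamp": pd.date_range("2024-01-01", periods=240, freq="h"),
        "load": np.random.default_rng(0).normal(size=240),
    }
)

# Expanding-window backtest: each successive split keeps the same training
# start, extends the training end by `incremental_len` observations, and
# forecasts the next `forecast_len` observations.
splitter = TimeSeriesSplitter(
    df,
    forecast_len=24,
    incremental_len=24,
    min_train_len=96,
    window_type="expanding",
    date_col="timestamp",  # used for better visualization of periods only
)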

split

split()

Returns

iterables with (train_df, test_df, scheme, split_key) where
train_df : pd.DataFrame
    data split for training
test_df : pd.DataFrame
    data split for testing/validation
scheme : dict
    derived from self._split_scheme
split_key : int
    index of the iteration

Source code in mlpforecast/evaluation/backtester.py
def split(self):
    """
    Returns
    -------
    iterables with (train_df, test_df, scheme, split_key) where
    train_df : pd.DataFrame
        data split for training
    test_df : pd.DataFrame
        data split for testing/validation
    scheme : dict
        derived from self._split_scheme
    split_key : int
         index of the iteration
    """
    if self.date_col is None:
        for split_key, scheme in self._split_scheme.items():
            train_df = self.df.iloc[scheme["train_idx"], :].reset_index(drop=True)
            test_df = self.df.iloc[scheme["test_idx"], :].reset_index(drop=True)
            yield train_df, test_df, scheme, split_key
    else:
        for split_key, scheme in self._split_scheme.items():
            train_df = self.df.loc[
                (self.df[self.date_col] >= scheme["train_period"][0])
                & (self.df[self.date_col] <= scheme["train_period"][1]),
                :,
            ].reset_index(drop=True)
            test_df = self.df.loc[
                (self.df[self.date_col] >= scheme["test_period"][0])
                & (self.df[self.date_col] <= scheme["test_period"][1]),
                :,
            ].reset_index(drop=True)
            yield train_df, test_df, scheme, split_key
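
A minimal sketch of consuming the generator, continuing the hypothetical `splitter` from the earlier example:

# Iterate over the backtest folds; each iteration yields a train/test pair
# together with the split metadata and its index.
for train_df, test_df, scheme, split_key in splitter.split():
    print(
        f"split {split_key}: "
        f"train={len(train_df)} rows, test={len(test_df)} rows"
    )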