TimeSeriesSplitter(
df,
forecast_len=1,
incremental_len=None,
n_splits=None,
min_train_len=None,
window_type="expanding",
date_col=None,
)
Cross validation splitter for time series data
Parameters
df : pd.DataFrame
DataFrame object containing time index, response, and other features
forecast_len : int
forecast length; default as 1
incremental_len : int
the number of observations between each successive backtest period; default as forecast_len
n_splits : int; default None
number of splits; when n_splits is specified, min_train_len will be ignored
min_train_len : int
the minimum number of observations required for the training period
window_type : {'expanding', 'rolling'}; default 'expanding'
split scheme
date_col : str
optional; name of the date column. Note that splitting still uses the discrete (positional) index
as the splitting scheme, while date_col is used for better visualization only
Attributes
_split_scheme : dict{split_meta}
meta data of ways to split train and test set
Source code in mlpforecast/evaluation/backtester.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def __init__(
    self,
    df,
    forecast_len=1,
    incremental_len=None,
    n_splits=None,
    min_train_len=None,
    window_type="expanding",
    date_col=None,
):
    """Initialize the splitter with a DataFrame and build the split scheme.

    The input frame is copied, the arguments are stored on the instance, and
    then defaults are filled in, parameters are validated and the split
    scheme is generated by the ``_set_defaults``, ``_validate_params`` and
    ``_set_split_scheme`` helpers (in that order).

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame object containing time index, response, and other features
    forecast_len : int
        forecast length; default as 1
    incremental_len : int
        the number of observations between each successive backtest period;
        default as forecast_len
    n_splits : int; default None
        number of splits; when n_splits is specified, min_train_len will be
        ignored
    min_train_len : int
        the minimum number of observations required for the training period
    window_type : {'expanding', 'rolling'}; default 'expanding'
        split scheme
    date_col : str
        optional name of the date column; note that splitting still uses the
        discrete index as the splitting scheme while `date_col` is used for
        better visualization only

    Attributes
    ----------
    _split_scheme : dict{split_meta}
        meta data of ways to split train and test set
    """
    # Work on a copy so later operations never touch the caller's frame.
    self.df = df.copy()
    self.min_train_len = min_train_len
    self.incremental_len = incremental_len
    self.forecast_len = forecast_len
    self.n_splits = n_splits
    self.window_type = window_type
    self.date_col = date_col
    self.dt_array = None
    if date_col is not None:
        # Unique + sorted dates: the column may repeat a date when there are
        # multiple observations per time point.
        self.dt_array = pd.to_datetime(np.sort(self.df[date_col].unique()))

    self._set_defaults()
    self._validate_params()

    # Meta data describing each split; filled in by _set_split_scheme().
    self._split_scheme = {}
    self._set_split_scheme()
|
split
Returns
iterables with (train_df, test_df, scheme, split_key) where
train_df : pd.DataFrame
data split for training
test_df : pd.DataFrame
data split for testing/validation
scheme : dict
derived from self._split_scheme
split_key : int
index of the iteration
Source code in mlpforecast/evaluation/backtester.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def split(self):
    """Yield the train/test frames for every entry of the split scheme.

    When ``self.date_col`` is None, rows are selected positionally from the
    scheme's ``"train_idx"`` / ``"test_idx"`` entries. Otherwise rows are
    selected by comparing the ``date_col`` column against the inclusive
    ``(start, end)`` bounds stored in the scheme's ``"train_period"`` /
    ``"test_period"`` entries.

    Yields
    ------
    train_df : pd.DataFrame
        data split for training (index reset to 0..n-1)
    test_df : pd.DataFrame
        data split for testing/validation (index reset to 0..n-1)
    scheme : dict
        derived from self._split_scheme
    split_key : int
        index of the iteration
    """
    by_position = self.date_col is None

    for split_key, scheme in self._split_scheme.items():
        if by_position:
            train_df = self.df.iloc[scheme["train_idx"], :]
            test_df = self.df.iloc[scheme["test_idx"], :]
        else:
            dates = self.df[self.date_col]
            train_start, train_end = (
                scheme["train_period"][0],
                scheme["train_period"][1],
            )
            test_start, test_end = (
                scheme["test_period"][0],
                scheme["test_period"][1],
            )
            # Series.between is inclusive on both ends by default, i.e.
            # (dates >= start) & (dates <= end).
            train_df = self.df.loc[dates.between(train_start, train_end), :]
            test_df = self.df.loc[dates.between(test_start, test_end), :]

        yield (
            train_df.reset_index(drop=True),
            test_df.reset_index(drop=True),
            scheme,
            split_key,
        )
|