
This part of the project documentation focuses on an information-oriented approach. Use it as a reference for the technical implementation of the mlpForecaster project code.

TimeSeriesSplitter

TimeSeriesSplitter(
    df,
    forecast_len=1,
    incremental_len=None,
    n_splits=None,
    min_train_len=None,
    window_type="expanding",
    date_col=None,
)

Cross-validation splitter for time series data.

Parameters

df : pd.DataFrame
    DataFrame object containing time index, response, and other features
forecast_len : int
    forecast length; default is 1
incremental_len : int
    the number of observations between each successive backtest period; default is forecast_len
n_splits : int; default None
    number of splits; when n_splits is specified, min_train_len will be ignored
min_train_len : int
    the minimum number of observations required for the training period
window_type : {'expanding', 'rolling'}; default 'expanding'
    split scheme
date_col : str
    optional for the user to provide a date column; note that it still uses the discrete index
    as the splitting scheme while date_col is used for better visualization only

Attributes

_split_scheme : dict{split_meta}
    meta data of ways to split train and test set

Source code in mlpforecast/evaluation/backtester.py
def __init__(
    self,
    df,
    forecast_len=1,
    incremental_len=None,
    n_splits=None,
    min_train_len=None,
    window_type="expanding",
    date_col=None,
):
    """Initializes object with DataFrame and splits data

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame object containing time index, response, and other features
    forecast_len : int
        forecast length; default is 1
    incremental_len : int
        the number of observations between each successive backtest period; default is forecast_len
    n_splits : int; default None
        number of splits; when n_splits is specified, min_train_len will be ignored
    min_train_len : int
        the minimum number of observations required for the training period
    window_type : {'expanding', 'rolling'}; default 'expanding'
        split scheme
    date_col : str
        optional for the user to provide a date column; note that it still uses the discrete index
        as the splitting scheme while `date_col` is used for better visualization only

    Attributes
    ----------
    _split_scheme : dict{split_meta}
        meta data of ways to split train and test set
    """

    self.df = df.copy()
    self.min_train_len = min_train_len
    self.incremental_len = incremental_len
    self.forecast_len = forecast_len
    self.n_splits = n_splits
    self.window_type = window_type
    self.date_col = None
    self.dt_array = None

    if date_col is not None:
        self.date_col = date_col
        # support cases for multiple observations
        self.dt_array = pd.to_datetime(np.sort(self.df[self.date_col].unique()))

    self._set_defaults()

    # validate
    self._validate_params()

    # init meta data of how to split
    self._split_scheme = {}

    # timeseries cross validation split
    self._set_split_scheme()
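
A minimal usage sketch, assuming the import path implied by the module shown above (mlpforecast/evaluation/backtester.py); the DataFrame, its column names, and the parameter values are purely illustrative:

import numpy as np
import pandas as pd

from mlpforecast.evaluation.backtester import TimeSeriesSplitter

# Hypothetical hourly series used only for illustration.
df = pd.DataFrame(
    {
        "timestamp": pd.date_range("2024-01-01", periods=240, freq="h"),
        "load": np.random.default_rng(0).normal(size=240),
    }
)

# Expanding-window backtest: each successive split keeps the same training
# start, extends the training end by `incremental_len` observations, and
# forecasts the next `forecast_len` observations.
splitter = TimeSeriesSplitter(
    df,
    forecast_len=24,
    incremental_len=24,
    min_train_len=96,
    window_type="expanding",
    date_col="timestamp",  # used for better visualization of periods only
)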

split

split()

Returns

iterables with (train_df, test_df, scheme, split_key) where
train_df : pd.DataFrame
    data split for training
test_df : pd.DataFrame
    data split for testing/validation
scheme : dict
    derived from self._split_scheme
split_key : int
    index of the iteration

Source code in mlpforecast/evaluation/backtester.py
def split(self):
    """
    Returns
    -------
    iterables with (train_df, test_df, scheme, split_key) where
    train_df : pd.DataFrame
        data split for training
    test_df : pd.DataFrame
        data split for testing/validation
    scheme : dict
        derived from self._split_scheme
    split_key : int
         index of the iteration
    """
    if self.date_col is None:
        for split_key, scheme in self._split_scheme.items():
            train_df = self.df.iloc[scheme["train_idx"], :].reset_index(drop=True)
            test_df = self.df.iloc[scheme["test_idx"], :].reset_index(drop=True)
            yield train_df, test_df, scheme, split_key
    else:
        for split_key, scheme in self._split_scheme.items():
            train_df = self.df.loc[
                (self.df[self.date_col] >= scheme["train_period"][0])
                & (self.df[self.date_col] <= scheme["train_period"][1]),
                :,
            ].reset_index(drop=True)
            test_df = self.df.loc[
                (self.df[self.date_col] >= scheme["test_period"][0])
                & (self.df[self.date_col] <= scheme["test_period"][1]),
                :,
            ].reset_index(drop=True)
            yield train_df, test_df, scheme, split_key
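
A minimal sketch of consuming the generator, continuing the hypothetical `splitter` from the earlier example:

# Iterate over the backtest folds; each iteration yields a train/test pair
# together with the split metadata and its index.
for train_df, test_df, scheme, split_key in splitter.split():
    print(
        f"split {split_key}: "
        f"train={len(train_df)} rows, test={len(test_df)} rows"
    )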