ML Point Forecasting with Multiple Models#



What you’ll build

A multi-model comparison of CatBoost, XGBoost, LightGBM, and Linear Regression, all trained on the MLVS-PT net-load dataset through a single shared data pipeline and ranked in a styled metrics table so you can identify the best model for your problem.

Prerequisites

  • 01 - Getting Started (DataPipelineConfig, ForecasterConfig, TwigaForecaster.fit)

  • 03 - Feature Engineering (understanding what features go into the model)

  • 04 - Time Series Differencing (stationarity checks before modelling)

  • Python: list comprehensions, basic sklearn familiarity

Learning objectives

By the end of this notebook you will be able to:

  1. Configure and train multiple ML models (CatBoost, XGBoost, LightGBM, Linear Regression) using a single shared DataPipelineConfig

  2. Explain the difference between gradient-boosted trees, linear regression, and ensemble strategies

  3. Use Twiga’s model registry (get_model) to look up and instantiate model classes by name

  4. Compare models using a formatted metrics table and correctly interpret MAE, RMSE, Correlation, and SMAPE

  5. Decide when a mean ensemble outperforms individual models and when it does not

1. Setup#

import warnings

from great_tables import GT
from IPython.display import clear_output
from lets_plot import LetsPlot
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler

LetsPlot.setup_html()

from twiga import TwigaForecaster
from twiga.core.config import DataPipelineConfig, ForecasterConfig
from twiga.core.plot import (
    plot_acf,
    plot_density,
    plot_forecast,
    plot_forecast_grid,
    plot_metrics_bar,
    plot_timeseries,
)
from twiga.core.plot.gt import twiga_report
from twiga.core.utils import configure, get_logger

warnings.filterwarnings("ignore")

configure()
log = get_logger("tutorials")

Load data#

The dataset covers Madeira, Portugal (32.37°N, 16.27°W) at 30-minute resolution. We load only the columns we need: timestamp, net load (target), and two exogenous variables.

data = pd.read_parquet("../data/MLVS-PT.parquet")
data = data[["timestamp", "NetLoad(kW)", "Ghi", "Temperature"]]
data["timestamp"] = pd.to_datetime(data["timestamp"])
data = data.drop_duplicates(subset="timestamp").reset_index(drop=True)
# Restrict to 2019-2020 to keep tutorial execution fast
# Use an exclusive upper bound: "<= 2020-12-31" would keep only midnight of 31 Dec
data = data[(data["timestamp"] >= "2019-01-01") & (data["timestamp"] < "2021-01-01")].reset_index(drop=True)

log.info("Shape: %s", data.shape)
GT(data.head())

Train / val / test splits#

We use the same fixed temporal split as in all other tutorials. The table below summarises the three periods.

from great_tables import GT, md

from twiga.core.plot.gt import twiga_gt

splits_df = pd.DataFrame(
    {
        "Split": ["train", "val", "test"],
        "Period": ["before 2020-01-01", "2020-01-01 – 2020-06-30", "2020-07-01 onwards"],
        "Purpose": [
            "Model fitting — all historical data the model learns from",
            "Early stopping — used to prevent overfitting during training",
            "Evaluation — held-out period never seen during training",
        ],
    }
)

twiga_gt(
    GT(splits_df)
    .tab_header(
        title=md("**Temporal Train / Val / Test Splits**"),
        subtitle="MLVS-PT dataset — 30-minute resolution",
    )
    .cols_label(
        Split=md("**Split**"),
        Period=md("**Period**"),
        Purpose=md("**Purpose**"),
    )
    .tab_source_note("Twiga Forecast"),
    n_rows=len(splits_df),
)
train_df = data[data["timestamp"] < "2020-01-01"].reset_index(drop=True)
val_df = data[(data["timestamp"] >= "2020-01-01") & (data["timestamp"] < "2020-07-01")].reset_index(drop=True)
test_df = data[data["timestamp"] >= "2020-07-01"].reset_index(drop=True)

log.info(
    f"train : {train_df.shape[0]:,} rows  ({train_df['timestamp'].min().date()} → {train_df['timestamp'].max().date()})"
)
log.info(
    f"val   : {val_df.shape[0]:,} rows  ({val_df['timestamp'].min().date()} → {val_df['timestamp'].max().date()})"
)
log.info(
    f"test  : {test_df.shape[0]:,} rows  ({test_df['timestamp'].min().date()} → {test_df['timestamp'].max().date()})"
)

2. Shared data config#

All ML models in this notebook share the same DataPipelineConfig and ForecasterConfig. We define them once here and reuse them throughout.

Key choices:

  • forecast_horizon=48: predict the next 24 hours (48 × 30 min)

  • lookback_window_size=96: use the previous 48 hours of history as input

  • calendar_features: hour-of-day and a day/night indicator capture diurnal patterns

  • exogenous_features: global horizontal irradiance (Ghi) improves solar-driven load forecasts

data_config = DataPipelineConfig(
    target_feature="NetLoad(kW)",
    period="30min",
    latitude=32.371666,
    longitude=-16.274998,
    calendar_features=["hour", "day_night"],
    exogenous_features=["Ghi"],
    forecast_horizon=48,
    lookback_window_size=96,
    input_scaler=StandardScaler(),
    target_scaler=RobustScaler(),
)

train_config = ForecasterConfig(
    split_freq="months",
    train_size=3,
    test_size=1,
)

data_config

3. Linear Regression#

Key concept - Linear regression as a baseline

A linear model predicts the target as a weighted sum of input features. It has no hyperparameters to tune, trains in milliseconds, and always converges. Its primary role here is to set a performance floor: if a more complex model cannot beat linear regression, something is wrong with the features or the training setup - not the model choice.

Use it first. Beat it second.
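The "weighted sum of input features" idea in miniature - a self-contained numpy sketch with toy data, unrelated to the Twiga pipeline:

```python
import numpy as np

# Design matrix: a bias column plus one feature; the target follows y = 1 + 2 * feature
X = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
y = np.array([1.0, 3.0, 5.0, 7.0])

# Least squares finds the weight vector w minimising ||X @ w - y||^2
w, *_ = np.linalg.lstsq(X, y, rcond=None)
# w recovers intercept ~1.0 and slope ~2.0; predictions are just X @ w
print(np.round(w, 6))
```

Everything the linear baseline can express lives in that `X @ w` product; any structure the features do not encode linearly is invisible to it, which is exactly what the tree models below exploit.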

The cell below establishes this baseline and introduces the pattern used for every model in this notebook: build a model config, pass it to TwigaForecaster together with the shared data and train configs, fit on train/val, and evaluate on the held-out test set.

from twiga.models.ml import LINEAREGConfig

linear_config = LINEAREGConfig()

forecaster_linear = TwigaForecaster(
    data_params=data_config,
    model_params=[linear_config],
    train_params=train_config,
)
forecaster_linear.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("Linear Regression training complete.")
pred_linear, metric_linear = forecaster_linear.evaluate_point_forecast(test_df=test_df)
clear_output()

from great_tables import GT, md
import pandas as pd

from twiga.core.plot.gt import twiga_gt

m = metric_linear[["mae", "rmse", "corr"]].mean().round(3)
summary_df = pd.DataFrame({"Metric": m.index.str.upper(), "Value": m.values})
twiga_gt(
    GT(summary_df)
    .tab_header(title=md("**Linear Regression — Mean Metrics**"), subtitle="Averaged across evaluation folds")
    .cols_label(Metric=md("**Metric**"), Value=md("**Value**"))
    .tab_source_note("Lower is better for MAE/RMSE · Higher is better for Corr"),
    n_rows=len(summary_df),
)

Interpretation - Linear Regression establishes our baseline. An MAE around 3.4 kW on a target that ranges from ~10 to ~150 kW means roughly 2–5% relative error against typical load levels. Any model that scores worse than this baseline is worse than a simple weighted sum - check your features if that happens.

4. LightGBM#

Key concept - Gradient boosting

Gradient boosting builds an ensemble of shallow decision trees sequentially: each new tree corrects the residual errors left by all previous trees. The result is a model that can capture complex non-linear interactions between features (e.g., solar angle × hour-of-day) that a linear model cannot represent. LightGBM is a particularly fast implementation that uses histogram-based splitting and leaf-wise growth, making it 5 - 10× faster than classic GBM on large tabular datasets.
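The residual-fitting loop at the heart of gradient boosting fits in a few lines. The sketch below is purely illustrative - it uses one-split "stumps" in place of real trees and toy data, and is not how LightGBM is implemented:

```python
import numpy as np

def fit_stump(x, r):
    """Fit the best single-threshold 'tree' to residuals r (least squares)."""
    best = None
    for t in np.unique(x)[:-1]:
        left, right = r[x <= t].mean(), r[x > t].mean()
        sse = ((r - np.where(x <= t, left, right)) ** 2).sum()
        if best is None or sse < best[0]:
            best = (sse, t, left, right)
    _, t, left, right = best
    return lambda z: np.where(z <= t, left, right)

def boost(x, y, n_rounds=50, lr=0.1):
    """Sequentially add shrunken stumps, each fit to the current residuals."""
    pred = np.full_like(y, y.mean())
    for _ in range(n_rounds):
        residual = y - pred              # what the ensemble still gets wrong
        stump = fit_stump(x, residual)
        pred = pred + lr * stump(x)      # nudge predictions toward the target
    return pred

rng = np.random.default_rng(0)
x = rng.uniform(0, 6, 200)
y = np.sin(x) + 0.1 * rng.normal(size=200)   # non-linear target
pred = boost(x, y)
print(f"baseline MSE: {np.mean((y - y.mean()) ** 2):.3f}, boosted MSE: {np.mean((y - pred) ** 2):.3f}")
```

Each round shrinks the residual a little; the learning rate trades number of rounds for stability, which is why boosted models pair naturally with early stopping on a validation set.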

LightGBM is a gradient-boosted tree model that is fast to train and often achieves strong accuracy on tabular time-series problems. It supports early stopping via the validation set.

from twiga.models.ml import LIGHTGBMConfig

lg_config = LIGHTGBMConfig()

forecaster_lg = TwigaForecaster(
    data_params=data_config,
    model_params=[lg_config],
    train_params=train_config,
)
forecaster_lg.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("LightGBM training complete.")
pred_lg, metric_lg = forecaster_lg.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_lg[["mae", "rmse", "corr"]].mean().round(3)
summary_df = pd.DataFrame({"Metric": m.index.str.upper(), "Value": m.values})
twiga_gt(
    GT(summary_df)
    .tab_header(title=md("**LightGBM — Mean Metrics**"), subtitle="Averaged across evaluation folds")
    .cols_label(Metric=md("**Metric**"), Value=md("**Value**"))
    .tab_source_note("Lower is better for MAE/RMSE · Higher is better for Corr"),
    n_rows=len(summary_df),
)

Interpretation - Compare LightGBM’s MAE against Linear Regression. A meaningful improvement confirms that non-linear feature interactions exist in this dataset (solar irradiance × time-of-day is a prime example). If LightGBM barely beats linear, check whether your lookback_window_size captures enough seasonality.

5. XGBoost#

Key concept - XGBoost vs LightGBM

XGBoost and LightGBM are both gradient-boosted tree libraries but differ in their splitting strategy: XGBoost splits level-by-level (breadth-first) while LightGBM splits leaf-by-leaf (depth-first). In practice, LightGBM tends to be faster on large datasets while XGBoost can generalise better on smaller ones. Running both and comparing is the right approach - the winner is dataset-dependent.

XGBoost is another gradient-boosted tree implementation, similar to LightGBM. We set device="cpu" for reproducibility; switch to "cuda" if a GPU is available.

from twiga.models.ml import XGBOOSTConfig

xg_config = XGBOOSTConfig(device="cpu")

forecaster_xg = TwigaForecaster(
    data_params=data_config,
    model_params=[xg_config],
    train_params=train_config,
)
forecaster_xg.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("XGBoost training complete.")
pred_xg, metric_xg = forecaster_xg.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_xg[["mae", "rmse", "corr"]].mean().round(3)
summary_df = pd.DataFrame({"Metric": m.index.str.upper(), "Value": m.values})
twiga_gt(
    GT(summary_df)
    .tab_header(title=md("**XGBoost — Mean Metrics**"), subtitle="Averaged across evaluation folds")
    .cols_label(Metric=md("**Metric**"), Value=md("**Value**"))
    .tab_source_note("Lower is better for MAE/RMSE · Higher is better for Corr"),
    n_rows=len(summary_df),
)

Interpretation - XGBoost and LightGBM should produce similar MAE on this dataset. A large gap (> 0.5 kW) between them is a signal to tune hyperparameters (Tutorial 10) rather than concluding one library is fundamentally better.
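6. CatBoost#

Key concept - CatBoost

CatBoost rounds out the gradient-boosted trio. Its distinguishing ideas are ordered boosting (each residual is estimated by a model that never saw that sample, which reduces target leakage) and symmetric, oblivious trees that make inference very fast. On purely numeric time-series features its accuracy usually lands in the same range as LightGBM and XGBoost, so the honest approach is to train it and compare rather than assume a winner.

The cell below mirrors the LightGBM and XGBoost sections. Note: the config class name CATBOOSTConfig is an assumption inferred from the LIGHTGBMConfig / XGBOOSTConfig naming pattern - check twiga.models.ml in your installed version if the import fails.

```python
from twiga.models.ml import CATBOOSTConfig  # name inferred from the LIGHTGBMConfig / XGBOOSTConfig pattern

cb_config = CATBOOSTConfig()

forecaster_cb = TwigaForecaster(
    data_params=data_config,
    model_params=[cb_config],
    train_params=train_config,
)
forecaster_cb.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("CatBoost training complete.")
pred_cb, metric_cb = forecaster_cb.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_cb[["mae", "rmse", "corr"]].mean().round(3)
summary_df = pd.DataFrame({"Metric": m.index.str.upper(), "Value": m.values})
twiga_gt(
    GT(summary_df)
    .tab_header(title=md("**CatBoost — Mean Metrics**"), subtitle="Averaged across evaluation folds")
    .cols_label(Metric=md("**Metric**"), Value=md("**Value**"))
    .tab_source_note("Lower is better for MAE/RMSE · Higher is better for Corr"),
    n_rows=len(summary_df),
)
```

Interpretation - Expect CatBoost to land close to LightGBM and XGBoost on this dataset. The multi-model comparison in the next section combines the linear and boosted-tree forecasters defined earlier; add cb_config to its model_params list to include CatBoost in the ensemble as well.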

7. All models together: multi-model forecaster#

Key concept - model registry

Twiga maintains an internal registry of all supported models. You can look up any model class and its default config by name using get_model("lightgbm", domain="ml"), which returns (model_cls, config_cls). This lets you iterate over model names programmatically - useful when running automated comparisons or hyperparameter sweeps without hard-coding import paths. The registry enforces the domain separation: "ml" models are serialised with pickle, "nn" models with a PyTorch checkpoint.
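To make the lookup concrete, here is a minimal, self-contained sketch of the registry pattern - an illustration only, with hypothetical stand-in classes, not Twiga's actual implementation:

```python
# Hypothetical stand-ins for model and config classes
class LinearModel: ...
class LinearConfig: ...
class LGBMModel: ...
class LGBMConfig: ...

# The registry maps (domain, name) -> (model_cls, config_cls)
_REGISTRY = {
    ("ml", "linear"): (LinearModel, LinearConfig),
    ("ml", "lightgbm"): (LGBMModel, LGBMConfig),
}

def get_model(name, domain="ml"):
    """Look up a model class and its config class by name."""
    try:
        return _REGISTRY[(domain, name)]
    except KeyError:
        known = sorted(n for d, n in _REGISTRY if d == domain)
        raise ValueError(f"Unknown model {name!r} in domain {domain!r}; known: {known}")

# Iterate over names programmatically instead of hard-coding import paths
for name in ["linear", "lightgbm"]:
    model_cls, config_cls = get_model(name)
    print(name, "->", model_cls.__name__, config_cls.__name__)
```

The payoff of the pattern is the loop at the bottom: a comparison or sweep becomes a list of strings rather than a list of imports.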

Key concept - model comparison methodology

Comparing models fairly requires holding everything else constant: same data splits, same feature engineering, same evaluation metric, same test period. Twiga enforces this by design - all models in a single TwigaForecaster share the same DataPipelineConfig and ForecasterConfig. The only variable is the model itself, so any difference in metrics is attributable to the model alone.

Key concept - ensemble forecasting

An ensemble combines predictions from multiple models to reduce variance. The ensemble_strategy="mean" option averages predictions across all registered models at evaluation time. Ensembles almost always improve over the weakest constituent model and often beat the best individual model, particularly when the models make different types of errors. The cost is interpretability and inference time.

Passing multiple model configs to TwigaForecaster trains them all in a single call. Setting ensemble_strategy="mean" in evaluate_point_forecast also returns an ensemble forecast alongside the per-model predictions, which are all stored in pred_all and metric_all.
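Before running the full pipeline, the variance-reduction effect of a mean ensemble is easy to demonstrate on synthetic data (a toy sketch, unrelated to the MLVS-PT models):

```python
import numpy as np

rng = np.random.default_rng(42)
actual = np.sin(np.linspace(0, 12, 300))

# Three hypothetical models whose errors are independent draws of noise
preds = [actual + rng.normal(0.0, 0.3, 300) for _ in range(3)]
ensemble = np.mean(preds, axis=0)   # the "mean" ensemble strategy

def mae(p):
    return float(np.mean(np.abs(actual - p)))

for i, p in enumerate(preds, 1):
    print(f"model {i} MAE: {mae(p):.3f}")
print(f"ensemble MAE: {mae(ensemble):.3f}")   # lower: independent errors partly cancel
```

When the errors are independent, averaging k models shrinks the error standard deviation by roughly √k; when the models make highly correlated errors, the gain shrinks toward zero - which is why ensembling dissimilar model families pays off.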

forecaster_all = TwigaForecaster(
    data_params=data_config,
    model_params=[linear_config, lg_config, xg_config],
    train_params=train_config,
)
forecaster_all.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("All models training complete.")
pred_all, metric_all = forecaster_all.evaluate_point_forecast(
    test_df=test_df,
    ensemble_strategy="mean",
)
clear_output()

models_evaluated = metric_all["Model"].unique().tolist()
info_df = pd.DataFrame({"Models evaluated": models_evaluated})
twiga_gt(
    GT(info_df)
    .tab_header(title=md("**Multi-model evaluation complete**"), subtitle="All models + ensemble ready for comparison")
    .cols_label(**{"Models evaluated": md("**Models evaluated**")})
    .tab_source_note("Twiga Forecast"),
    n_rows=len(info_df),
)

8. Results table#

We aggregate per-fold metrics across all models and render a formatted comparison table. Lower is better for MAE, SMAPE, and RMSE; higher is better for Corr.

The guide below explains each metric before you read the comparison table.

from great_tables import GT, md

from twiga.core.plot.gt import twiga_gt

metrics_guide = pd.DataFrame(
    {
        "Metric": ["MAE", "RMSE", "Corr", "SMAPE"],
        "What it measures": [
            "Mean absolute error (same units as target)",
            "Root-mean-squared error (penalises large spikes)",
            "Pearson correlation between forecast and actual",
            "Symmetric mean absolute percentage error",
        ],
        "Direction": ["Lower is better", "Lower is better", "Higher is better", "Lower is better"],
        "Rule of thumb": [
            "Directly interpretable in kW",
            "If RMSE >> MAE, outlier errors dominate",
            "> 0.95 excellent · > 0.90 good",
            "< 5% excellent · < 10% good",
        ],
    }
)

twiga_gt(
    GT(metrics_guide)
    .tab_header(
        title=md("**Point Forecast Metric Guide**"),
        subtitle="How to interpret the comparison table below",
    )
    .cols_label(
        Metric=md("**Metric**"),
        **{"What it measures": md("**What it measures**")},
        Direction=md("**Direction**"),
        **{"Rule of thumb": md("**Rule of thumb**")},
    )
    .tab_source_note("Twiga Forecast"),
    n_rows=len(metrics_guide),
)
res = metric_all.groupby("Model")[["mae", "corr", "nbias", "rmse", "wmape", "smape"]].mean().round(2).reset_index()
res = res.rename(
    columns={
        "mae": "MAE",
        "corr": "Corr",
        "wmape": "WMAPE",
        "smape": "SMAPE",
        "nbias": "NBIAS",
        "rmse": "RMSE",
    }
)

twiga_report(
    res,
    ["MAE", "Corr", "SMAPE", "RMSE"],
    ["MAE", "SMAPE", "RMSE"],
    ["Corr"],
)

Interpretation - Look at the teal-highlighted cells: they mark the best value per column. If the ENSEMBLE row is highlighted across all columns, the ensemble is strictly better - a reliable outcome when individual models make uncorrelated errors. If one model beats the ensemble on MAE, consider dropping the weakest model from the ensemble. A difference of < 0.1 kW MAE is within noise; focus on models that are consistently better across multiple metrics.
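As a sanity check on the table above, the headline metrics can be computed by hand. This self-contained numpy sketch uses one common SMAPE definition and is not Twiga's metrics code:

```python
import numpy as np

def point_metrics(actual, forecast):
    """MAE, RMSE, Pearson correlation, and SMAPE for a point forecast."""
    err = forecast - actual
    return {
        "mae": float(np.mean(np.abs(err))),
        "rmse": float(np.sqrt(np.mean(err ** 2))),
        "corr": float(np.corrcoef(actual, forecast)[0, 1]),
        # SMAPE conventions vary; this one averages 2|e| / (|a| + |f|)
        "smape": float(100 * np.mean(2 * np.abs(err) / (np.abs(actual) + np.abs(forecast)))),
    }

actual = np.array([10.0, 12.0, 15.0, 11.0])     # toy values, in kW
forecast = np.array([11.0, 12.5, 14.0, 10.5])
m = point_metrics(actual, forecast)
print({k: round(v, 3) for k, v in m.items()})
```

Note how RMSE (~0.79) only slightly exceeds MAE (0.75) here because all four errors are similar in size; a test period with a few large spikes would push RMSE well above MAE, which is exactly the "RMSE >> MAE" rule of thumb in the metric guide.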

9. Visualise best model predictions#

We plot the first 7 days (7 × 48 = 336 steps) of the test set for every trained model in a side-by-side grid, so their behaviour can be compared over the same period.

p = plot_forecast_grid(
    pred_all,
    actual_col="Actual",
    forecast_col="forecast",
    model_col="Model",
    n_samples_per_model=7 * 48,
    y_label="Net Load (kW)",
    title="Point forecasts — first 7 days of test set",
)
p

Wrapping up#

What you did

  • Loaded the MLVS-PT dataset and created chronological train/val/test splits

  • Defined a single shared DataPipelineConfig reused across all models

  • Trained a Linear Regression baseline and established a performance floor

  • Trained LightGBM and XGBoost gradient-boosted tree models

  • Combined all models into one multi-model TwigaForecaster with ensemble_strategy="mean"

  • Compared models using MAE, RMSE, Correlation, and SMAPE in a styled metrics table

  • Visualised actuals vs. forecasts for all models side-by-side

Key takeaways

  1. Always train a linear baseline first - it is free and sets the quality bar.

  2. Gradient-boosted trees (LightGBM, XGBoost) capture non-linear feature interactions that linear models cannot.

  3. A shared DataPipelineConfig guarantees a fair comparison - only the model varies.

  4. Ensembles reduce prediction variance and often beat individual models at the cost of interpretability.

  5. Small metric differences (< 0.1 kW MAE) are noise; prioritise models that win consistently across multiple metrics.


What’s next?#

06 - Backtesting & Evaluation

A single train/test split gives one performance number that depends heavily on which period you happened to pick. NB06 shows you how to run multi-fold time-based cross-validation with forecaster.backtesting(), analyse per-fold metric variance, and choose between expanding and rolling window strategies.

# ruff: noqa: E501, E701, E702
from IPython.display import HTML

_TEAL = "#107591"
_TEAL_MID = "#069fac"
_TEAL_LIGHT = "#e8f5f8"
_TEAL_BEST = "#d0ecf1"
_TEXT_DARK = "#2d3748"
_TEXT_MUTED = "#718096"
_WHITE = "#ffffff"

steps = [
    {
        "num": "01",
        "title": "Getting Started",
        "desc": "Load data · configure pipeline · train LightGBM · evaluate",
        "tags": ["data", "config", "train"],
        "active": False,
    },
    {
        "num": "04",
        "title": "Time Series Differencing",
        "desc": "Stationarity · first-order differencing · inversion",
        "tags": ["differencing", "stationarity"],
        "active": False,
    },
    {
        "num": "05",
        "title": "ML Point Forecasting",
        "desc": "CatBoost · XGBoost · LightGBM · model comparison",
        "tags": ["catboost", "xgboost", "lightgbm"],
        "active": True,
    },
    {
        "num": "06",
        "title": "Backtesting & Evaluation",
        "desc": "Rolling-window backtesting · fold-level metrics",
        "tags": ["backtesting", "evaluation", "metrics"],
        "active": False,
    },
    {
        "num": "08",
        "title": "Quantile Regression",
        "desc": "First probabilistic step — prediction intervals",
        "tags": ["probabilistic", "quantile", "intervals"],
        "active": False,
    },
]


def _badge(t, bg, fg):
    return f'<span style="display:inline-block;background:{bg};color:{fg};font-size:10px;font-weight:600;padding:2px 7px;border-radius:10px;letter-spacing:.3px;margin:2px 2px 0 0;">{t}</span>'


cards_html = ""
for i, s in enumerate(steps):
    a = s["active"]
    cb = _TEAL if a else _WHITE
    cbo = _TEAL if a else "#d1ecf1"
    nb = _TEAL_MID if a else _TEAL_LIGHT
    nf = _WHITE if a else _TEAL
    tf = _WHITE if a else _TEXT_DARK
    df2 = "#cce8ef" if a else _TEXT_MUTED
    bb = "#0d5f75" if a else _TEAL_BEST
    bf = "#b8e4ed" if a else _TEAL
    yh = (
        f'<span style="float:right;background:{_TEAL_MID};color:{_WHITE};font-size:10px;font-weight:700;padding:2px 10px;border-radius:12px;">★ you are here</span>'
        if a
        else ""
    )
    badges = "".join(_badge(t, bb, bf) for t in s["tags"])
    cards_html += f'<div style="background:{cb};border:2px solid {cbo};border-radius:12px;padding:16px 20px;display:flex;align-items:flex-start;gap:16px;box-shadow:{"0 4px 14px rgba(16,117,145,.25)" if a else "0 1px 4px rgba(0,0,0,.06)"};"><div style="min-width:44px;height:44px;background:{nb};color:{nf};border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:15px;font-weight:800;flex-shrink:0;">{s["num"]}</div><div style="flex:1;"><div style="font-size:15px;font-weight:700;color:{tf};margin-bottom:4px;">{s["title"]}{yh}</div><div style="font-size:12.5px;color:{df2};margin-bottom:8px;line-height:1.5;">{s["desc"]}</div><div>{badges}</div></div></div>'
    if i < len(steps) - 1:
        cards_html += f'<div style="display:flex;justify-content:center;height:32px;"><svg width="24" height="32" viewBox="0 0 24 32" fill="none"><line x1="12" y1="0" x2="12" y2="24" stroke="{_TEAL_MID}" stroke-width="2" stroke-dasharray="4 3"/><polygon points="6,20 18,20 12,30" fill="{_TEAL_MID}"/></svg></div>'

html = f'<div style="font-family:Inter,\'Segoe UI\',sans-serif;max-width:640px;margin:8px 0;"><div style="background:linear-gradient(135deg,{_TEAL} 0%,{_TEAL_MID} 100%);border-radius:12px 12px 0 0;padding:14px 20px;display:flex;align-items:center;gap:10px;"><svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="{_WHITE}" stroke-width="2"><path d="M12 2L2 7l10 5 10-5-10-5z"/><path d="M2 17l10 5 10-5"/><path d="M2 12l10 5 10-5"/></svg><span style="color:{_WHITE};font-size:14px;font-weight:700;">Twiga Learning Path — Point Forecasting Track</span></div><div style="border:2px solid {_TEAL_LIGHT};border-top:none;border-radius:0 0 12px 12px;padding:20px 20px 16px;background:#f9fdfe;display:flex;flex-direction:column;">{cards_html}<div style="margin-top:16px;font-size:11.5px;color:{_TEXT_MUTED};text-align:center;border-top:1px solid {_TEAL_LIGHT};padding-top:12px;">Next: explore <span style="color:{_TEAL};font-weight:600;">probabilistic forecasting</span> (08–10) or <span style="color:{_TEAL};font-weight:600;">hyperparameter tuning</span> (11).</div></div></div>'
HTML(html)