ML Point Forecasting with Multiple Models#
What you’ll build
A multi-model comparison of CatBoost, XGBoost, LightGBM, and Linear Regression all trained on the MLVS-PT net-load dataset, ranked by a styled metrics table so you can identify the best model for your problem.
Prerequisites
01 - Getting Started (DataPipelineConfig, ForecasterConfig, TwigaForecaster.fit)
03 - Feature Engineering (understanding what features go into the model)
04 - Time Series Differencing (stationarity checks before modelling)
Python: list comprehensions, basic sklearn familiarity
Learning objectives
By the end of this notebook you will be able to:
Configure and train multiple ML models (CatBoost, XGBoost, LightGBM, Linear Regression) using a single shared
DataPipelineConfig
Explain the difference between gradient-boosted trees, linear regression, and ensemble strategies
Use Twiga’s model registry (
get_model) to look up and instantiate model classes by name
Compare models using a formatted metrics table and correctly interpret MAE, RMSE, Correlation, and WMAPE
Decide when a mean ensemble outperforms individual models and when it does not
1. Setup#
# Notebook-wide setup: plotting/table libraries plus Twiga's forecaster,
# configs, plotting helpers, and logging utilities.
import warnings
from great_tables import GT
from IPython.display import clear_output
from lets_plot import LetsPlot
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
# Enable lets-plot rendering inside notebook cells.
LetsPlot.setup_html()
from twiga import TwigaForecaster
from twiga.core.config import DataPipelineConfig, ForecasterConfig
from twiga.core.plot import (
    plot_acf,
    plot_density,
    plot_forecast,
    plot_forecast_grid,
    plot_metrics_bar,
    plot_timeseries,
)
from twiga.core.plot.gt import twiga_report
from twiga.core.utils import configure, get_logger
# Silence third-party warnings to keep tutorial output readable.
warnings.filterwarnings("ignore")
# Apply Twiga's global configuration.
# NOTE(review): exact side effects of configure() are not visible here — confirm.
configure()
log = get_logger("tutorials")
Load data#
The dataset covers Madeira, Portugal (32.37°N, 16.27°W) at 30-minute resolution. We load only the columns we need: timestamp, net load (target), and two exogenous variables.
# Load only the columns we need: timestamp, the net-load target, and two
# exogenous drivers (global horizontal irradiance and temperature).
data = pd.read_parquet("../data/MLVS-PT.parquet")
data = data[["timestamp", "NetLoad(kW)", "Ghi", "Temperature"]]
data["timestamp"] = pd.to_datetime(data["timestamp"])
data = data.drop_duplicates(subset="timestamp").reset_index(drop=True)
# Restrict to 2019-2020 to keep tutorial execution fast.
# Use an exclusive upper bound of 2021-01-01: comparing against the string
# "2020-12-31" resolves to midnight on that day and would silently drop every
# 30-minute sample on 31 December after 00:00.
data = data[(data["timestamp"] >= "2019-01-01") & (data["timestamp"] < "2021-01-01")].reset_index(drop=True)
log.info("Shape: %s", data.shape)
GT(data.head())
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[2], line 1
----> 1 data = pd.read_parquet("../data/MLVS-PT.parquet")
2 data = data[["timestamp", "NetLoad(kW)", "Ghi", "Temperature"]]
3 data["timestamp"] = pd.to_datetime(data["timestamp"])
4 data = data.drop_duplicates(subset="timestamp").reset_index(drop=True)
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:669, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
666 use_nullable_dtypes = False
667 check_dtype_backend(dtype_backend)
--> 669 return impl.read(
670 path,
671 columns=columns,
672 filters=filters,
673 storage_options=storage_options,
674 use_nullable_dtypes=use_nullable_dtypes,
675 dtype_backend=dtype_backend,
676 filesystem=filesystem,
677 **kwargs,
678 )
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:258, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
256 if manager == "array":
257 to_pandas_kwargs["split_blocks"] = True
--> 258 path_or_handle, handles, filesystem = _get_path_or_handle(
259 path,
260 filesystem,
261 storage_options=storage_options,
262 mode="rb",
263 )
264 try:
265 pa_table = self.api.parquet.read_table(
266 path_or_handle,
267 columns=columns,
(...) 270 **kwargs,
271 )
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:141, in _get_path_or_handle(path, fs, storage_options, mode, is_dir)
131 handles = None
132 if (
133 not fs
134 and not is_dir
(...) 139 # fsspec resources can also point to directories
140 # this branch is used for example when reading from non-fsspec URLs
--> 141 handles = get_handle(
142 path_or_handle, mode, is_text=False, storage_options=storage_options
143 )
144 fs = None
145 path_or_handle = handles.handle
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/common.py:882, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
873 handle = open(
874 handle,
875 ioargs.mode,
(...) 878 newline="",
879 )
880 else:
881 # Binary mode
--> 882 handle = open(handle, ioargs.mode)
883 handles.append(handle)
885 # Convert BytesIO or file objects passed with an encoding
FileNotFoundError: [Errno 2] No such file or directory: '../data/MLVS-PT.parquet'
Train / val / test splits#
We use the same fixed temporal split as in all other tutorials. The table below summarises the three periods.
from great_tables import GT, md
from twiga.core.plot.gt import twiga_gt

# Summary of the fixed temporal split shared by every tutorial notebook.
split_rows = [
    ("train", "before 2020-01-01", "Model fitting — all historical data the model learns from"),
    ("val", "2020-01-01 – 2020-06-30", "Early stopping — used to prevent overfitting during training"),
    ("test", "2020-07-01 onwards", "Evaluation — held-out period never seen during training"),
]
splits_df = pd.DataFrame(split_rows, columns=["Split", "Period", "Purpose"])

splits_table = (
    GT(splits_df)
    .tab_header(
        title=md("**Temporal Train / Val / Test Splits**"),
        subtitle="MLVS-PT dataset — 30-minute resolution",
    )
    .cols_label(
        Split=md("**Split**"),
        Period=md("**Period**"),
        Purpose=md("**Purpose**"),
    )
    .tab_source_note("Twiga Forecast")
)
twiga_gt(splits_table, n_rows=len(splits_df))
# Chronological (non-shuffled) split: everything before 2020 for training,
# H1-2020 for validation / early stopping, H2-2020 held out for testing.
train_df = data[data["timestamp"] < "2020-01-01"].reset_index(drop=True)
val_df = data[(data["timestamp"] >= "2020-01-01") & (data["timestamp"] < "2020-07-01")].reset_index(drop=True)
test_df = data[data["timestamp"] >= "2020-07-01"].reset_index(drop=True)
# Log row counts plus the actual date range covered by each split.
log.info(
    f"train : {train_df.shape[0]:,} rows ({train_df['timestamp'].min().date()} → {train_df['timestamp'].max().date()})"
)
log.info(f"val : {val_df.shape[0]:,} rows ({val_df['timestamp'].min().date()} → {val_df['timestamp'].max().date()})")
log.info(
    f"test : {test_df.shape[0]:,} rows ({test_df['timestamp'].min().date()} → {test_df['timestamp'].max().date()})"
)
3. Linear Regression#
Key concept - Linear regression as a baseline
A linear model predicts the target as a weighted sum of input features. It has no hyperparameters to tune, trains in milliseconds, and always converges. Its primary role here is to set a performance floor: if a more complex model cannot beat linear regression, something is wrong with the features or the training setup - not the model choice.
Use it first. Beat it second.
Linear Regression is the simplest baseline. It has no hyperparameters to tune and trains in seconds. Use it to establish a lower bound on expected accuracy before trying more powerful models.
from twiga.models.ml import LINEAREGConfig

# Baseline model: no hyperparameters, trains almost instantly.
linear_config = LINEAREGConfig()
forecaster_linear = TwigaForecaster(
    data_params=data_config,
    model_params=[linear_config],
    train_params=train_config,
)
forecaster_linear.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("Linear Regression training complete.")

pred_linear, metric_linear = forecaster_linear.evaluate_point_forecast(test_df=test_df)
clear_output()

from great_tables import GT, md
import pandas as pd
from twiga.core.plot.gt import twiga_gt

# Average the per-fold metrics and render them as a small summary table.
mean_metrics = metric_linear[["mae", "rmse", "corr"]].mean().round(3)
summary_df = pd.DataFrame({"Metric": mean_metrics.index.str.upper(), "Value": mean_metrics.values})
metrics_table = (
    GT(summary_df)
    .tab_header(title=md("**Linear Regression — Mean Metrics**"), subtitle="Averaged across evaluation folds")
    .cols_label(Metric=md("**Metric**"), Value=md("**Value**"))
    .tab_source_note("Lower is better for MAE/RMSE · Higher is better for Corr")
)
twiga_gt(metrics_table, n_rows=len(summary_df))
Interpretation - Linear Regression establishes our baseline. An MAE around 3.4 kW on a target that ranges from ~10 to ~150 kW means roughly 2 - 5% relative error. Any model below this baseline is worse than a simple weighted sum - check your features if that happens.
4. LightGBM#
Key concept - Gradient boosting
Gradient boosting builds an ensemble of shallow decision trees sequentially: each new tree corrects the residual errors left by all previous trees. The result is a model that can capture complex non-linear interactions between features (e.g., solar angle × hour-of-day) that a linear model cannot represent. LightGBM is a particularly fast implementation that uses histogram-based splitting and leaf-wise growth, making it 5 - 10× faster than classic GBM on large tabular datasets.
LightGBM is a gradient-boosted tree model that is fast to train and often achieves strong accuracy on tabular time-series problems. It supports early stopping via the validation set.
from twiga.models.ml import LIGHTGBMConfig

# Gradient-boosted trees; early stopping is driven by val_df during fit.
lg_config = LIGHTGBMConfig()
forecaster_lg = TwigaForecaster(
    data_params=data_config,
    model_params=[lg_config],
    train_params=train_config,
)
forecaster_lg.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("LightGBM training complete.")

pred_lg, metric_lg = forecaster_lg.evaluate_point_forecast(test_df=test_df)
clear_output()

# Average the per-fold metrics and render them as a small summary table.
mean_metrics = metric_lg[["mae", "rmse", "corr"]].mean().round(3)
summary_df = pd.DataFrame({"Metric": mean_metrics.index.str.upper(), "Value": mean_metrics.values})
metrics_table = (
    GT(summary_df)
    .tab_header(title=md("**LightGBM — Mean Metrics**"), subtitle="Averaged across evaluation folds")
    .cols_label(Metric=md("**Metric**"), Value=md("**Value**"))
    .tab_source_note("Lower is better for MAE/RMSE · Higher is better for Corr")
)
twiga_gt(metrics_table, n_rows=len(summary_df))
Interpretation - Compare LightGBM’s MAE against Linear Regression. A meaningful improvement confirms that non-linear feature interactions exist in this dataset (solar irradiance × time-of-day is a prime example). If LightGBM barely beats linear, check whether your
lookback_window_size captures enough seasonality.
5. XGBoost#
Key concept - XGBoost vs LightGBM
XGBoost and LightGBM are both gradient-boosted tree libraries but differ in their splitting strategy: XGBoost splits level-by-level (breadth-first) while LightGBM splits leaf-by-leaf (depth-first). In practice, LightGBM tends to be faster on large datasets while XGBoost can generalise better on smaller ones. Running both and comparing is the right approach - the winner is dataset-dependent.
XGBoost is another gradient-boosted tree implementation, similar to LightGBM.
We set device="cpu" for reproducibility; switch to "cuda" if a GPU is available.
from twiga.models.ml import XGBOOSTConfig

# device="cpu" keeps runs reproducible; switch to "cuda" when a GPU exists.
xg_config = XGBOOSTConfig(device="cpu")
forecaster_xg = TwigaForecaster(
    data_params=data_config,
    model_params=[xg_config],
    train_params=train_config,
)
forecaster_xg.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("XGBoost training complete.")

pred_xg, metric_xg = forecaster_xg.evaluate_point_forecast(test_df=test_df)
clear_output()

# Average the per-fold metrics and render them as a small summary table.
mean_metrics = metric_xg[["mae", "rmse", "corr"]].mean().round(3)
summary_df = pd.DataFrame({"Metric": mean_metrics.index.str.upper(), "Value": mean_metrics.values})
metrics_table = (
    GT(summary_df)
    .tab_header(title=md("**XGBoost — Mean Metrics**"), subtitle="Averaged across evaluation folds")
    .cols_label(Metric=md("**Metric**"), Value=md("**Value**"))
    .tab_source_note("Lower is better for MAE/RMSE · Higher is better for Corr")
)
twiga_gt(metrics_table, n_rows=len(summary_df))
Interpretation - XGBoost and LightGBM should produce similar MAE on this dataset. A large gap (> 0.5 kW) between them is a signal to tune hyperparameters (Tutorial 10) rather than concluding one library is fundamentally better.
7. All models together: multi-model forecaster#
Key concept - model registry
Twiga maintains an internal registry of all supported models. You can look up any model class and its default config by name using
get_model("lightgbm", domain="ml"), which returns (model_cls, config_cls). This lets you iterate over model names programmatically - useful when running automated comparisons or hyperparameter sweeps without hard-coding import paths. The registry enforces the domain separation: "ml" models are serialised with pickle, "nn" models with a PyTorch checkpoint.
Key concept - model comparison methodology
Comparing models fairly requires holding everything else constant: same data splits, same feature engineering, same evaluation metric, same test period. Twiga enforces this by design - all models in a single
TwigaForecaster share the same DataPipelineConfig and ForecasterConfig. The only variable is the model itself, so any difference in metrics is attributable to the model alone.
Key concept - ensemble forecasting
An ensemble combines predictions from multiple models to reduce variance. The
ensemble_strategy="mean" option averages predictions across all registered models at evaluation time. Ensembles almost always improve over the weakest constituent model and often beat the best individual model, particularly when the models make different types of errors. The cost is interpretability and inference time.
Passing multiple model configs to TwigaForecaster trains them all in a single call.
Setting ensemble_strategy="mean" in evaluate_point_forecast also returns an ensemble forecast
alongside the per-model predictions, which are all stored in pred_all and metric_all.
# One forecaster, three model configs — identical data pipeline for all,
# so any metric difference is attributable to the model alone.
forecaster_all = TwigaForecaster(
    data_params=data_config,
    model_params=[linear_config, lg_config, xg_config],
    train_params=train_config,
)
forecaster_all.fit(train_df=train_df, val_df=val_df)
clear_output()
log.info("All models training complete.")

# ensemble_strategy="mean" adds an averaged-ensemble forecast to the output.
pred_all, metric_all = forecaster_all.evaluate_point_forecast(
    test_df=test_df,
    ensemble_strategy="mean",
)
clear_output()

models_evaluated = metric_all["Model"].unique().tolist()
info_df = pd.DataFrame({"Models evaluated": models_evaluated})
info_table = (
    GT(info_df)
    .tab_header(title=md("**Multi-model evaluation complete**"), subtitle="All models + ensemble ready for comparison")
    .cols_label(**{"Models evaluated": md("**Models evaluated**")})
    .tab_source_note("Twiga Forecast")
)
twiga_gt(info_table, n_rows=len(info_df))
8. Results table#
We aggregate per-fold metrics across all models and render a formatted comparison table. Lower is better for MAE, SMAPE, and RMSE; higher is better for Corr.
The guide below explains each metric before you read the comparison table.
from great_tables import GT, md
from twiga.core.plot.gt import twiga_gt

# Cheat-sheet explaining each metric before the comparison table below.
guide_rows = [
    ("MAE", "Mean absolute error (same units as target)", "Lower is better", "Directly interpretable in kW"),
    ("RMSE", "Root-mean-squared error (penalises large spikes)", "Lower is better", "If RMSE >> MAE, outlier errors dominate"),
    ("Corr", "Pearson correlation between forecast and actual", "Higher is better", "> 0.95 excellent · > 0.90 good"),
    ("SMAPE", "Symmetric mean absolute percentage error", "Lower is better", "< 5% excellent · < 10% good"),
]
metrics_guide = pd.DataFrame(guide_rows, columns=["Metric", "What it measures", "Direction", "Rule of thumb"])

guide_table = (
    GT(metrics_guide)
    .tab_header(
        title=md("**Point Forecast Metric Guide**"),
        subtitle="How to interpret the comparison table below",
    )
    .cols_label(
        Metric=md("**Metric**"),
        **{"What it measures": md("**What it measures**")},
        Direction=md("**Direction**"),
        **{"Rule of thumb": md("**Rule of thumb**")},
    )
    .tab_source_note("Twiga Forecast")
)
twiga_gt(guide_table, n_rows=len(metrics_guide))
# Aggregate per-fold metrics per model, then pretty-print the column names
# ("corr" becomes "Corr"; every other metric is upper-cased).
metric_cols = ["mae", "corr", "nbias", "rmse", "wmape", "smape"]
res = metric_all.groupby("Model")[metric_cols].mean().round(2).reset_index()
res = res.rename(columns={c: ("Corr" if c == "corr" else c.upper()) for c in metric_cols})
twiga_report(
    res,
    ["MAE", "Corr", "SMAPE", "RMSE"],
    ["MAE", "SMAPE", "RMSE"],
    ["Corr"],
)
Interpretation - Look at the teal-highlighted cells: they mark the best value per column. If the ENSEMBLE row is highlighted across all columns, the ensemble is strictly better - a reliable outcome when individual models make uncorrelated errors. If one model beats the ensemble on MAE, consider dropping the weakest model from the ensemble. A difference of < 0.1 kW MAE is within noise; focus on models that are consistently better across multiple metrics.
9. Visualise best model predictions#
We plot the first 7 days (7 × 48 = 336 steps) of the test set for one model.
Adjust model_name to compare any of the trained models.
# One panel per model over the first week of the test period.
forecast_plot = plot_forecast_grid(
    pred_all,
    actual_col="Actual",
    forecast_col="forecast",
    model_col="Model",
    n_samples_per_model=336,  # 7 days × 48 half-hourly steps
    y_label="Net Load (kW)",
    title="Point forecasts — first 7 days of test set",
)
forecast_plot
Wrapping up#
What you did
Loaded the MLVS-PT dataset and created chronological train/val/test splits
Defined a single shared
DataPipelineConfig reused across all models
Trained a Linear Regression baseline and established a performance floor
Trained LightGBM and XGBoost gradient-boosted tree models
Combined all models into one multi-model
TwigaForecaster with ensemble_strategy="mean"
Compared models using MAE, RMSE, Correlation, and SMAPE in a styled metrics table
Visualised actuals vs. forecasts for all models side-by-side
Key takeaways
Always train a linear baseline first - it is free and sets the quality bar.
Gradient-boosted trees (LightGBM, XGBoost) capture non-linear feature interactions that linear models cannot.
A shared
DataPipelineConfig guarantees a fair comparison - only the model varies.
Ensembles reduce prediction variance and often beat individual models at the cost of interpretability.
Small metric differences (< 0.1 kW MAE) are noise; prioritise models that win consistently across multiple metrics.
What’s next?#
06 - Backtesting & Evaluation
A single train/test split gives one performance number that depends heavily on which period you happened to pick. NB06 shows you how to run multi-fold time-based cross-validation with forecaster.backtesting(), analyse per-fold metric variance, and choose between expanding and rolling window strategies.
# ruff: noqa: E501, E701, E702
from IPython.display import HTML
# Teal colour palette for the learning-path widget.
_TEAL = "#107591"  # primary brand teal
_TEAL_MID = "#069fac"  # mid-tone accent (arrows, gradient)
_TEAL_LIGHT = "#e8f5f8"  # light background tint
_TEAL_BEST = "#d0ecf1"  # badge background on inactive cards
_TEXT_DARK = "#2d3748"
_TEXT_MUTED = "#718096"
_WHITE = "#ffffff"
# One entry per card in the learning path; "active" marks this notebook.
steps = [
    {
        "num": "01",
        "title": "Getting Started",
        "desc": "Load data · configure pipeline · train LightGBM · evaluate",
        "tags": ["data", "config", "train"],
        "active": False,
    },
    {
        "num": "04",
        "title": "Time Series Differencing",
        "desc": "Stationarity · first-order differencing · inversion",
        "tags": ["differencing", "stationarity"],
        "active": False,
    },
    {
        "num": "05",
        "title": "ML Point Forecasting",
        "desc": "CatBoost · XGBoost · LightGBM · model comparison",
        "tags": ["catboost", "xgboost", "lightgbm"],
        "active": True,
    },
    {
        "num": "06",
        "title": "Backtesting & Evaluation",
        "desc": "Rolling-window backtesting · fold-level metrics",
        "tags": ["backtesting", "evaluation", "metrics"],
        "active": False,
    },
    {
        "num": "08",
        "title": "Quantile Regression",
        "desc": "First probabilistic step — prediction intervals",
        "tags": ["probabilistic", "quantile", "intervals"],
        "active": False,
    },
]
def _badge(t, bg, fg):
return f'<span style="display:inline-block;background:{bg};color:{fg};font-size:10px;font-weight:600;padding:2px 7px;border-radius:10px;letter-spacing:.3px;margin:2px 2px 0 0;">{t}</span>'
# Build one card per step, with a dashed SVG arrow between consecutive cards.
cards_html = ""
for i, s in enumerate(steps):
    a = s["active"]
    # Colour choices depend on whether this card is the active notebook.
    cb = _TEAL if a else _WHITE  # card background
    cbo = _TEAL if a else "#d1ecf1"  # card border
    nb = _TEAL_MID if a else _TEAL_LIGHT  # number-circle background
    nf = _WHITE if a else _TEAL  # number-circle foreground
    tf = _WHITE if a else _TEXT_DARK  # title text colour
    df2 = "#cce8ef" if a else _TEXT_MUTED  # description text colour
    bb = "#0d5f75" if a else _TEAL_BEST  # badge background
    bf = "#b8e4ed" if a else _TEAL  # badge foreground
    # "you are here" pill, rendered only on the active card.
    yh = (
        f'<span style="float:right;background:{_TEAL_MID};color:{_WHITE};font-size:10px;font-weight:700;padding:2px 10px;border-radius:12px;">★ you are here</span>'
        if a
        else ""
    )
    badges = "".join(_badge(t, bb, bf) for t in s["tags"])
    cards_html += f'<div style="background:{cb};border:2px solid {cbo};border-radius:12px;padding:16px 20px;display:flex;align-items:flex-start;gap:16px;box-shadow:{"0 4px 14px rgba(16,117,145,.25)" if a else "0 1px 4px rgba(0,0,0,.06)"};"><div style="min-width:44px;height:44px;background:{nb};color:{nf};border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:15px;font-weight:800;flex-shrink:0;">{s["num"]}</div><div style="flex:1;"><div style="font-size:15px;font-weight:700;color:{tf};margin-bottom:4px;">{s["title"]}{yh}</div><div style="font-size:12.5px;color:{df2};margin-bottom:8px;line-height:1.5;">{s["desc"]}</div><div>{badges}</div></div></div>'
    if i < len(steps) - 1:
        # Dashed connector arrow between this card and the next.
        cards_html += f'<div style="display:flex;justify-content:center;height:32px;"><svg width="24" height="32" viewBox="0 0 24 32" fill="none"><line x1="12" y1="0" x2="12" y2="24" stroke="{_TEAL_MID}" stroke-width="2" stroke-dasharray="4 3"/><polygon points="6,20 18,20 12,30" fill="{_TEAL_MID}"/></svg></div>'
# Wrap the cards in a gradient header banner and a footer note, then render.
html = f'<div style="font-family:Inter,\'Segoe UI\',sans-serif;max-width:640px;margin:8px 0;"><div style="background:linear-gradient(135deg,{_TEAL} 0%,{_TEAL_MID} 100%);border-radius:12px 12px 0 0;padding:14px 20px;display:flex;align-items:center;gap:10px;"><svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="{_WHITE}" stroke-width="2"><path d="M12 2L2 7l10 5 10-5-10-5z"/><path d="M2 17l10 5 10-5"/><path d="M2 12l10 5 10-5"/></svg><span style="color:{_WHITE};font-size:14px;font-weight:700;">Twiga Learning Path — Point Forecasting Track</span></div><div style="border:2px solid {_TEAL_LIGHT};border-top:none;border-radius:0 0 12px 12px;padding:20px 20px 16px;background:#f9fdfe;display:flex;flex-direction:column;">{cards_html}<div style="margin-top:16px;font-size:11.5px;color:{_TEXT_MUTED};text-align:center;border-top:1px solid {_TEAL_LIGHT};padding-top:12px;">Next: explore <span style="color:{_TEAL};font-weight:600;">probabilistic forecasting</span> (08–10) or <span style="color:{_TEAL};font-weight:600;">hyperparameter tuning</span> (11).</div></div></div>'
HTML(html)