Baseline Benchmarking#

What you’ll build

A complete baseline benchmark comparing Naive, Seasonal Naive, Window Average, Drift, and Context Parrot against LightGBM on the MLVS-PT dataset - with skill scores and forecast traces to diagnose whether a learned model is actually earning its complexity.

Prerequisites

01 - Getting Started
03 - Feature Engineering

Learning objectives

By the end of this notebook you will be able to:

Explain why baselines must come before advanced models - if LightGBM cannot beat Seasonal Naive, the problem is upstream of the model
Configure and run the parameter-free baselines (Naive, Seasonal Naive, Window Average, Drift, and Context Parrot) through TwigaForecaster
Read a benchmark table and identify the strongest reference model
Compute and interpret MAE and RMSE skill scores relative to the best baseline
Visualise forecast traces to see where and how each model fails

import warnings

warnings.filterwarnings("ignore")

from great_tables import GT, md
from IPython.display import clear_output
from lets_plot import LetsPlot
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler

LetsPlot.setup_html()

from twiga import TwigaForecaster
from twiga.core.config import DataPipelineConfig, ForecasterConfig
from twiga.core.plot import (
    plot_forecast_grid,
    plot_metrics_bar,
    plot_timeseries,
)
from twiga.core.plot.gt import twiga_gt, twiga_report
from twiga.core.utils import configure, get_logger
from twiga.models.baseline.context_parrot_model import CONTEXTPARROTConfig
from twiga.models.baseline.drift_model import DRIFTConfig
from twiga.models.baseline.naive_model import NAIVEConfig
from twiga.models.baseline.seasonal_naive_model import SEASONALNAIVEConfig
from twiga.models.baseline.window_average_model import WINDOWAVERAGEConfig
from twiga.models.ml import LIGHTGBMConfig

# Run Twiga's setup hook and create the logger used by every cell below.
# NOTE(review): what configure() actually sets up is not visible in this
# notebook — confirm against twiga.core.utils.
configure()
log = get_logger(name="tutorial-15")

1. Load Data#

We use the MLVS-PT dataset - 30-minute net electrical load readings from a distribution substation in Madeira, Portugal. It exhibits strong daily and weekly seasonality driven by residential consumption patterns, making it a realistic benchmark for energy forecasting.

We keep only the two columns needed for this tutorial: timestamp and NetLoad(kW).

# Load the raw parquet file and keep only the timestamp and target columns.
raw = pd.read_parquet("../data/MLVS-PT.parquet")
df = raw[["timestamp", "NetLoad(kW)"]].copy()
df = df.sort_values("timestamp").reset_index(drop=True)

# Restrict to 2019-2020 to keep tutorial execution fast.
# The upper bound is exclusive on 2021-01-01: a bare date string compares as
# midnight of that day, so `<= "2020-12-31"` would silently drop every reading
# after 00:00 on the final day of the window.
in_window = (df["timestamp"] >= "2019-01-01") & (df["timestamp"] < "2021-01-01")
df = df[in_window].reset_index(drop=True)

log.info("Shape  : %s", df.shape)
log.info("Period : %s  ->  %s", df["timestamp"].min().date(), df["timestamp"].max().date())
log.info(
    "Target : min=%.3f  max=%.3f  mean=%.3f", df["NetLoad(kW)"].min(), df["NetLoad(kW)"].max(), df["NetLoad(kW)"].mean()
)

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[2], line 1
----> 1 raw = pd.read_parquet("../data/MLVS-PT.parquet")
df = raw[["timestamp", "NetLoad(kW)"]].copy()
df = df.sort_values("timestamp").reset_index(drop=True)
# Restrict to 2019-2020 to keep tutorial execution fast

File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:669, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
   use_nullable_dtypes = False
check_dtype_backend(dtype_backend)
--> 669 return impl.read(
   path,
   columns=columns,
   filters=filters,
   storage_options=storage_options,
   use_nullable_dtypes=use_nullable_dtypes,
   dtype_backend=dtype_backend,
   filesystem=filesystem,
   **kwargs,
)

File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:258, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
if manager == "array":
   to_pandas_kwargs["split_blocks"] = True
--> 258 path_or_handle, handles, filesystem = _get_path_or_handle(
   path,
   filesystem,
   storage_options=storage_options,
   mode="rb",
)
try:
   pa_table = self.api.parquet.read_table(
       path_or_handle,
       columns=columns,
   (...)    270         **kwargs,
   )

File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:141, in _get_path_or_handle(path, fs, storage_options, mode, is_dir)
handles = None
if (
   not fs
   and not is_dir
   (...)    139     # fsspec resources can also point to directories
   # this branch is used for example when reading from non-fsspec URLs
--> 141     handles = get_handle(
       path_or_handle, mode, is_text=False, storage_options=storage_options
   )
   fs = None
   path_or_handle = handles.handle

File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/common.py:882, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
       handle = open(
           handle,
           ioargs.mode,
   (...)    878             newline="",
       )
   else:
       # Binary mode
--> 882         handle = open(handle, ioargs.mode)
   handles.append(handle)
# Convert BytesIO or file objects passed with an encoding

FileNotFoundError: [Errno 2] No such file or directory: '../data/MLVS-PT.parquet'

# Preview the first five rows as a styled table.
# Column labels are kept in a dict because "NetLoad(kW)" is not a valid
# Python keyword-argument name.
sample_labels = {
    "timestamp": md("**Timestamp**"),
    "NetLoad(kW)": md("**Net Load (kW)**"),
}

sample_table = GT(df.head()).tab_header(
    title=md("**MLVS-PT — Raw Data Sample**"),
    subtitle="First 5 rows · 30-minute net electrical load",
)
sample_table = sample_table.cols_label(**sample_labels)
sample_table = sample_table.tab_source_note("MLVS-PT dataset · Net electrical load · 30-minute resolution")

twiga_gt(sample_table, n_rows=5)

# Plot the whole (filtered) series; the display options are collected first so
# the call itself stays short.
timeseries_options = {
    "y_cols": ["NetLoad(kW)"],
    "date_col": "timestamp",
    "title": "MLVS-PT — Net Electrical Load (full series)",
    "x_label": "Date",
    "y_label": "Net Load (kW)",
    "n_samples": 3000,
    "fig_size": (820, 280),
}
p = plot_timeseries(df, **timeseries_options)
p

2. Train / Test Split#

For time series you must never shuffle rows before splitting. We hold out the last six months as the test set - these rows are never seen during training or CV fold construction.

Key concept - chronological splits

Shuffling a time series before splitting leaks the future into the training window. A model trained on data from 2021 and evaluated on 2020 appears accurate but fails completely in production. Always split by time.

# Chronological hold-out: everything before the split date trains the models,
# everything from the split date onwards is reserved for testing.
split_date = "2020-07-01"
before_split = df["timestamp"] < split_date
from_split = df["timestamp"] >= split_date

train_df = df[before_split].reset_index(drop=True)
test_df = df[from_split].reset_index(drop=True)

# Report the size and date range of each split.
for message, frame in (
    ("train : %d rows  (%s -> %s)", train_df),
    ("test  : %d rows  (%s -> %s)", test_df),
):
    stamps = frame["timestamp"]
    log.info(message, len(frame), stamps.min().date(), stamps.max().date())

def _approx_months(frame):
    """Return the time span of ``frame["timestamp"]`` as a rounded ``"~N months"`` label."""
    span = frame["timestamp"].max() - frame["timestamp"].min()
    # 30.44 is the mean calendar-month length in days (365.25 / 12).
    return f"~{round(span.days / 30.44)} months"


# Summarise both splits. Durations are derived from the data instead of being
# hard-coded: the previous literal "~29 months" did not match the 2019-01 to
# 2020-06 training window (about 18 months).
split_summary = pd.DataFrame(
    {
        "Split": ["Train", "Test"],
        "Start": [
            str(train_df["timestamp"].min().date()),
            str(test_df["timestamp"].min().date()),
        ],
        "End": [
            str(train_df["timestamp"].max().date()),
            str(test_df["timestamp"].max().date()),
        ],
        "Rows": [f"{len(train_df):,}", f"{len(test_df):,}"],
        "Duration": [_approx_months(train_df), _approx_months(test_df)],
        "Purpose": ["Model learning + CV folds", "Final honest evaluation"],
    }
)

twiga_gt(
    GT(split_summary)
    .tab_header(
        title=md("**Dataset Splits**"),
        subtitle="Chronological — no shuffling, no overlap",
    )
    .cols_label(
        Split=md("**Split**"),
        Start=md("**Start**"),
        End=md("**End**"),
        Rows=md("**Rows**"),
        Duration=md("**Duration**"),
        Purpose=md("**Purpose**"),
    )
    .tab_source_note("MLVS-PT dataset · 30-minute resolution"),
    n_rows=len(split_summary),
)

3. Configure the Data Pipeline#

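DataPipelineConfig describes what to forecast and how to build input features. The ML configuration uses a 96-step (48 h) lookback window and a 48-step (24 h) forecast horizon, matching the setup from Tutorial 01. The baseline configuration keeps the same horizon but extends the lookback to 336 steps (7 days) so that weekly seasonality is visible inside the context window.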

lags and windows are specified in days - Twiga multiplies by n_samples (48 for 30-min data) to convert to steps. So lags=[1, 7] creates lag features at 1 day (48 steps) and 1 week (336 steps) ago.

Key constraint for baseline models

Even parameter-free baselines flow through the same DataPipeline as ML models. The pipeline places raw target values in the leading feature columns so that X[:, t, :num_targets] contains the observed target at time step t. Twiga’s default pipeline satisfies this automatically - target lags are prepended first.

Why two pipeline configs?#

The stride parameter controls the step between consecutive sliding windows:

Config	`stride`	Use case
`data_config`	`1` (default)	ML / NN training - more windows = more training samples
`baseline_data_config`	`forecast_horizon`	Baseline evaluation - non-overlapping windows give independent, honest metrics

With stride=1, consecutive windows overlap by lookback_window_size − 1 steps, producing ~N correlated predictions that overstate statistical confidence in backtesting metrics. Setting stride=forecast_horizon ensures each prediction window is independent.

# Shared feature-engineering settings
_pipeline_kwargs = {
    "target_feature": "NetLoad(kW)",
    "period": "30min",
    "latitude": 32.371666,
    "longitude": -16.274998,
    "lookback_window_size": 96,  # 48 hours of 30-min context
    "forecast_horizon": 48,  # predict the next 24 hours
    "calendar_features": ["hour", "day_night"],
    "exogenous_features": [],
    "input_scaler": StandardScaler(),
    "target_scaler": RobustScaler(),
}

# ML models — stride=1 (fully overlapping, maximises training samples)
data_config = DataPipelineConfig(**_pipeline_kwargs)

# Baseline models — stride=forecast_horizon (non-overlapping, independent forecasts)
# with a 7-day lookback (48 * 7 = 336 steps) so weekly seasonality fits inside
# the context window. The lookback is supplied at construction time rather than
# patched onto the instance afterwards, so the config is built from the value
# it will actually use.
baseline_data_config = DataPipelineConfig(
    **{**_pipeline_kwargs, "lookback_window_size": 48 * 7},
    stride=48,
)

# Cross-validation / training schedule shared by every forecaster below.
# NOTE(review): the exact fold semantics of split_freq / train_size /
# test_size / window are defined by ForecasterConfig — confirm there.
train_config = ForecasterConfig(
    split_freq="months",
    train_size=3,
    test_size=1,
    window="expanding",
    project_name="baseline_benchmark",
    seed=42,
)

log.info("ML config   : stride=%d", data_config.stride)
log.info("Baseline config : stride=%d", baseline_data_config.stride)

4. Baseline Models#

All five models are parameter-free - no gradient descent, no hyperparameter search, and typically under a second per CV fold. Their purpose is to set the minimum bar that any trained model must clear before it earns a place in production.

Rule of thumb - always run Seasonal Naive first

On any series with clear daily or weekly periodicity, Seasonal Naive is the hardest cheap baseline to beat. If LightGBM cannot outperform it on MAE, the problem is in your data or features - not the model architecture.

# One (model, strategy, best-for) record per baseline; the frame is assembled
# row-wise so each model's description reads as a single unit.
baseline_catalogue = [
    (
        "NAIVEModel",
        "Repeat the last observed value for all horizon steps",
        "Strongly autocorrelated, non-seasonal series",
    ),
    (
        "SEASONALNAIVEModel",
        "Repeat the value observed exactly m steps ago (seasonal lag)",
        "Series with clear daily or weekly periodicity",
    ),
    (
        "WINDOWAVERAGEModel",
        "Broadcast the mean of the last window_size observations",
        "Mean-reverting or noisy signals",
    ),
    (
        "DRIFTModel",
        "Extrapolate the linear trend within the input window",
        "Slowly and smoothly trending series",
    ),
    (
        "CONTEXTPARROTModel",
        "1-nearest-neighbour lookup in delay-embedded context space",
        "Chaotic or nonlinearly recurrent signals (e.g. energy, weather)",
    ),
]
baseline_overview = pd.DataFrame(baseline_catalogue, columns=["Model", "Strategy", "Best for"])

# Bold every column header using its own name as the label.
overview_labels = {column: md(f"**{column}**") for column in baseline_overview.columns}

overview_table = GT(baseline_overview).tab_header(
    title=md("**Twiga Baseline Models**"),
    subtitle="Five parameter-free reference models — no training required",
)
overview_table = overview_table.cols_label(**overview_labels).tab_source_note("twiga.models.baseline")

twiga_gt(overview_table, n_rows=len(baseline_overview))

4.1 Naive (window_last)#

The simplest possible forecast: for every test window, repeat the last observed target value across all 48 horizon steps. This is the classical persistence forecast - y-hat_{t+h} = y_t for all h.

# Persistence baseline: repeat the last observed target value over the horizon.
naive_config = NAIVEConfig(strategy="last")

forecaster_naive = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[naive_config],
    train_params=train_config,
)
forecaster_naive.fit(train_df=train_df)
clear_output()

pred_naive, metric_naive = forecaster_naive.evaluate_point_forecast(test_df=test_df)
clear_output()

# Average the metric rows and report them the same way as the other baselines.
# (The summary was previously computed but never logged; the disabled log line
# referenced a "corr" key that this two-column summary does not contain.)
m = metric_naive[["mae", "rmse"]].mean().round(3)
log.info("Naive (window_last) complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

4.2 Seasonal Naive#

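Repeat the value observed exactly one season ago. With period="7D" and freq="30min", the seasonal lag is 336 steps - every 30-min slot s is predicted as the observed value at the same slot one week earlier, which preserves both the daily shape and the weekday/weekend pattern. (A daily season, period="1D", would use a 48-step lag instead.) This is the canonical benchmark for any series with daily and weekly periodicity.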

# Weekly seasonal naive: a 7-day season on 30-minute data.
seasonal_config = SEASONALNAIVEConfig(period="7D", freq="30min")

forecaster_seasonal = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[seasonal_config],
    train_params=train_config,
)
forecaster_seasonal.fit(train_df=train_df)
clear_output()

pred_seasonal, metric_seasonal = forecaster_seasonal.evaluate_point_forecast(test_df=test_df)
clear_output()

# Average the metric rows for a one-line summary.
m = metric_seasonal[["mae", "rmse"]].mean().round(3)
# The label now matches the configured period (it previously said "1D").
log.info("Seasonal Naive (7D) complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

4.3 Window Average#

Broadcast the mean of the last 48 observed steps (one day) as the prediction for all horizon steps. Robust to isolated outliers; performs well on mean-reverting signals where any single recent observation is noisy.

# Average over the last 48 steps (one day of 30-minute readings), as described
# in the text and in the log label below. The previous value, 48 * 8 = 384
# steps, was larger than the 336-step baseline lookback window.
wavg_config = WINDOWAVERAGEConfig(window_size=48)

forecaster_wavg = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[wavg_config],
    train_params=train_config,
)
forecaster_wavg.fit(train_df=train_df)
clear_output()

pred_wavg, metric_wavg = forecaster_wavg.evaluate_point_forecast(test_df=test_df)
clear_output()

# Average the metric rows for a one-line summary.
m = metric_wavg[["mae", "rmse"]].mean().round(3)
log.info("Window Average (48 steps = 24 h) complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

4.4 Drift#

Fit a slope to all observations in the input window and extrapolate it forward. Sensible for slowly trending series; can diverge badly on mean-reverting or oscillating signals.

# Drift baseline with its default configuration.
drift_config = DRIFTConfig()

forecaster_drift = TwigaForecaster(
    train_params=train_config,
    model_params=[drift_config],
    data_params=baseline_data_config,
)
forecaster_drift.fit(train_df=train_df)
clear_output()

# Score on the held-out test frame; the call returns (predictions, metrics).
drift_evaluation = forecaster_drift.evaluate_point_forecast(test_df=test_df)
pred_drift, metric_drift = drift_evaluation
clear_output()

# Collapse the metric rows to their mean, rounded to three decimals.
m = metric_drift.loc[:, ["mae", "rmse"]].mean().round(3)
drift_mae, drift_rmse = m["mae"], m["rmse"]
log.info("Drift complete.")
log.info("  MAE=%.3f  RMSE=%.3f", drift_mae, drift_rmse)

4.5 Context Parrot#

Context parroting is a 1-nearest-neighbour strategy in delay-embedded space, formalised by Zhang & Gilpin (2024) as a deliberate baseline after observing that the Chronos foundation model implicitly exhibits this behaviour on chaotic systems.

Given a lookback window of length L:

The last D target values form the query motif.
Every length-D sub-window earlier in the context is a candidate (the final D positions are excluded).
The candidate with minimum Euclidean distance to the query is selected.
The H values immediately following the best match are returned as the forecast.

The embedding_dim (D) is the Takens delay-embedding dimension - larger values capture more context structure at the cost of requiring a longer lookback. min_seq_len = 2 * embedding_dim + 1 must be satisfied by the pipeline’s lookback_window_size.

Reference: Zhang, Z., & Gilpin, W. (2024). Chaos as an interpretable benchmark for forecasting and data-driven modelling. arXiv:2407.18857.

# embedding_dim=48 → one full day on 30-min data;
# lookback_window_size=336 >> min_seq_len = 2 * 48 + 1 = 97.
# The previous value of 96 is two days on 30-min data, which contradicted both
# this note and the log label below.
parrot_config = CONTEXTPARROTConfig(embedding_dim=48)

forecaster_parrot = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[parrot_config],
    train_params=train_config,
)
forecaster_parrot.fit(train_df=train_df)
clear_output()

pred_parrot, metric_parrot = forecaster_parrot.evaluate_point_forecast(test_df=test_df)
clear_output()

# Average the metric rows for a one-line summary.
m = metric_parrot[["mae", "rmse"]].mean().round(3)
log.info("Context Parrot (embedding_dim=48) complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

5. ML Comparison: LightGBM#

We now run LightGBM through the same pipeline and CV schedule. Default hyperparameters only - the goal is a fair apples-to-apples comparison, not a tuned champion model.

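Note that Ghi (global horizontal irradiance) - which is known for the full horizon, because solar forecasts are available - is not passed in this notebook: the shared pipeline settings use exogenous_features=[] and the loaded frame keeps only timestamp and NetLoad(kW). If you add Ghi as an exogenous feature, remember that the baselines do not use this signal - so any improvement from LightGBM would be partly due to the additional input.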

# LightGBM with default hyperparameters, trained on the stride-1 ML pipeline.
lgb_config = LIGHTGBMConfig()

forecaster_lgb = TwigaForecaster(
    data_params=data_config,
    model_params=[lgb_config],
    train_params=train_config,
)
forecaster_lgb.fit(train_df=train_df)
clear_output()

# Score the fitted model on the held-out test set, exactly as for each
# baseline; without this step the ML model is trained but never measured.
# NOTE(review): data_config uses stride=1, so these test windows overlap,
# unlike the non-overlapping baseline windows.
pred_lgb, metric_lgb = forecaster_lgb.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_lgb[["mae", "rmse"]].mean().round(3)
log.info("LightGBM complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

6. Benchmark Results#

We aggregate metrics across all CV folds and display them side-by-side. Lower is better for MAE, RMSE, SMAPE; higher is better for Corr.

# Reference table for every metric shown in the benchmark report.
# SMAPE is included because the report displays it alongside MAE, RMSE and Corr.
metrics_guide = pd.DataFrame(
    {
        "Metric": ["MAE", "RMSE", "SMAPE", "Corr"],
        "Full name": [
            "Mean Absolute Error",
            "Root Mean Squared Error",
            "Symmetric Mean Absolute Percentage Error",
            "Pearson Correlation",
        ],
        "Direction": ["lower = better", "lower = better", "lower = better", "higher = better"],
        "Interpretation": [
            "Average absolute forecast error in kW — directly interpretable",
            "Penalises large errors more than MAE — sensitive to demand peaks",
            "Scale-free percentage error — comparable across series of different magnitude",
            "Linear association between forecast and actual (0 = random, 1 = perfect)",
        ],
    }
)

twiga_gt(
    GT(metrics_guide)
    .tab_header(
        title=md("**Evaluation Metrics**"),
        subtitle="What each number means and which direction is better",
    )
    .cols_label(**{c: md(f"**{c}**") for c in metrics_guide.columns})
    .tab_source_note("twiga.core.metrics"),
    n_rows=len(metrics_guide),
)

# Stack the metric frames of every baseline evaluated above. Drift is included:
# it was evaluated but previously left out of the benchmark, even though the
# skill-score step lists it among the reference baselines.
metrics_all = pd.concat(
    [metric_naive, metric_seasonal, metric_wavg, metric_drift, metric_parrot],
    ignore_index=True,
)

# Mean of each metric per model; sort=False keeps the models in the order they
# were concatenated.
res = (
    metrics_all.groupby("Model", sort=False)[["mae", "rmse", "corr", "wmape", "smape", "nbias"]]
    .mean()
    .round(4)
    .reset_index()
)
# Display-friendly column names.
res = res.rename(
    columns={
        "mae": "MAE",
        "corr": "Corr",
        "wmape": "WMAPE",
        "smape": "SMAPE",
        "nbias": "NBIAS",
        "rmse": "RMSE",
    }
)

# NOTE(review): the three lists look like (columns to show, lower-is-better
# columns, higher-is-better columns) — confirm against twiga_report's signature.
twiga_report(
    res,
    ["MAE", "Corr", "SMAPE", "RMSE"],
    ["MAE", "SMAPE", "RMSE"],
    ["Corr"],
)

7. Skill Scores#

A skill score measures the percentage improvement of a model over a reference baseline:

\[\text{SS}_{\text{MAE}} = \left(1 - \frac{\text{MAE}_{\text{model}}}{\text{MAE}_{\text{reference}}}\right) \times 100\%\]

Score	Meaning
> 0 %	Model beats the baseline by that margin
= 0 %	Tied - added complexity brings no benefit
< 0 %	Model is worse than the naive reference - investigate data and features

We use the best baseline (lowest MAE across the four parameter-free models) as the reference.

# Models eligible to serve as the reference baseline.
# NOTE(review): these strings must match the values in res["Model"] exactly;
# the Context Parrot model is not listed, so it is scored against the reference
# rather than being a candidate for it — confirm that is intended.
baseline_names = ["NAIVE", "SEASONAL_NAIVE", "WINDOW_AVERAGE", "DRIFT"]

baseline_rows = res[res["Model"].isin(baseline_names)]
if baseline_rows.empty:
    # Fail with an actionable message instead of an opaque idxmin error.
    raise ValueError(
        f"None of the baseline names {baseline_names} were found in res['Model']: {res['Model'].tolist()}"
    )

# The reference is ONE model: the baseline with the lowest MAE. Both reference
# errors are read from that same row, so the RMSE skill score is measured
# against the model named in the table subtitle rather than against whichever
# baseline happens to have the lowest RMSE.
best_row = baseline_rows.loc[baseline_rows["MAE"].idxmin()]
best_name = best_row["Model"]
best_mae = best_row["MAE"]
best_rmse = best_row["RMSE"]

log.info("Best baseline : %s", best_name)
log.info("  Reference MAE  = %.4f", best_mae)
log.info("  Reference RMSE = %.4f", best_rmse)

# Skill score = percentage reduction in error relative to the reference.
skill = res[["Model", "MAE", "RMSE"]].copy()
skill["SS_MAE (%)"] = ((1 - skill["MAE"] / best_mae) * 100).round(1)
skill["SS_RMSE (%)"] = ((1 - skill["RMSE"] / best_rmse) * 100).round(1)

twiga_gt(
    GT(skill)
    .tab_header(
        title=md("**Skill Scores vs Best Baseline**"),
        subtitle=md(f"Reference: **{best_name}** — positive = better than reference"),
    )
    .cols_label(
        Model=md("**Model**"),
        MAE=md("**MAE**"),
        RMSE=md("**RMSE**"),
        **{"SS_MAE (%)": md("**SS MAE (%)**")},
        **{"SS_RMSE (%)": md("**SS RMSE (%)**")},
    )
    .tab_source_note("Skill score = (1 - model_error / reference_error) x 100 %"),
    n_rows=len(skill),
)

# Two-column frame for the bar chart; the metric column is renamed to a plain
# identifier without spaces or symbols.
skill_bar = skill.loc[:, ["Model", "SS_MAE (%)"]].rename(columns={"SS_MAE (%)": "SS_MAE"})

# Higher skill is better, so the chart is told not to treat lower as better.
bar_options = {
    "metric_col": "SS_MAE",
    "model_col": "Model",
    "lower_is_better": False,
    "horizontal": True,
    "title": "MAE Skill Score vs Best Baseline (%)",
    "x_label": "Skill Score (%) — positive = better than reference",
    "fig_size": (620, 340),
}
p = plot_metrics_bar(skill_bar, **bar_options)
p

8. Forecast Visualisation#

Plotting the forecast traces reveals where and how each model fails. Seven days of 30-minute predictions show whether a model tracks the daily cycle, overshoots trend changes, or degrades over the horizon.

# Stack the test-set predictions of every baseline evaluated above, so each
# model's failure mode is visible. Window Average and Drift were previously
# left out of the grid.
preds_all = pd.concat(
    [pred_naive, pred_seasonal, pred_wavg, pred_drift, pred_parrot],
    ignore_index=True,
)

# 7 * 48 samples per model = seven days of 30-minute forecasts.
# The title names only baselines, because no ML predictions are part of
# preds_all.
p = plot_forecast_grid(
    preds_all,
    actual_col="Actual",
    forecast_col="forecast",
    model_col="Model",
    n_samples_per_model=7 * 48,
    y_label="Net Load (kW)",
    title="Baseline forecast traces — first 7 days of test set",
    fig_width=1200,
)
p

p.show()

Wrapping up#

What you did

Loaded and explored the MLVS-PT 30-min dataset
Configured the data pipelines (lookback=96 for ML, lookback=336 for baselines, horizon=48)
Ran the five parameter-free baselines through TwigaForecaster
Ran LightGBM for ML comparison using the same protocol
Built a side-by-side benchmark table and identified the strongest baseline
Computed MAE and RMSE skill scores relative to the best reference
Plotted forecast traces to understand each model’s failure mode

Key takeaways

Always run baselines before ML - they are fast, interpretable, and diagnostic
Seasonal Naive is the hardest cheap baseline to beat on any periodic series
A skill score near 0 % means added complexity provides no value
A negative skill score is a red flag - investigate data and features before tuning
Forecast traces reveal failure modes that aggregate metrics hide

What’s next?#

# ruff: noqa: E501, E701, E702
from IPython.display import HTML

# Colour palette (hex strings) used by the learning-path card HTML below.
_TEAL = "#107591"
_TEAL_MID = "#069fac"
_TEAL_LIGHT = "#e8f5f8"
_TEAL_BEST = "#d0ecf1"
_TEXT_DARK = "#2d3748"
_TEXT_MUTED = "#718096"
_WHITE = "#ffffff"

# One entry per card, rendered top to bottom in list order.
#   num    - label shown inside the card's circle
#   title  - card heading
#   desc   - one-line description under the heading
#   tags   - short strings, each rendered as a badge
#   active - True for the card that gets the "you are here" marker
steps = [
    {
        "num": "01",
        "title": "Getting Started",
        "desc": "Load data · configure pipeline · train LightGBM",
        "tags": ["data", "config", "train"],
        "active": False,
    },
    {
        "num": "03",
        "title": "Feature Engineering",
        "desc": "Lag, rolling-window, and calendar features",
        "tags": ["features", "lags", "calendar"],
        "active": False,
    },
    {
        "num": "15",
        "title": "Baseline Benchmarking",
        "desc": "Naive · SeasonalNaive · WindowAverage · Drift — skill scores vs. ML",
        "tags": ["baseline", "naive", "skill score"],
        "active": True,
    },
    {
        "num": "04",
        "title": "ML Point Forecasting",
        "desc": "CatBoost · XGBoost · LightGBM — multi-model comparison",
        "tags": ["catboost", "xgboost", "lightgbm"],
        "active": False,
    },
    {
        "num": "10",
        "title": "Hyperparameter Tuning",
        "desc": "Optuna-based HPO with resumable SQLite studies",
        "tags": ["optuna", "HPO", "tuning"],
        "active": False,
    },
]
# Track name shown in the header banner of the learning-path widget.
track_name = "Beginner Track"


def _badge(t, bg, fg):
    return f'<span style="display:inline-block;background:{bg};color:{fg};font-size:10px;font-weight:600;padding:2px 7px;border-radius:10px;letter-spacing:.3px;margin:2px 2px 0 0;">{t}</span>'


# Build the learning-path widget: one HTML card per step, with a dashed arrow
# between consecutive cards, wrapped in a banner-topped container.
cards_html = ""
for i, s in enumerate(steps):
    # The active step gets an inverted (teal) colour scheme; every colour below
    # is chosen from that flag.
    a = s["active"]
    cb = _TEAL if a else _WHITE  # card background
    cbo = _TEAL if a else "#d1ecf1"  # card border
    nb = _TEAL_MID if a else _TEAL_LIGHT  # number-circle background
    nf = _WHITE if a else _TEAL  # number-circle text
    tf = _WHITE if a else _TEXT_DARK  # title text
    df2 = "#cce8ef" if a else _TEXT_MUTED  # description text (named df2 so the `df` DataFrame is not overwritten)
    bb = "#0d5f75" if a else _TEAL_BEST  # badge background
    bf = "#b8e4ed" if a else _TEAL  # badge text
    # "you are here" marker, rendered only on the active card.
    yh = (
        f'<span style="float:right;background:{_TEAL_MID};color:{_WHITE};font-size:10px;font-weight:700;padding:2px 10px;border-radius:12px;">\u2605 you are here</span>'
        if a
        else ""
    )
    # One pill per tag, concatenated without a separator.
    badges = "".join(_badge(t, bb, bf) for t in s["tags"])
    # Card markup: numbered circle on the left; title, description and badges on the right.
    cards_html += f'<div style="background:{cb};border:2px solid {cbo};border-radius:12px;padding:16px 20px;display:flex;align-items:flex-start;gap:16px;box-shadow:{"0 4px 14px rgba(16,117,145,.25)" if a else "0 1px 4px rgba(0,0,0,.06)"};"><div style="min-width:44px;height:44px;background:{nb};color:{nf};border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:15px;font-weight:800;flex-shrink:0;">{s["num"]}</div><div style="flex:1;"><div style="font-size:15px;font-weight:700;color:{tf};margin-bottom:4px;">{s["title"]}{yh}</div><div style="font-size:12.5px;color:{df2};margin-bottom:8px;line-height:1.5;">{s["desc"]}</div><div>{badges}</div></div></div>'
    # Connector arrow after every card except the last.
    if i < len(steps) - 1:
        cards_html += f'<div style="display:flex;justify-content:center;height:32px;"><svg width="24" height="32" viewBox="0 0 24 32" fill="none"><line x1="12" y1="0" x2="12" y2="24" stroke="{_TEAL_MID}" stroke-width="2" stroke-dasharray="4 3"/><polygon points="6,20 18,20 12,30" fill="{_TEAL_MID}"/></svg></div>'
# Outer container: gradient banner carrying the track name, then the stacked cards.
html = f'<div style="font-family:Inter,\'Segoe UI\',sans-serif;max-width:640px;margin:8px 0;"><div style="background:linear-gradient(135deg,{_TEAL} 0%,{_TEAL_MID} 100%);border-radius:12px 12px 0 0;padding:14px 20px;display:flex;align-items:center;gap:10px;"><svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="{_WHITE}" stroke-width="2"><path d="M12 2L2 7l10 5 10-5-10-5z"/><path d="M2 17l10 5 10-5"/><path d="M2 12l10 5 10-5"/></svg><span style="color:{_WHITE};font-size:14px;font-weight:700;">Twiga Learning Path — {track_name}</span></div><div style="border:2px solid {_TEAL_LIGHT};border-top:none;border-radius:0 0 12px 12px;padding:20px 20px 16px;background:#f9fdfe;display:flex;flex-direction:column;">{cards_html}</div></div>'
# Last expression of the cell: the notebook displays the rendered HTML.
HTML(html)