Baseline Benchmarking#
What you’ll build
A complete baseline benchmark comparing Naive, Seasonal Naive, Window Average, Drift, and Context Parrot against LightGBM on the MLVS-PT dataset - with skill scores and forecast traces to diagnose whether a learned model is actually earning its complexity.
Prerequisites
01 - Getting Started
03 - Feature Engineering
Learning objectives
By the end of this notebook you will be able to:
Explain why baselines must come before advanced models - if LightGBM cannot beat Seasonal Naive, the problem is upstream of the model
Configure and run all five parameter-free baselines through TwigaForecaster
Read a benchmark table and identify the strongest reference model
Compute and interpret MAE and RMSE skill scores relative to the best baseline
Visualise forecast traces to see where and how each model fails
import warnings
warnings.filterwarnings("ignore")
from great_tables import GT, md
from IPython.display import clear_output
from lets_plot import LetsPlot
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
LetsPlot.setup_html()
from twiga import TwigaForecaster
from twiga.core.config import DataPipelineConfig, ForecasterConfig
from twiga.core.plot import (
plot_forecast_grid,
plot_metrics_bar,
plot_timeseries,
)
from twiga.core.plot.gt import twiga_gt, twiga_report
from twiga.core.utils import configure, get_logger
from twiga.models.baseline.context_parrot_model import CONTEXTPARROTConfig
from twiga.models.baseline.drift_model import DRIFTConfig
from twiga.models.baseline.naive_model import NAIVEConfig
from twiga.models.baseline.seasonal_naive_model import SEASONALNAIVEConfig
from twiga.models.baseline.window_average_model import WINDOWAVERAGEConfig
from twiga.models.ml import LIGHTGBMConfig
# Initialise Twiga's runtime defaults (logging, warnings, plotting hooks live in
# `configure`) and grab a named logger for this notebook's progress messages.
configure()
log = get_logger(name="tutorial-15")
1. Load Data#
We use the MLVS-PT dataset - 30-minute net electrical load readings from a distribution substation in Madeira, Portugal. It exhibits strong daily and weekly seasonality driven by residential consumption patterns, making it a realistic benchmark for energy forecasting.
We keep only the two columns needed for this tutorial: timestamp and NetLoad(kW).
# Load the MLVS-PT parquet file and keep only the two columns this tutorial needs.
raw = pd.read_parquet("../data/MLVS-PT.parquet")
df = raw[["timestamp", "NetLoad(kW)"]].copy()
df = df.sort_values("timestamp").reset_index(drop=True)
# Restrict to 2019-2020 to keep tutorial execution fast.
# Use an exclusive upper bound: `<= "2020-12-31"` compares against midnight of
# Dec 31 and would silently drop every intraday 30-min reading on that day.
df = df[(df["timestamp"] >= "2019-01-01") & (df["timestamp"] < "2021-01-01")].reset_index(drop=True)
log.info("Shape : %s", df.shape)
log.info("Period : %s -> %s", df["timestamp"].min().date(), df["timestamp"].max().date())
log.info(
    "Target : min=%.3f max=%.3f mean=%.3f", df["NetLoad(kW)"].min(), df["NetLoad(kW)"].max(), df["NetLoad(kW)"].mean()
)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[2], line 1
----> 1 raw = pd.read_parquet("../data/MLVS-PT.parquet")
2 df = raw[["timestamp", "NetLoad(kW)"]].copy()
3 df = df.sort_values("timestamp").reset_index(drop=True)
4 # Restrict to 2019-2020 to keep tutorial execution fast
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:669, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
666 use_nullable_dtypes = False
667 check_dtype_backend(dtype_backend)
--> 669 return impl.read(
670 path,
671 columns=columns,
672 filters=filters,
673 storage_options=storage_options,
674 use_nullable_dtypes=use_nullable_dtypes,
675 dtype_backend=dtype_backend,
676 filesystem=filesystem,
677 **kwargs,
678 )
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:258, in PyArrowImpl.read(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)
256 if manager == "array":
257 to_pandas_kwargs["split_blocks"] = True
--> 258 path_or_handle, handles, filesystem = _get_path_or_handle(
259 path,
260 filesystem,
261 storage_options=storage_options,
262 mode="rb",
263 )
264 try:
265 pa_table = self.api.parquet.read_table(
266 path_or_handle,
267 columns=columns,
(...) 270 **kwargs,
271 )
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/parquet.py:141, in _get_path_or_handle(path, fs, storage_options, mode, is_dir)
131 handles = None
132 if (
133 not fs
134 and not is_dir
(...) 139 # fsspec resources can also point to directories
140 # this branch is used for example when reading from non-fsspec URLs
--> 141 handles = get_handle(
142 path_or_handle, mode, is_text=False, storage_options=storage_options
143 )
144 fs = None
145 path_or_handle = handles.handle
File ~/work/twiga-forecast/twiga-forecast/.venv/lib/python3.12/site-packages/pandas/io/common.py:882, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
873 handle = open(
874 handle,
875 ioargs.mode,
(...) 878 newline="",
879 )
880 else:
881 # Binary mode
--> 882 handle = open(handle, ioargs.mode)
883 handles.append(handle)
885 # Convert BytesIO or file objects passed with an encoding
FileNotFoundError: [Errno 2] No such file or directory: '../data/MLVS-PT.parquet'
# Show the first few rows of the raw series as a formatted great_tables preview.
sample_table = (
    GT(df.head())
    .tab_header(
        title=md("**MLVS-PT — Raw Data Sample**"),
        subtitle="First 5 rows · 30-minute net electrical load",
    )
    .cols_label(
        timestamp=md("**Timestamp**"),
        **{"NetLoad(kW)": md("**Net Load (kW)**")},
    )
    .tab_source_note("MLVS-PT dataset · Net electrical load · 30-minute resolution")
)
twiga_gt(sample_table, n_rows=5)
# Full-series overview, downsampled to 3000 points to keep rendering fast.
series_plot_opts = dict(
    y_cols=["NetLoad(kW)"],
    date_col="timestamp",
    title="MLVS-PT — Net Electrical Load (full series)",
    y_label="Net Load (kW)",
    x_label="Date",
    n_samples=3000,
    fig_size=(820, 280),
)
p = plot_timeseries(df, **series_plot_opts)
p
2. Train / Test Split#
For time series you must never shuffle rows before splitting. We hold out the last six months as the test set - these rows are never seen during training or CV fold construction.
Key concept - chronological splits
Shuffling a time series before splitting leaks the future into the training window. A model trained on data from 2021 and evaluated on 2020 appears accurate but fails completely in production. Always split by time.
# Chronological split — the boundary date is the only knob; no shuffling.
SPLIT_DATE = "2020-07-01"
train_df = df[df["timestamp"] < SPLIT_DATE].reset_index(drop=True)
test_df = df[df["timestamp"] >= SPLIT_DATE].reset_index(drop=True)
log.info(
    "train : %d rows (%s -> %s)",
    len(train_df),
    train_df["timestamp"].min().date(),
    train_df["timestamp"].max().date(),
)
log.info(
    "test : %d rows (%s -> %s)",
    len(test_df),
    test_df["timestamp"].min().date(),
    test_df["timestamp"].max().date(),
)
# Human-readable summary of the chronological split.
# With the 2019-2020 restriction above, the training window spans
# 2019-01 → 2020-06, i.e. ~18 months — the previous "~29 months" label
# described an unrestricted history and did not match the data actually used.
split_summary = pd.DataFrame(
    {
        "Split": ["Train", "Test"],
        "Start": [
            str(train_df["timestamp"].min().date()),
            str(test_df["timestamp"].min().date()),
        ],
        "End": [
            str(train_df["timestamp"].max().date()),
            str(test_df["timestamp"].max().date()),
        ],
        "Rows": [f"{len(train_df):,}", f"{len(test_df):,}"],
        "Duration": ["~18 months", "~6 months"],
        "Purpose": ["Model learning + CV folds", "Final honest evaluation"],
    }
)
# Render the split summary as a formatted table.
split_table = (
    GT(split_summary)
    .tab_header(
        title=md("**Dataset Splits**"),
        subtitle="Chronological — no shuffling, no overlap",
    )
    .cols_label(
        Split=md("**Split**"),
        Start=md("**Start**"),
        End=md("**End**"),
        Rows=md("**Rows**"),
        Duration=md("**Duration**"),
        Purpose=md("**Purpose**"),
    )
    .tab_source_note("MLVS-PT dataset · 30-minute resolution")
)
twiga_gt(split_table, n_rows=len(split_summary))
3. Configure the Data Pipeline#
DataPipelineConfig describes what to forecast and how to build input features. We use a 96-step (48 h) lookback window and a 48-step (24 h) forecast horizon, matching the setup from Tutorial 01.
lags and windows are specified in days - Twiga multiplies by n_samples (48 for 30-min data) to convert to steps. So lags=[1, 7] creates lag features at 1 day (48 steps) and 1 week (336 steps) ago.
Key constraint for baseline models
Even parameter-free baselines flow through the same
DataPipelineas ML models. The pipeline places raw target values in the leading feature columns so thatX[:, t, :num_targets]contains the observed target at time step t. Twiga’s default pipeline satisfies this automatically - target lags are prepended first.
Why two pipeline configs?#
The stride parameter controls the step between consecutive sliding windows:
| Config | `stride` | Use case |
|---|---|---|
| `data_config` | `1` | ML / NN training - more windows = more training samples |
| `baseline_data_config` | `48` (= `forecast_horizon`) | Baseline evaluation - non-overlapping windows give independent, honest metrics |
With stride=1, consecutive windows overlap by lookback_window_size − 1 steps, producing ~N correlated predictions that overstate statistical confidence in backtesting metrics. Setting stride=forecast_horizon ensures each prediction window is independent.
# Shared feature-engineering settings used by both pipeline configs below.
_pipeline_kwargs = {
    "target_feature": "NetLoad(kW)",
    "period": "30min",
    "latitude": 32.371666,
    "longitude": -16.274998,
    "lookback_window_size": 96,  # 48 hours of 30-min context
    "forecast_horizon": 48,  # predict the next 24 hours
    "calendar_features": ["hour", "day_night"],
    "exogenous_features": [],
    "input_scaler": StandardScaler(),
    "target_scaler": RobustScaler(),
}
# ML models — stride=1 (fully overlapping, maximises training samples)
data_config = DataPipelineConfig(**_pipeline_kwargs)
# Baseline models — stride=forecast_horizon (non-overlapping, independent forecasts)
baseline_data_config = DataPipelineConfig(
    **_pipeline_kwargs, stride=48
)  # stride = 48 steps = forecast_horizon → each 24 h forecast window is independent
baseline_data_config.lookback_window_size = (
    48 * 7
)  # widen the baseline lookback to 7 days (336 steps) to capture weekly seasonality
# CV schedule shared by every model: expanding window, 3 months train / 1 month test.
train_config = ForecasterConfig(
    split_freq="months",
    train_size=3,
    test_size=1,
    window="expanding",
    project_name="baseline_benchmark",
    seed=42,
)
log.info("ML config : stride=%d", data_config.stride)
log.info("Baseline config : stride=%d", baseline_data_config.stride)
4. Baseline Models#
All five models are parameter-free - no gradient descent, no hyperparameter search, and typically under a second per CV fold. Their purpose is to set the minimum bar that any trained model must clear before it earns a place in production.
Rule of thumb - always run Seasonal Naive first
On any series with clear daily or weekly periodicity, Seasonal Naive is the hardest cheap baseline to beat. If LightGBM cannot outperform it on MAE, the problem is in your data or features - not the model architecture.
# Overview card for the five parameter-free baselines shipped with Twiga.
_overview_rows = [
    (
        "NAIVEModel",
        "Repeat the last observed value for all horizon steps",
        "Strongly autocorrelated, non-seasonal series",
    ),
    (
        "SEASONALNAIVEModel",
        "Repeat the value observed exactly m steps ago (seasonal lag)",
        "Series with clear daily or weekly periodicity",
    ),
    (
        "WINDOWAVERAGEModel",
        "Broadcast the mean of the last window_size observations",
        "Mean-reverting or noisy signals",
    ),
    (
        "DRIFTModel",
        "Extrapolate the linear trend within the input window",
        "Slowly and smoothly trending series",
    ),
    (
        "CONTEXTPARROTModel",
        "1-nearest-neighbour lookup in delay-embedded context space",
        "Chaotic or nonlinearly recurrent signals (e.g. energy, weather)",
    ),
]
baseline_overview = pd.DataFrame(_overview_rows, columns=["Model", "Strategy", "Best for"])
# Render the baseline overview as a formatted table.
overview_table = (
    GT(baseline_overview)
    .tab_header(
        title=md("**Twiga Baseline Models**"),
        subtitle="Five parameter-free reference models — no training required",
    )
    .cols_label(**{c: md(f"**{c}**") for c in baseline_overview.columns})
    .tab_source_note("twiga.models.baseline")
)
twiga_gt(overview_table, n_rows=len(baseline_overview))
4.1 Naive (window_last)#
The simplest possible forecast: for every test window, repeat the last observed target value across all 48 horizon steps. This is the classical persistence forecast - y-hat_{t+h} = y_t for all h.
# Persistence forecast: repeat the last observed value across all horizon steps.
naive_config = NAIVEConfig(strategy="last")
forecaster_naive = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[naive_config],
    train_params=train_config,
)
forecaster_naive.fit(train_df=train_df)
clear_output()
pred_naive, metric_naive = forecaster_naive.evaluate_point_forecast(test_df=test_df)
clear_output()
m = metric_naive[["mae", "rmse"]].mean().round(3)
log.info("Naive (window_last) complete.")
# Previously a dead, commented-out log referenced a "corr" key that is not
# selected above; report MAE/RMSE exactly like the sibling baseline cells.
log.info(" MAE=%.3f RMSE=%.3f", m["mae"], m["rmse"])
4.2 Seasonal Naive#
Repeat the value observed exactly one season ago. With period="7D" and freq="30min", the seasonal lag is 336 steps - every 30-min slot s is predicted as the observed value at the same slot one week earlier. Seasonal Naive is the canonical benchmark for any series with strong daily or weekly periodicity.
# Seasonal persistence: period="7D" with freq="30min" → a 336-step (one-week) lag.
seasonal_config = SEASONALNAIVEConfig(period="7D", freq="30min")
forecaster_seasonal = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[seasonal_config],
    train_params=train_config,
)
forecaster_seasonal.fit(train_df=train_df)
clear_output()
pred_seasonal, metric_seasonal = forecaster_seasonal.evaluate_point_forecast(test_df=test_df)
clear_output()
m = metric_seasonal[["mae", "rmse"]].mean().round(3)
# The completion message previously claimed "(1D)" while the config uses a
# 7-day period — keep the log consistent with the actual configuration.
log.info("Seasonal Naive (7D) complete.")
log.info(" MAE=%.3f RMSE=%.3f", m["mae"], m["rmse"])
4.3 Window Average#
Broadcast the mean of the last window_size observed steps (here 48 × 8 = 384 steps, i.e. eight days) as the prediction for all horizon steps. Robust to isolated outliers; performs well on mean-reverting signals where any single recent observation is noisy.
# Window-average baseline: broadcast the mean of the last window_size steps.
# window_size = 48 * 8 = 384 steps (8 days of 30-min data).
# NOTE(review): the baseline pipeline's lookback is 48 * 7 = 336 steps, which is
# shorter than this window — confirm WINDOWAVERAGEModel clips to the available
# context, or shrink window_size to <= 336.
wavg_config = WINDOWAVERAGEConfig(window_size=48 * 8)
forecaster_wavg = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[wavg_config],
    train_params=train_config,
)
forecaster_wavg.fit(train_df=train_df)
clear_output()
pred_wavg, metric_wavg = forecaster_wavg.evaluate_point_forecast(test_df=test_df)
clear_output()
m = metric_wavg[["mae", "rmse"]].mean().round(3)
# The completion message previously claimed "48 steps = 24 h", contradicting
# the window_size=384 configured above.
log.info("Window Average (384 steps = 8 days) complete.")
log.info(" MAE=%.3f RMSE=%.3f", m["mae"], m["rmse"])
4.4 Drift#
Fit a slope to all observations in the input window and extrapolate it forward. Sensible for slowly trending series; can diverge badly on mean-reverting or oscillating signals.
# Drift baseline: fit a straight line through the input window and extend it.
drift_config = DRIFTConfig()
forecaster_drift = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[drift_config],
    train_params=train_config,
)
forecaster_drift.fit(train_df=train_df)
clear_output()
pred_drift, metric_drift = forecaster_drift.evaluate_point_forecast(test_df=test_df)
clear_output()
m = metric_drift.loc[:, ["mae", "rmse"]].mean().round(3)
log.info("Drift complete.")
log.info(" MAE=%.3f RMSE=%.3f", m["mae"], m["rmse"])
4.5 Context Parrot#
Context parroting is a 1-nearest-neighbour strategy in delay-embedded space, formalised by Zhang & Gilpin (2024) as a deliberate baseline after observing that the Chronos foundation model implicitly exhibits this behaviour on chaotic systems.
Given a lookback window of length L:
The last D target values form the query motif.
Every length-D sub-window earlier in the context is a candidate (the final D positions are excluded).
The candidate with minimum Euclidean distance to the query is selected.
The H values immediately following the best match are returned as the forecast.
The embedding_dim (D) is the Takens delay-embedding dimension - larger values
capture more context structure at the cost of requiring a longer lookback.
min_seq_len = 2 * embedding_dim + 1 must be satisfied by the pipeline’s lookback_window_size.
Reference: Zhang, Z., & Gilpin, W. (2024). Chaos as an interpretable benchmark for forecasting and data-driven modelling. arXiv:2407.18857.
# embedding_dim=96 → one full day on 30-min data; lookback_window_size=336 >> min_seq_len=97
parrot_config = CONTEXTPARROTConfig(embedding_dim=96)
forecaster_parrot = TwigaForecaster(
data_params=baseline_data_config,
model_params=[parrot_config],
train_params=train_config,
)
forecaster_parrot.fit(train_df=train_df)
clear_output()
pred_parrot, metric_parrot = forecaster_parrot.evaluate_point_forecast(test_df=test_df)
clear_output()
m = metric_parrot[["mae", "rmse"]].mean().round(3)
log.info("Context Parrot (embedding_dim=48) complete.")
log.info(" MAE=%.3f RMSE=%.3f ", m["mae"], m["rmse"])
5. ML Comparison: LightGBM#
We now run LightGBM through the same pipeline and CV schedule. Default hyperparameters only - the goal is a fair apples-to-apples comparison, not a tuned champion model.
We also pass Ghi (global horizontal irradiance) as an exogenous feature, which is known for the full horizon (solar forecasts are available). The baselines do not use this signal - so any improvement from LightGBM is partly due to the additional input.
# LightGBM with default hyperparameters, run through the same CV schedule.
lgb_config = LIGHTGBMConfig()
forecaster_lgb = TwigaForecaster(
    data_params=data_config,
    model_params=[lgb_config],
    train_params=train_config,
)
forecaster_lgb.fit(train_df=train_df)
clear_output()
# Evaluate on the held-out set so LightGBM can actually join the benchmark —
# the original cell fit the model but never produced any test metrics.
pred_lgb, metric_lgb = forecaster_lgb.evaluate_point_forecast(test_df=test_df)
clear_output()
6. Benchmark Results#
We aggregate metrics across all CV folds and display them side-by-side. Lower is better for MAE, RMSE, SMAPE; higher is better for Corr.
# Reference card explaining each benchmark metric and its direction.
_metric_rows = [
    (
        "MAE",
        "Mean Absolute Error",
        "lower = better",
        "Average absolute forecast error in kW — directly interpretable",
    ),
    (
        "RMSE",
        "Root Mean Squared Error",
        "lower = better",
        "Penalises large errors more than MAE — sensitive to demand peaks",
    ),
    (
        "Corr",
        "Pearson Correlation",
        "higher = better",
        "Linear association between forecast and actual (0 = random, 1 = perfect)",
    ),
]
metrics_guide = pd.DataFrame(_metric_rows, columns=["Metric", "Full name", "Direction", "Interpretation"])
# Render the metric reference card as a formatted table.
guide_table = (
    GT(metrics_guide)
    .tab_header(
        title=md("**Evaluation Metrics**"),
        subtitle="What each number means and which direction is better",
    )
    .cols_label(**{c: md(f"**{c}**") for c in metrics_guide.columns})
    .tab_source_note("twiga.core.metrics")
)
twiga_gt(guide_table, n_rows=len(metrics_guide))
# Pool per-fold metrics from every baseline into a single frame.
# The original concat silently dropped metric_drift even though Drift was
# evaluated above — include it so the benchmark table shows all baselines.
metrics_all = pd.concat(
    [metric_naive, metric_seasonal, metric_wavg, metric_drift, metric_parrot],
    ignore_index=True,
)
# Average each metric across CV folds per model, then relabel for display.
res = (
    metrics_all.groupby("Model", sort=False)[["mae", "rmse", "corr", "wmape", "smape", "nbias"]]
    .mean()
    .round(4)
    .reset_index()
)
res = res.rename(
    columns={
        "mae": "MAE",
        "rmse": "RMSE",
        "corr": "Corr",
        "wmape": "WMAPE",
        "smape": "SMAPE",
        "nbias": "NBIAS",
    }
)
# Render the side-by-side benchmark: which columns to show, and their direction.
shown_cols = ["MAE", "Corr", "SMAPE", "RMSE"]
lower_is_better = ["MAE", "SMAPE", "RMSE"]
higher_is_better = ["Corr"]
twiga_report(res, shown_cols, lower_is_better, higher_is_better)
7. Skill Scores#
A skill score measures the percentage improvement of a model over a reference baseline:
| Score | Meaning |
|---|---|
| > 0 % | Model beats the baseline by that margin |
| = 0 % | Tied - added complexity brings no benefit |
| < 0 % | Model is worse than the naive reference - investigate data and features |
We use the best baseline (lowest MAE across the four parameter-free models) as the reference.
# Use the strongest parameter-free baseline (lowest mean MAE) as the reference.
baseline_names = ["NAIVE", "SEASONAL_NAIVE", "WINDOW_AVERAGE", "DRIFT"]
baseline_rows = res[res["Model"].isin(baseline_names)]
best_idx = baseline_rows["MAE"].idxmin()
best_name = baseline_rows.loc[best_idx, "Model"]
best_mae = baseline_rows["MAE"].min()
best_rmse = baseline_rows["RMSE"].min()
log.info("Best baseline : %s", best_name)
log.info(" Reference MAE = %.4f", best_mae)
log.info(" Reference RMSE = %.4f", best_rmse)
# Skill score: percentage improvement over the reference error.
skill = res[["Model", "MAE", "RMSE"]].copy()
skill["SS_MAE (%)"] = (100 * (1 - skill["MAE"] / best_mae)).round(1)
skill["SS_RMSE (%)"] = (100 * (1 - skill["RMSE"] / best_rmse)).round(1)
# Render skill scores versus the best parameter-free baseline.
skill_table = (
    GT(skill)
    .tab_header(
        title=md("**Skill Scores vs Best Baseline**"),
        subtitle=md(f"Reference: **{best_name}** — positive = better than reference"),
    )
    .cols_label(
        Model=md("**Model**"),
        MAE=md("**MAE**"),
        RMSE=md("**RMSE**"),
        **{
            "SS_MAE (%)": md("**SS MAE (%)**"),
            "SS_RMSE (%)": md("**SS RMSE (%)**"),
        },
    )
    .tab_source_note("Skill score = (1 - model_error / reference_error) x 100 %")
)
twiga_gt(skill_table, n_rows=len(skill))
# Horizontal bar chart of MAE skill scores (positive = beats the reference).
skill_bar = skill[["Model", "SS_MAE (%)"]].rename(columns={"SS_MAE (%)": "SS_MAE"})
bar_opts = dict(
    metric_col="SS_MAE",
    model_col="Model",
    lower_is_better=False,
    title="MAE Skill Score vs Best Baseline (%)",
    x_label="Skill Score (%) — positive = better than reference",
    horizontal=True,
    fig_size=(620, 340),
)
p = plot_metrics_bar(skill_bar, **bar_opts)
p
8. Forecast Visualisation#
Plotting the forecast traces reveals where and how each model fails. Seven days of 30-minute predictions show whether a model tracks the daily cycle, overshoots trend changes, or degrades over the horizon.
# Overlay one week of forecasts per model.
# Include WINDOW_AVERAGE and DRIFT too — the original concat omitted them even
# though both models were evaluated above.
preds_all = pd.concat(
    [pred_naive, pred_seasonal, pred_wavg, pred_drift, pred_parrot],
    ignore_index=True,
)
p = plot_forecast_grid(
    preds_all,
    actual_col="Actual",
    forecast_col="forecast",
    model_col="Model",
    n_samples_per_model=7 * 48,  # 7 days of 30-min predictions per panel
    y_label="Net Load (kW)",
    title="Baseline + ML forecast traces — first 7 days of test set",
    fig_width=1200,
)
# Display once — the original cell rendered the figure twice (`p` then `p.show()`).
p.show()
Wrapping up#
What you did
Loaded and explored the MLVS-PT 30-min dataset
Configured a shared data pipeline (lookback=96, horizon=48)
Ran all five parameter-free baselines through TwigaForecaster
Ran LightGBM for ML comparison using the same protocol
Built a side-by-side benchmark table and identified the strongest baseline
Computed MAE and RMSE skill scores relative to the best reference
Plotted forecast traces to understand each model’s failure mode
Key takeaways
Always run baselines before ML - they are fast, interpretable, and diagnostic
Seasonal Naive is the hardest cheap baseline to beat on any periodic series
A skill score near 0 % means added complexity provides no value
A negative skill score is a red flag - investigate data and features before tuning
Forecast traces reveal failure modes that aggregate metrics hide
What’s next?#
# ruff: noqa: E501, E701, E702
from IPython.display import HTML

# Teal colour palette for the learning-path cards below.
_TEAL = "#107591"  # primary brand teal (header gradient start, active card fill)
_TEAL_MID = "#069fac"  # accent teal (gradient end, active number circle, arrows)
_TEAL_LIGHT = "#e8f5f8"  # light tint (inactive number circle, container border)
_TEAL_BEST = "#d0ecf1"  # badge background on inactive cards
_TEXT_DARK = "#2d3748"  # title text on inactive cards
_TEXT_MUTED = "#718096"  # description text on inactive cards
_WHITE = "#ffffff"
# Card metadata for the learning-path widget: one dict per tutorial step.
# `active` marks the card currently being viewed (gets the "you are here" ribbon).
steps = [
    {
        "num": "01",
        "title": "Getting Started",
        "desc": "Load data · configure pipeline · train LightGBM",
        "tags": ["data", "config", "train"],
        "active": False,
    },
    {
        "num": "03",
        "title": "Feature Engineering",
        "desc": "Lag, rolling-window, and calendar features",
        "tags": ["features", "lags", "calendar"],
        "active": False,
    },
    {
        "num": "15",
        "title": "Baseline Benchmarking",
        "desc": "Naive · SeasonalNaive · WindowAverage · Drift — skill scores vs. ML",
        "tags": ["baseline", "naive", "skill score"],
        "active": True,
    },
    {
        "num": "04",
        "title": "ML Point Forecasting",
        "desc": "CatBoost · XGBoost · LightGBM — multi-model comparison",
        "tags": ["catboost", "xgboost", "lightgbm"],
        "active": False,
    },
    {
        "num": "10",
        "title": "Hyperparameter Tuning",
        "desc": "Optuna-based HPO with resumable SQLite studies",
        "tags": ["optuna", "HPO", "tuning"],
        "active": False,
    },
]
track_name = "Beginner Track"
def _badge(t, bg, fg):
return f'<span style="display:inline-block;background:{bg};color:{fg};font-size:10px;font-weight:600;padding:2px 7px;border-radius:10px;letter-spacing:.3px;margin:2px 2px 0 0;">{t}</span>'
# Assemble the card stack: one HTML card per step, joined by dashed SVG arrows.
cards_html = ""
for i, s in enumerate(steps):
    a = s["active"]
    # Per-card colours — the active card is filled teal, the others are white.
    cb = _TEAL if a else _WHITE  # card background
    cbo = _TEAL if a else "#d1ecf1"  # card border
    nb = _TEAL_MID if a else _TEAL_LIGHT  # number-circle background
    nf = _WHITE if a else _TEAL  # number-circle text
    tf = _WHITE if a else _TEXT_DARK  # title text
    df2 = "#cce8ef" if a else _TEXT_MUTED  # description text
    bb = "#0d5f75" if a else _TEAL_BEST  # badge background
    bf = "#b8e4ed" if a else _TEAL  # badge text
    # "you are here" ribbon, rendered only on the active card.
    yh = (
        f'<span style="float:right;background:{_TEAL_MID};color:{_WHITE};font-size:10px;font-weight:700;padding:2px 10px;border-radius:12px;">\u2605 you are here</span>'
        if a
        else ""
    )
    badges = "".join(_badge(t, bb, bf) for t in s["tags"])
    # Card body: number circle + title (with optional ribbon) + description + badges.
    cards_html += f'<div style="background:{cb};border:2px solid {cbo};border-radius:12px;padding:16px 20px;display:flex;align-items:flex-start;gap:16px;box-shadow:{"0 4px 14px rgba(16,117,145,.25)" if a else "0 1px 4px rgba(0,0,0,.06)"};"><div style="min-width:44px;height:44px;background:{nb};color:{nf};border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:15px;font-weight:800;flex-shrink:0;">{s["num"]}</div><div style="flex:1;"><div style="font-size:15px;font-weight:700;color:{tf};margin-bottom:4px;">{s["title"]}{yh}</div><div style="font-size:12.5px;color:{df2};margin-bottom:8px;line-height:1.5;">{s["desc"]}</div><div>{badges}</div></div></div>'
    # Dashed connector arrow between consecutive cards (not after the last one).
    if i < len(steps) - 1:
        cards_html += f'<div style="display:flex;justify-content:center;height:32px;"><svg width="24" height="32" viewBox="0 0 24 32" fill="none"><line x1="12" y1="0" x2="12" y2="24" stroke="{_TEAL_MID}" stroke-width="2" stroke-dasharray="4 3"/><polygon points="6,20 18,20 12,30" fill="{_TEAL_MID}"/></svg></div>'
# Header banner (gradient + logo SVG + track name) wrapping the card container.
html = f'<div style="font-family:Inter,\'Segoe UI\',sans-serif;max-width:640px;margin:8px 0;"><div style="background:linear-gradient(135deg,{_TEAL} 0%,{_TEAL_MID} 100%);border-radius:12px 12px 0 0;padding:14px 20px;display:flex;align-items:center;gap:10px;"><svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="{_WHITE}" stroke-width="2"><path d="M12 2L2 7l10 5 10-5-10-5z"/><path d="M2 17l10 5 10-5"/><path d="M2 12l10 5 10-5"/></svg><span style="color:{_WHITE};font-size:14px;font-weight:700;">Twiga Learning Path — {track_name}</span></div><div style="border:2px solid {_TEAL_LIGHT};border-top:none;border-radius:0 0 12px 12px;padding:20px 20px 16px;background:#f9fdfe;display:flex;flex-direction:column;">{cards_html}</div></div>'
# Render inline in the notebook.
HTML(html)