Baseline Benchmarking#

Beginner · Python · Twiga · Time


What you’ll build

A complete baseline benchmark comparing Naive, Seasonal Naive, Window Average, Drift, and Context Parrot against LightGBM on the MLVS-PT dataset - with skill scores and forecast traces to diagnose whether a learned model is actually earning its complexity.

Prerequisites

  • 01 - Getting Started

  • 03 - Feature Engineering

Learning objectives

By the end of this notebook you will be able to:

  1. Explain why baselines must come before advanced models - if LightGBM cannot beat Seasonal Naive, the problem is upstream of the model

  2. Configure and run all five parameter-free baselines through TwigaForecaster

  3. Read a benchmark table and identify the strongest reference model

  4. Compute and interpret MAE and RMSE skill scores relative to the best baseline

  5. Visualise forecast traces to see where and how each model fails

import warnings

warnings.filterwarnings("ignore")

from great_tables import GT, md
from IPython.display import clear_output
from lets_plot import LetsPlot
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler

LetsPlot.setup_html()

from twiga import TwigaForecaster
from twiga.core.config import DataPipelineConfig, ForecasterConfig
from twiga.core.plot import (
    plot_forecast_grid,
    plot_metrics_bar,
    plot_timeseries,
)
from twiga.core.plot.gt import twiga_gt, twiga_report
from twiga.core.utils import configure, get_logger
from twiga.models.baseline.context_parrot_model import CONTEXTPARROTConfig
from twiga.models.baseline.drift_model import DRIFTConfig
from twiga.models.baseline.naive_model import NAIVEConfig
from twiga.models.baseline.seasonal_naive_model import SEASONALNAIVEConfig
from twiga.models.baseline.window_average_model import WINDOWAVERAGEConfig
from twiga.models.ml import LIGHTGBMConfig

configure()
log = get_logger(name="tutorial-15")

1. Load Data#

We use the MLVS-PT dataset - 30-minute net electrical load readings from a distribution substation in Madeira, Portugal. It exhibits strong daily and weekly seasonality driven by residential consumption patterns, making it a realistic benchmark for energy forecasting.

We keep only the two columns needed for this tutorial: timestamp and NetLoad(kW).

raw = pd.read_parquet("../data/MLVS-PT.parquet")
df = raw[["timestamp", "NetLoad(kW)"]].copy()
df = df.sort_values("timestamp").reset_index(drop=True)
# Restrict to 2019-2020 to keep tutorial execution fast
df = df[(df["timestamp"] >= "2019-01-01") & (df["timestamp"] <= "2020-12-31")].reset_index(drop=True)

log.info("Shape  : %s", df.shape)
log.info("Period : %s  ->  %s", df["timestamp"].min().date(), df["timestamp"].max().date())
log.info(
    "Target : min=%.3f  max=%.3f  mean=%.3f", df["NetLoad(kW)"].min(), df["NetLoad(kW)"].max(), df["NetLoad(kW)"].mean()
)
twiga_gt(
    GT(df.head())
    .tab_header(
        title=md("**MLVS-PT — Raw Data Sample**"),
        subtitle="First 5 rows · 30-minute net electrical load",
    )
    .cols_label(
        timestamp=md("**Timestamp**"),
        **{"NetLoad(kW)": md("**Net Load (kW)**")},
    )
    .tab_source_note("MLVS-PT dataset · Net electrical load · 30-minute resolution"),
    n_rows=5,
)
p = plot_timeseries(
    df,
    y_cols=["NetLoad(kW)"],
    date_col="timestamp",
    title="MLVS-PT — Net Electrical Load (full series)",
    y_label="Net Load (kW)",
    x_label="Date",
    n_samples=3000,
    fig_size=(820, 280),
)
p

2. Train / Test Split#

For time series you must never shuffle rows before splitting. We hold out the last six months as the test set - these rows are never seen during training or CV fold construction.

Key concept - chronological splits

Shuffling a time series before splitting leaks the future into the training window. A model trained on data from 2021 and evaluated on 2020 appears accurate but fails completely in production. Always split by time.

train_df = df[df["timestamp"] < "2020-07-01"].reset_index(drop=True)
test_df = df[df["timestamp"] >= "2020-07-01"].reset_index(drop=True)

log.info(
    "train : %d rows  (%s -> %s)",
    len(train_df),
    train_df["timestamp"].min().date(),
    train_df["timestamp"].max().date(),
)
log.info(
    "test  : %d rows  (%s -> %s)",
    len(test_df),
    test_df["timestamp"].min().date(),
    test_df["timestamp"].max().date(),
)
split_summary = pd.DataFrame(
    {
        "Split": ["Train", "Test"],
        "Start": [
            str(train_df["timestamp"].min().date()),
            str(test_df["timestamp"].min().date()),
        ],
        "End": [
            str(train_df["timestamp"].max().date()),
            str(test_df["timestamp"].max().date()),
        ],
        "Rows": [f"{len(train_df):,}", f"{len(test_df):,}"],
        "Duration": ["~29 months", "~6 months"],
        "Purpose": ["Model learning + CV folds", "Final honest evaluation"],
    }
)

twiga_gt(
    GT(split_summary)
    .tab_header(
        title=md("**Dataset Splits**"),
        subtitle="Chronological — no shuffling, no overlap",
    )
    .cols_label(
        Split=md("**Split**"),
        Start=md("**Start**"),
        End=md("**End**"),
        Rows=md("**Rows**"),
        Duration=md("**Duration**"),
        Purpose=md("**Purpose**"),
    )
    .tab_source_note("MLVS-PT dataset · 30-minute resolution"),
    n_rows=len(split_summary),
)

3. Configure the Data Pipeline#

DataPipelineConfig describes what to forecast and how to build input features. We use a 96-step (48 h) lookback window and a 48-step (24 h) forecast horizon, matching the setup from Tutorial 01.

lags and windows are specified in days - Twiga multiplies by n_samples (48 for 30-min data) to convert to steps. So lags=[1, 7] creates lag features at 1 day (48 steps) and 1 week (336 steps) ago.
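As a quick sanity check, the day-to-step conversion is plain arithmetic (n_samples below is just the per-day sample count, not a Twiga object):

n_samples = 48  # 30-min data → 48 samples per day
lags_days = [1, 7]
lag_steps = [d * n_samples for d in lags_days]
print(lag_steps)  # [48, 336] → 1 day and 1 week ago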

Key constraint for baseline models

Even parameter-free baselines flow through the same DataPipeline as ML models. The pipeline places raw target values in the leading feature columns so that X[:, t, :num_targets] contains the observed target at time step t. Twiga’s default pipeline satisfies this automatically - target lags are prepended first.

Why two pipeline configs?#

The stride parameter controls the step between consecutive sliding windows:

| Config | stride | Use case |
| --- | --- | --- |
| data_config | 1 (default) | ML / NN training - more windows = more training samples |
| baseline_data_config | forecast_horizon | Baseline evaluation - non-overlapping windows give independent, honest metrics |

With stride=1, consecutive windows overlap by lookback_window_size - 1 steps, producing ~N highly correlated predictions that overstate statistical confidence in backtesting metrics. Setting stride=forecast_horizon ensures each prediction window is independent.
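The effect on sample counts is easy to verify with a standalone sketch (pure arithmetic, independent of Twiga internals; the row count is approximate for six months of 30-min data):

def n_windows(n_rows: int, lookback: int, horizon: int, stride: int) -> int:
    """Count sliding windows of total length lookback + horizon over n_rows."""
    usable = n_rows - lookback - horizon + 1
    return max(0, -(-usable // stride))  # ceiling division

n_rows = 8_832  # ≈ six months of 30-min rows
print(n_windows(n_rows, lookback=336, horizon=48, stride=1))  # 8449 overlapping windows
print(n_windows(n_rows, lookback=336, horizon=48, stride=48))  # 177 non-overlapping windows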

# Shared feature-engineering settings
_pipeline_kwargs = {
    "target_feature": "NetLoad(kW)",
    "period": "30min",
    "latitude": 32.371666,
    "longitude": -16.274998,
    "lookback_window_size": 96,  # 48 hours of 30-min context
    "forecast_horizon": 48,  # predict the next 24 hours
    "calendar_features": ["hour", "day_night"],
    "exogenous_features": [],
    "input_scaler": StandardScaler(),
    "target_scaler": RobustScaler(),
}

# ML models — stride=1 (fully overlapping, maximises training samples)
data_config = DataPipelineConfig(**_pipeline_kwargs)

# Baseline models — stride=forecast_horizon (non-overlapping, independent forecasts)
baseline_data_config = DataPipelineConfig(**_pipeline_kwargs, stride=48)
# Extend the lookback to 7 days (48 * 7 = 336 steps) so baselines can reach the weekly lag
baseline_data_config.lookback_window_size = 48 * 7

train_config = ForecasterConfig(
    split_freq="months",
    train_size=3,
    test_size=1,
    window="expanding",
    project_name="baseline_benchmark",
    seed=42,
)

log.info("ML config   : stride=%d", data_config.stride)
log.info("Baseline config : stride=%d", baseline_data_config.stride)

4. Baseline Models#

All five models are parameter-free - no gradient descent, no hyperparameter search, and typically under a second per CV fold. Their purpose is to set the minimum bar that any trained model must clear before it earns a place in production.

Rule of thumb - always run Seasonal Naive first

On any series with clear daily or weekly periodicity, Seasonal Naive is the hardest cheap baseline to beat. If LightGBM cannot outperform it on MAE, the problem is in your data or features - not the model architecture.

baseline_overview = pd.DataFrame(
    {
        "Model": ["NAIVEModel", "SEASONALNAIVEModel", "WINDOWAVERAGEModel", "DRIFTModel", "CONTEXTPARROTModel"],
        "Strategy": [
            "Repeat the last observed value for all horizon steps",
            "Repeat the value observed exactly m steps ago (seasonal lag)",
            "Broadcast the mean of the last window_size observations",
            "Extrapolate the linear trend within the input window",
            "1-nearest-neighbour lookup in delay-embedded context space",
        ],
        "Best for": [
            "Strongly autocorrelated, non-seasonal series",
            "Series with clear daily or weekly periodicity",
            "Mean-reverting or noisy signals",
            "Slowly and smoothly trending series",
            "Chaotic or nonlinearly recurrent signals (e.g. energy, weather)",
        ],
    }
)

twiga_gt(
    GT(baseline_overview)
    .tab_header(
        title=md("**Twiga Baseline Models**"),
        subtitle="Five parameter-free reference models — no training required",
    )
    .cols_label(**{c: md(f"**{c}**") for c in baseline_overview.columns})
    .tab_source_note("twiga.models.baseline"),
    n_rows=len(baseline_overview),
)

4.1 Naive (window_last)#

The simplest possible forecast: for every test window, repeat the last observed target value across all 48 horizon steps. This is the classical persistence forecast: \(\hat{y}_{t+h} = y_t\) for all \(h\).
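The strategy fits in a few lines of NumPy - a sketch of the idea, not Twiga's actual implementation:

def naive_forecast(context: np.ndarray, horizon: int) -> np.ndarray:
    """Repeat the last observed value for every horizon step."""
    return np.full(horizon, context[-1])

naive_forecast(np.array([1.0, 2.0, 3.0]), horizon=4)  # array([3., 3., 3., 3.])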

naive_config = NAIVEConfig(strategy="last")

forecaster_naive = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[naive_config],
    train_params=train_config,
)
forecaster_naive.fit(train_df=train_df)
clear_output()

pred_naive, metric_naive = forecaster_naive.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_naive[["mae", "rmse"]].mean().round(3)
log.info("Naive (window_last) complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

4.2 Seasonal Naive#

Repeat the value observed exactly one season ago. With period="7D" and freq="30min", the seasonal lag is 336 steps - every 30-min slot s is predicted as the observed value at the same slot one week ago, capturing both the daily cycle and the weekday/weekend pattern. This is the canonical benchmark for any series with clear daily or weekly periodicity.
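As a sketch (again, not Twiga's implementation), with m the seasonal lag in steps:

def seasonal_naive_forecast(context: np.ndarray, horizon: int, m: int) -> np.ndarray:
    """Predict each step as the value at the same position one season earlier."""
    idx = np.arange(horizon) % m  # position within the season
    return context[len(context) - m + idx]

# m = 336 → weekly seasonality on 30-min data (7 days x 48 steps)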

seasonal_config = SEASONALNAIVEConfig(period="7D", freq="30min")

forecaster_seasonal = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[seasonal_config],
    train_params=train_config,
)
forecaster_seasonal.fit(train_df=train_df)
clear_output()

pred_seasonal, metric_seasonal = forecaster_seasonal.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_seasonal[["mae", "rmse"]].mean().round(3)
log.info("Seasonal Naive (7D) complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

4.3 Window Average#

Broadcast the mean of the last 48 observed steps (one day) as the prediction for all horizon steps. Robust to isolated outliers; performs well on mean-reverting signals where any single recent observation is noisy.
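The whole strategy is a mean and a broadcast - a sketch, not Twiga's code:

def window_average_forecast(context: np.ndarray, horizon: int, window_size: int) -> np.ndarray:
    """Broadcast the mean of the last window_size observations across the horizon."""
    return np.full(horizon, context[-window_size:].mean())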

wavg_config = WINDOWAVERAGEConfig(window_size=48)  # one day of 30-min steps

forecaster_wavg = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[wavg_config],
    train_params=train_config,
)
forecaster_wavg.fit(train_df=train_df)
clear_output()

pred_wavg, metric_wavg = forecaster_wavg.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_wavg[["mae", "rmse"]].mean().round(3)
log.info("Window Average (48 steps = 24 h) complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

4.4 Drift#

Fit a slope to all observations in the input window and extrapolate it forward. Sensible for slowly trending series; can diverge badly on mean-reverting or oscillating signals.
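The classical drift forecast draws a line through the first and last observations of the window and extends it forward; Twiga's fit may differ in detail, but a sketch looks like this:

def drift_forecast(context: np.ndarray, horizon: int) -> np.ndarray:
    """Extrapolate the line through the first and last context values."""
    slope = (context[-1] - context[0]) / (len(context) - 1)
    return context[-1] + slope * np.arange(1, horizon + 1)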

drift_config = DRIFTConfig()

forecaster_drift = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[drift_config],
    train_params=train_config,
)
forecaster_drift.fit(train_df=train_df)
clear_output()

pred_drift, metric_drift = forecaster_drift.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_drift[["mae", "rmse"]].mean().round(3)
log.info("Drift complete.")
log.info("  MAE=%.3f  RMSE=%.3f", m["mae"], m["rmse"])

4.5 Context Parrot#

Context parroting is a 1-nearest-neighbour strategy in delay-embedded space, formalised by Zhang & Gilpin (2024) as a deliberate baseline after observing that the Chronos foundation model implicitly exhibits this behaviour on chaotic systems.

Given a lookback window of length L:

  1. The last D target values form the query motif.

  2. Every length-D sub-window earlier in the context is a candidate (the final D positions are excluded).

  3. The candidate with minimum Euclidean distance to the query is selected.

  4. The H values immediately following the best match are returned as the forecast.

The embedding_dim (D) is the Takens delay-embedding dimension - larger values capture more context structure at the cost of requiring a longer lookback. min_seq_len = 2 * embedding_dim + 1 must be satisfied by the pipeline’s lookback_window_size.

Reference: Zhang, Z., & Gilpin, W. (2024). Chaos as an interpretable benchmark for forecasting and data-driven modelling. arXiv:2407.18857.
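The four steps fit in a dozen lines of NumPy - an illustrative sketch, not Twiga's implementation (candidates are restricted here so the H-step continuation always fits inside the context):

def context_parrot_forecast(context: np.ndarray, horizon: int, d: int) -> np.ndarray:
    """1-NN motif matching in the lookback window."""
    query = context[-d:]  # step 1: query motif
    n_candidates = len(context) - d - horizon  # step 2: earlier sub-windows only
    dists = [
        np.linalg.norm(context[i : i + d] - query)  # step 3: Euclidean distance
        for i in range(n_candidates)
    ]
    best = int(np.argmin(dists))
    return context[best + d : best + d + horizon]  # step 4: the values that followed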

# embedding_dim=48 → one full day on 30-min data; lookback_window_size=336 >> min_seq_len=97
parrot_config = CONTEXTPARROTConfig(embedding_dim=48)

forecaster_parrot = TwigaForecaster(
    data_params=baseline_data_config,
    model_params=[parrot_config],
    train_params=train_config,
)
forecaster_parrot.fit(train_df=train_df)
clear_output()

pred_parrot, metric_parrot = forecaster_parrot.evaluate_point_forecast(test_df=test_df)
clear_output()

m = metric_parrot[["mae", "rmse"]].mean().round(3)
log.info("Context Parrot (embedding_dim=48) complete.")
log.info("  MAE=%.3f  RMSE=%.3f ", m["mae"], m["rmse"])

5. ML Comparison: LightGBM#

We now run LightGBM through the same pipeline and CV schedule. Default hyperparameters only - the goal is a fair apples-to-apples comparison, not a tuned champion model.

No exogenous inputs are used here - LightGBM sees the target lags plus the hour and day/night calendar features from the shared pipeline, while the baselines read only the raw target history - so any improvement over the baselines isolates what the learned model adds.

lgb_config = LIGHTGBMConfig()

forecaster_lgb = TwigaForecaster(
    data_params=data_config,
    model_params=[lgb_config],
    train_params=train_config,
)
forecaster_lgb.fit(train_df=train_df)
clear_output()

pred_lgb, metric_lgb = forecaster_lgb.evaluate_point_forecast(test_df=test_df)
clear_output()

6. Benchmark Results#

We aggregate metrics across all CV folds and display them side-by-side. Lower is better for MAE, RMSE, SMAPE; higher is better for Corr.
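The standard definitions, on a toy example (comments show approximate values):

y_true = np.array([10.0, 12.0, 11.0])
y_pred = np.array([9.0, 13.0, 11.5])

mae = np.mean(np.abs(y_true - y_pred))  # ≈ 0.833
rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))  # ≈ 0.866
corr = np.corrcoef(y_true, y_pred)[0, 1]  # ≈ 0.990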

metrics_guide = pd.DataFrame(
    {
        "Metric": ["MAE", "RMSE", "Corr"],
        "Full name": ["Mean Absolute Error", "Root Mean Squared Error", "Pearson Correlation"],
        "Direction": ["lower = better", "lower = better", "higher = better"],
        "Interpretation": [
            "Average absolute forecast error in kW — directly interpretable",
            "Penalises large errors more than MAE — sensitive to demand peaks",
            "Linear association between forecast and actual (0 = random, 1 = perfect)",
        ],
    }
)

twiga_gt(
    GT(metrics_guide)
    .tab_header(
        title=md("**Evaluation Metrics**"),
        subtitle="What each number means and which direction is better",
    )
    .cols_label(**{c: md(f"**{c}**") for c in metrics_guide.columns})
    .tab_source_note("twiga.core.metrics"),
    n_rows=len(metrics_guide),
)
metrics_all = pd.concat(
    [metric_naive, metric_seasonal, metric_wavg, metric_drift, metric_parrot, metric_lgb],
    ignore_index=True,
)

res = (
    metrics_all.groupby("Model", sort=False)[["mae", "rmse", "corr", "wmape", "smape", "nbias"]]
    .mean()
    .round(4)
    .reset_index()
)
res = res.rename(
    columns={
        "mae": "MAE",
        "corr": "Corr",
        "wmape": "WMAPE",
        "smape": "SMAPE",
        "nbias": "NBIAS",
        "rmse": "RMSE",
    }
)

twiga_report(
    res,
    ["MAE", "Corr", "SMAPE", "RMSE"],
    ["MAE", "SMAPE", "RMSE"],
    ["Corr"],
)

7. Skill Scores#

A skill score measures the percentage improvement of a model over a reference baseline:

\[\text{SS}_{\text{MAE}} = \left(1 - \frac{\text{MAE}_{\text{model}}}{\text{MAE}_{\text{reference}}}\right) \times 100\%\]

| Score | Meaning |
| --- | --- |
| > 0 % | Model beats the baseline by that margin |
| = 0 % | Tied - added complexity brings no benefit |
| < 0 % | Model is worse than the naive reference - investigate data and features |

We use the best classical baseline (lowest MAE across Naive, Seasonal Naive, Window Average, and Drift) as the reference.
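For example, a model with MAE 8.0 measured against a reference MAE of 10.0:

ss_mae = (1 - 8.0 / 10.0) * 100
print(f"{ss_mae:.0f}%")  # 20% - the model beats the reference by a 20% margin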

baseline_names = ["NAIVE", "SEASONAL_NAIVE", "WINDOW_AVERAGE", "DRIFT"]

baseline_rows = res[res["Model"].isin(baseline_names)]
best_mae = baseline_rows["MAE"].min()
best_rmse = baseline_rows["RMSE"].min()
best_name = baseline_rows.loc[baseline_rows["MAE"].idxmin(), "Model"]

log.info("Best baseline : %s", best_name)
log.info("  Reference MAE  = %.4f", best_mae)
log.info("  Reference RMSE = %.4f", best_rmse)

skill = res[["Model", "MAE", "RMSE"]].copy()
skill["SS_MAE (%)"] = ((1 - skill["MAE"] / best_mae) * 100).round(1)
skill["SS_RMSE (%)"] = ((1 - skill["RMSE"] / best_rmse) * 100).round(1)
twiga_gt(
    GT(skill)
    .tab_header(
        title=md("**Skill Scores vs Best Baseline**"),
        subtitle=md(f"Reference: **{best_name}** — positive = better than reference"),
    )
    .cols_label(
        Model=md("**Model**"),
        MAE=md("**MAE**"),
        RMSE=md("**RMSE**"),
        **{"SS_MAE (%)": md("**SS MAE (%)**")},
        **{"SS_RMSE (%)": md("**SS RMSE (%)**")},
    )
    .tab_source_note("Skill score = (1 - model_error / reference_error) x 100 %"),
    n_rows=len(skill),
)
skill_bar = skill[["Model", "SS_MAE (%)"]].rename(columns={"SS_MAE (%)": "SS_MAE"})

p = plot_metrics_bar(
    skill_bar,
    metric_col="SS_MAE",
    model_col="Model",
    lower_is_better=False,
    title="MAE Skill Score vs Best Baseline (%)",
    x_label="Skill Score (%) — positive = better than reference",
    horizontal=True,
    fig_size=(620, 340),
)
p

8. Forecast Visualisation#

Plotting the forecast traces reveals where and how each model fails. Seven days of 30-minute predictions show whether a model tracks the daily cycle, overshoots trend changes, or degrades over the horizon.

preds_all = pd.concat(
    [pred_naive, pred_seasonal, pred_parrot, pred_lgb],
    ignore_index=True,
)

p = plot_forecast_grid(
    preds_all,
    actual_col="Actual",
    forecast_col="forecast",
    model_col="Model",
    n_samples_per_model=7 * 48,
    y_label="Net Load (kW)",
    title="Baseline + ML forecast traces — first 7 days of test set",
    fig_width=1200,
)
p

Wrapping up#

What you did

  • Loaded and explored the MLVS-PT 30-min dataset

  • Configured shared data pipelines (lookback=96 for ML, 336 for baselines; horizon=48)

  • Ran all five parameter-free baselines through TwigaForecaster

  • Ran LightGBM for ML comparison using the same protocol

  • Built a side-by-side benchmark table and identified the strongest baseline

  • Computed MAE and RMSE skill scores relative to the best reference

  • Plotted forecast traces to understand each model’s failure mode

Key takeaways

  1. Always run baselines before ML - they are fast, interpretable, and diagnostic

  2. Seasonal Naive is the hardest cheap baseline to beat on any periodic series

  3. A skill score near 0 % means added complexity provides no value

  4. A negative skill score is a red flag - investigate data and features before tuning

  5. Forecast traces reveal failure modes that aggregate metrics hide


What’s next?#

# ruff: noqa: E501, E701, E702
from IPython.display import HTML

_TEAL = "#107591"
_TEAL_MID = "#069fac"
_TEAL_LIGHT = "#e8f5f8"
_TEAL_BEST = "#d0ecf1"
_TEXT_DARK = "#2d3748"
_TEXT_MUTED = "#718096"
_WHITE = "#ffffff"

steps = [
    {
        "num": "01",
        "title": "Getting Started",
        "desc": "Load data · configure pipeline · train LightGBM",
        "tags": ["data", "config", "train"],
        "active": False,
    },
    {
        "num": "03",
        "title": "Feature Engineering",
        "desc": "Lag, rolling-window, and calendar features",
        "tags": ["features", "lags", "calendar"],
        "active": False,
    },
    {
        "num": "15",
        "title": "Baseline Benchmarking",
        "desc": "Naive · SeasonalNaive · WindowAverage · Drift — skill scores vs. ML",
        "tags": ["baseline", "naive", "skill score"],
        "active": True,
    },
    {
        "num": "04",
        "title": "ML Point Forecasting",
        "desc": "CatBoost · XGBoost · LightGBM — multi-model comparison",
        "tags": ["catboost", "xgboost", "lightgbm"],
        "active": False,
    },
    {
        "num": "10",
        "title": "Hyperparameter Tuning",
        "desc": "Optuna-based HPO with resumable SQLite studies",
        "tags": ["optuna", "HPO", "tuning"],
        "active": False,
    },
]
track_name = "Beginner Track"


def _badge(t, bg, fg):
    return f'<span style="display:inline-block;background:{bg};color:{fg};font-size:10px;font-weight:600;padding:2px 7px;border-radius:10px;letter-spacing:.3px;margin:2px 2px 0 0;">{t}</span>'


cards_html = ""
for i, s in enumerate(steps):
    a = s["active"]
    cb = _TEAL if a else _WHITE
    cbo = _TEAL if a else "#d1ecf1"
    nb = _TEAL_MID if a else _TEAL_LIGHT
    nf = _WHITE if a else _TEAL
    tf = _WHITE if a else _TEXT_DARK
    df2 = "#cce8ef" if a else _TEXT_MUTED
    bb = "#0d5f75" if a else _TEAL_BEST
    bf = "#b8e4ed" if a else _TEAL
    yh = (
        f'<span style="float:right;background:{_TEAL_MID};color:{_WHITE};font-size:10px;font-weight:700;padding:2px 10px;border-radius:12px;">\u2605 you are here</span>'
        if a
        else ""
    )
    badges = "".join(_badge(t, bb, bf) for t in s["tags"])
    cards_html += f'<div style="background:{cb};border:2px solid {cbo};border-radius:12px;padding:16px 20px;display:flex;align-items:flex-start;gap:16px;box-shadow:{"0 4px 14px rgba(16,117,145,.25)" if a else "0 1px 4px rgba(0,0,0,.06)"};"><div style="min-width:44px;height:44px;background:{nb};color:{nf};border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:15px;font-weight:800;flex-shrink:0;">{s["num"]}</div><div style="flex:1;"><div style="font-size:15px;font-weight:700;color:{tf};margin-bottom:4px;">{s["title"]}{yh}</div><div style="font-size:12.5px;color:{df2};margin-bottom:8px;line-height:1.5;">{s["desc"]}</div><div>{badges}</div></div></div>'
    if i < len(steps) - 1:
        cards_html += f'<div style="display:flex;justify-content:center;height:32px;"><svg width="24" height="32" viewBox="0 0 24 32" fill="none"><line x1="12" y1="0" x2="12" y2="24" stroke="{_TEAL_MID}" stroke-width="2" stroke-dasharray="4 3"/><polygon points="6,20 18,20 12,30" fill="{_TEAL_MID}"/></svg></div>'
html = f'<div style="font-family:Inter,\'Segoe UI\',sans-serif;max-width:640px;margin:8px 0;"><div style="background:linear-gradient(135deg,{_TEAL} 0%,{_TEAL_MID} 100%);border-radius:12px 12px 0 0;padding:14px 20px;display:flex;align-items:center;gap:10px;"><svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="{_WHITE}" stroke-width="2"><path d="M12 2L2 7l10 5 10-5-10-5z"/><path d="M2 17l10 5 10-5"/><path d="M2 12l10 5 10-5"/></svg><span style="color:{_WHITE};font-size:14px;font-weight:700;">Twiga Learning Path — {track_name}</span></div><div style="border:2px solid {_TEAL_LIGHT};border-top:none;border-radius:0 0 12px 12px;padding:20px 20px 16px;background:#f9fdfe;display:flex;flex-direction:column;">{cards_html}</div></div>'
HTML(html)