Hyperparameter Tuning with Optuna#
What you’ll build
An Optuna-backed hyperparameter search for LightGBM using a typed SearchSpace, persisted to a resumable SQLite study, with a default-vs-tuned comparison table showing the metric improvement.
Prerequisites
05 - ML Point Forecasting (LightGBM, metric interpretation)
06 - Backtesting & Evaluation (cross-validation strategy)
Python: basic familiarity with hyperparameters
Learning objectives
By the end of this notebook you will be able to:
- Explain Bayesian optimisation (TPE) and why it outperforms random/grid search
- Define a typed search space using SearchSpace fields (int, float, categorical)
- Run forecaster.tune() with a configurable number of Optuna trials
- Resume an interrupted study from an SQLite database without losing prior trials
- Compare default vs. tuned model performance and decide when further tuning has diminishing returns
1. Setup#
import warnings
from great_tables import GT
from IPython.display import clear_output
from lets_plot import LetsPlot
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
LetsPlot.setup_html()
from twiga import TwigaForecaster
from twiga.core.config import DataPipelineConfig, ForecasterConfig
from twiga.core.plot import plot_metrics_bar
from twiga.core.utils import configure, get_logger
warnings.filterwarnings("ignore")
configure()
log = get_logger("tutorials")
Load data#
The dataset covers Madeira, Portugal (32.37 °N, 16.27 °W) at 30-minute resolution. We keep only the columns we need: timestamp, net load (target), and two exogenous drivers.
data = pd.read_parquet("../data/MLVS-PT.parquet")
data = data[["timestamp", "NetLoad(kW)", "Ghi", "Temperature"]]
data["timestamp"] = pd.to_datetime(data["timestamp"])
data = data.drop_duplicates(subset="timestamp").reset_index(drop=True)
# Restrict to 2019-2020 to keep tutorial execution fast
data = data[(data["timestamp"] >= "2019-01-01") & (data["timestamp"] < "2021-01-01")].reset_index(drop=True)  # half-open upper bound keeps all intraday rows on 31 Dec 2020
log.info("Shape: %s", data.shape)
GT(data.head())
Train / val / test splits#
| Split | Period |
|---|---|
| train | before 2020-01-01 |
| val | 2020-01-01 - 2020-06-30 |
| test | 2020-07-01 onwards |
train_df = data[data["timestamp"] < "2020-01-01"].reset_index(drop=True)
val_df = data[(data["timestamp"] >= "2020-01-01") & (data["timestamp"] < "2020-07-01")].reset_index(drop=True)
test_df = data[data["timestamp"] >= "2020-07-01"].reset_index(drop=True)
log.info(
f"train : {train_df.shape[0]:,} rows "
f"({train_df['timestamp'].min().date()} -> {train_df['timestamp'].max().date()})"
)
log.info(
f"val : {val_df.shape[0]:,} rows ({val_df['timestamp'].min().date()} -> {val_df['timestamp'].max().date()})"
)
log.info(
f"test : {test_df.shape[0]:,} rows ({test_df['timestamp'].min().date()} -> {test_df['timestamp'].max().date()})"
)
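Later cells pass a data_config built from DataPipelineConfig to the forecaster. A minimal sketch is below, assuming the field names datetime_column, target_column, exog_columns, and scaler - these are illustrative guesses from the columns above and the Setup imports, not a confirmed DataPipelineConfig signature; notebook 05 shows the canonical construction.

# Hypothetical reconstruction - field names are assumptions, see notebook 05
data_config = DataPipelineConfig(
    datetime_column="timestamp",          # assumed name for the time index field
    target_column="NetLoad(kW)",          # assumed name for the target field
    exog_columns=["Ghi", "Temperature"],  # assumed name for the exogenous drivers
    scaler=StandardScaler(),              # assumed; the Setup cell imports scalers
)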
2. What gets tuned: the search space#
Every Twiga model config carries a search_space field of type BaseSearchSpace.
This object declares which parameters Optuna is allowed to sample, and within what
ranges or from what categorical lists.
Fixed parameters are set directly on the config object and are never changed by
the optimizer (e.g. boosting_type, objective, metric, verbose).
Tunable parameters are declared inside search_space. When tune() is called
the get_optuna_params(trial) method merges both: fixed values form the baseline,
and any key present in the search space overrides that baseline with a value sampled
by the Optuna trial.
Key concept - Bayesian optimisation
Random search and grid search evaluate hyperparameter configurations independently - each trial ignores the outcomes of previous ones. This is wasteful when training is expensive.
Optuna’s TPE (Tree-structured Parzen Estimator) sampler builds a probabilistic surrogate model of the objective function from all previously evaluated trials. It then proposes the next configuration in a region where the surrogate predicts a good outcome - effectively learning where in the search space to look next.
A trial is one complete evaluation of the objective: sample a config → train the model on CV folds → report the mean validation RMSE → store the result. After ~20 trials the surrogate is warm enough to focus exploration on promising regions, outperforming random search for the same compute budget.
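To see the trial loop in isolation, here is a minimal standalone Optuna example on a toy quadratic objective - plain Optuna, no Twiga involved - showing exactly what one trial is: sample, evaluate, report.

import optuna

def objective(trial: optuna.Trial) -> float:
    # One trial = sample a value, evaluate the objective, report the result.
    x = trial.suggest_float("x", -10.0, 10.0)
    return (x - 2.0) ** 2

# TPE is Optuna's default sampler; seeding it makes the run reproducible.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=20)
print(study.best_params, study.best_value)  # best x converges towards 2.0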
import optuna
from twiga.models.ml import LIGHTGBMConfig
optuna.logging.set_verbosity(optuna.logging.WARNING)
lg_config = LIGHTGBMConfig()
log.info("=== Fixed parameters (passed directly to LightGBM) ===")
log.info("%s", lg_config.model_dump())
log.info("=== Tunable search space (sampled by Optuna each trial) ===")
log.info("%s", lg_config.search_space)
Key tunable fields for LightGBM, summarised in the table below (mirroring the search space configured above).
from great_tables import GT, md
import pandas as pd
from twiga.core.plot.gt import twiga_gt
search_space_df = pd.DataFrame(
{
"Parameter": [
"n_estimators",
"learning_rate",
"num_leaves",
"subsample",
"colsample_bytree",
"min_data_in_leaf",
"reg_alpha",
"reg_lambda",
],
"Search range": [
"(100, 2000)",
"(0.001, 0.3)",
"(16, 256)",
"(0.5, 1.0)",
"(0.3, 1.0)",
"(5, 100)",
"(0.0, 10.0)",
"(0.0, 10.0)",
],
"Type": ["int", "float (log)", "int", "float", "float", "int", "float", "float"],
"Effect": [
"Number of boosting rounds — more rounds reduce bias at the cost of training time",
"Step-size shrinkage — lower values need more estimators but generalise better",
"Maximum leaf nodes per tree — primary complexity knob in LightGBM",
"Fraction of rows used per tree — stochastic regularisation",
"Fraction of features per tree — similar to random forests' feature subsampling",
"Minimum samples per leaf — prevents overfitting on tiny leaf nodes",
"L1 regularisation — promotes sparsity in leaf weights",
"L2 regularisation — shrinks leaf weights toward zero",
],
}
)
twiga_gt(
GT(search_space_df)
.tab_header(
title=md("**LightGBM Tunable Search Space**"),
subtitle="Parameters sampled by Optuna's TPE sampler each trial",
)
.cols_label(**{c: md(f"**{c}**") for c in search_space_df.columns})
.tab_source_note("Fixed params (boosting_type, objective, metric, verbose) are never modified by the optimizer"),
n_rows=len(search_space_df),
)
Key concept - search space definition
Twiga’s BaseSearchSpace maps parameter names to their sampling specifications:
- Categorical - a Python list, e.g. linear_tree=[True, False]. Optuna uses trial.suggest_categorical() and can select any element.
- Integer range - a (low, high) tuple of integers, e.g. num_leaves=(16, 256). Sampled with trial.suggest_int(low, high).
- Float range - a (low, high) tuple of floats, e.g. learning_rate=(0.001, 0.3). Log-scale sampling is applied automatically when low > 0 and the ratio high / low > 5, which is appropriate for learning rates and regularisation strengths.
- Fixed parameters - anything set directly on the config object (boosting_type, objective, metric, verbose) bypasses the search space entirely and is passed unchanged to the model. This lets you freeze architectural choices while tuning only numerical knobs.
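These dispatch rules can be sketched in a few lines of plain Optuna. The helper below is an illustrative re-implementation, not Twiga's actual get_optuna_params internals; sample_param and build_params are hypothetical names.

import optuna

def sample_param(trial: optuna.Trial, name: str, spec):
    # Categorical: a plain Python list of choices.
    if isinstance(spec, list):
        return trial.suggest_categorical(name, spec)
    low, high = spec
    # Integer range: a (low, high) tuple of ints.
    if isinstance(low, int) and isinstance(high, int):
        return trial.suggest_int(name, low, high)
    # Float range: log-scale when low > 0 and high / low > 5.
    return trial.suggest_float(name, low, high, log=low > 0 and high / low > 5)

def build_params(trial: optuna.Trial, fixed: dict, search_space: dict) -> dict:
    # Fixed values form the baseline; any key in the search space overrides it.
    return {**fixed, **{k: sample_param(trial, k, v) for k, v in search_space.items()}}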
3. ForecasterConfig with project_name#
The project_name field in ForecasterConfig does two things:
- Names the Optuna study - Twiga creates (or reopens) a SQLite file at studies/<project_name>.db. Every trial's parameters and objective value are persisted there automatically.
- Names the checkpoint directory - for neural-network models, Lightning checkpoints are saved under a path derived from project_name, so the best weights survive process restarts.
Because the study is stored in SQLite, you can interrupt a tuning run at any time
and resume it later simply by using the same project_name. Optuna will pick up
exactly where it left off, reusing the already-evaluated trials to warm-start the
surrogate model.
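In stock Optuna, the same create-or-reopen behaviour looks like this (a minimal sketch, assuming the study name matches project_name; Twiga does this wiring for you when you set the field):

import optuna

# Create the study on first use, reopen it on every later run - prior trials are kept.
study = optuna.create_study(
    study_name="tuning-tutorial",
    storage="sqlite:///studies/tuning-tutorial.db",
    direction="minimize",
    load_if_exists=True,
)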
train_config = ForecasterConfig(
project_name="tuning-tutorial", # -> studies/tuning-tutorial.db
split_freq="months",
train_size=3,
test_size=1,
num_splits=2,
)
train_config
Key concept - overfitting HPO
A common mistake is to run HPO and evaluate on the same test set you use to report final metrics. This leaks information: you are effectively choosing the hyperparameters that happen to work best on that particular test period, not hyperparameters that generalise.
The correct protocol has three non-overlapping splits:
| Split | Used for |
|---|---|
| Train | Model fitting during each trial |
| Validation | Optuna objective - the RMSE reported per trial |
| Test | Final honest evaluation, never seen during HPO |

In Twiga, forecaster.tune(train_df, val_df, ...) uses the validation set as the HPO objective, leaving test_df untouched until the very end. This is the same reason ForecasterConfig distinguishes train_size from test_size - the test folds are reserved and never influence the surrogate model.
4. Running tune()#
forecaster.tune(train_df, val_df, num_trials) runs Bayesian optimisation over the
declared search space. Under the hood each trial:
1. Samples a parameter set using the TPE (Tree-structured Parzen Estimator) sampler.
2. Trains the model on the cross-validation folds defined by ForecasterConfig.
3. Reports the mean validation RMSE to the Optuna study.
4. Persists the result to SQLite.
We use num_trials=5 here to keep the demo fast. In practice 50 - 200 trials give
meaningful gains for tree models.
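Conceptually, each trial's objective resembles the standalone sketch below - a simplified stand-in for what tune() does internally, using sklearn's TimeSeriesSplit in place of Twiga's month-based folds:

import numpy as np
import optuna
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

def objective(trial: optuna.Trial, X: np.ndarray, y: np.ndarray) -> float:
    # Sample a candidate configuration from the search space.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "verbose": -1,  # fixed parameter - never sampled
    }
    # Train on each CV fold and collect the validation RMSE.
    rmses = []
    for train_idx, val_idx in TimeSeriesSplit(n_splits=2).split(X):
        model = LGBMRegressor(**params).fit(X[train_idx], y[train_idx])
        pred = model.predict(X[val_idx])
        rmses.append(mean_squared_error(y[val_idx], pred) ** 0.5)
    # The mean validation RMSE is what the TPE surrogate learns from.
    return float(np.mean(rmses))

# Usage: study.optimize(lambda t: objective(t, X, y), n_trials=50)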
from twiga.models.ml import LIGHTGBMConfig
lg_config = LIGHTGBMConfig()
forecaster = TwigaForecaster(
data_params=data_config,
model_params=[lg_config],
train_params=train_config,
)
# Run HPO — 5 trials for demonstration (increase for real experiments)
forecaster.tune(train_df=train_df, val_df=val_df, num_trials=5)
clear_output()
log.info("Tuning complete.")
5. Fitting with tuned parameters#
After tune() finishes, the forecaster internally loads the best trial’s parameters.
Calling fit() then trains the final model with those parameters on the full
train + val data before evaluating on the test set.
forecaster.fit(train_df=train_df, val_df=val_df)
clear_output()
pred_tuned, metric_tuned = forecaster.evaluate_point_forecast(test_df=test_df)
clear_output()
log.info("After tuning:")
log.info("\n%s", metric_tuned[["mae", "rmse", "corr"]].mean().round(3).to_string())
Compare with default (no tuning)#
We train an identical model using factory-default hyperparameters to quantify the benefit of HPO on this dataset.
lg_default = LIGHTGBMConfig()
forecaster_default = TwigaForecaster(
data_params=data_config,
model_params=[lg_default],
train_params=ForecasterConfig(split_freq="months", train_size=3, test_size=1),
)
forecaster_default.fit(train_df=train_df, val_df=val_df)
clear_output()
pred_default, metric_default = forecaster_default.evaluate_point_forecast(test_df=test_df)
clear_output()
log.info("Default (no tuning):")
log.info("\n%s", metric_default[["mae", "rmse", "corr"]].mean().round(3).to_string())
Side-by-side comparison: Default vs. Tuned#
summary = {
"Default": metric_default[["mae", "rmse", "corr"]].mean().round(3),
"Tuned": metric_tuned[["mae", "rmse", "corr"]].mean().round(3),
}
summary_df = (
pd.DataFrame(summary)
.T.reset_index()
.rename(columns={"index": "Configuration", "mae": "MAE", "rmse": "RMSE", "corr": "Corr"})
)
log.info("\n%s", summary_df.to_string(index=False))
from twiga.core.plot.gt import twiga_report
twiga_report(
summary_df.rename(columns={"Configuration": "Model"}),
["MAE", "RMSE", "Corr"],
minimize_cols=["MAE", "RMSE"],
maximize_cols=["Corr"],
)
6. Tuning a neural-network model#
The same tune() API works for NN models. The key differences are:
- NN configs are initialised with from_data_config(data_config) so that sequence dimensions are set automatically from the data pipeline.
- We cap max_epochs=2 to keep the demo fast. In production use 50 - 200.
- rich_progress_bar=False suppresses Lightning's per-step progress bars inside the notebook.
The search space for MLPGAMConfig covers architecture choices (hidden_dim,
num_layers, embedding_size, dropout), optimizer settings (learning_rate,
batch_size), and the L1 sparsity coefficient (lambda_lasso).
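In plain Optuna terms these are just additional suggestions. The ranges below are illustrative only, not MLPGAMConfig's actual search space:

def suggest_nn_params(trial):
    # Illustrative ranges - the real MLPGAMConfig search space may differ.
    return {
        "hidden_dim": trial.suggest_int("hidden_dim", 32, 512),
        "num_layers": trial.suggest_int("num_layers", 1, 4),
        "dropout": trial.suggest_float("dropout", 0.0, 0.5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
    }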
from twiga.models.nn import MLPGAMConfig
nn_config = MLPGAMConfig.from_data_config(data_config)
nn_config.max_epochs = 2
nn_config.rich_progress_bar = False
nn_train_config = ForecasterConfig(
split_freq="months",
train_size=3,
test_size=1,
num_splits=2,
project_name="tuning-nn-tutorial", # separate study from the ML one
)
forecaster_nn = TwigaForecaster(
data_params=data_config,
model_params=[nn_config],
train_params=nn_train_config,
)
# 2 trials — just enough to verify the pipeline works end-to-end
forecaster_nn.tune(train_df=train_df, val_df=val_df, num_trials=2)
clear_output()
log.info("NN tuning complete.")
7. Study persistence: resuming a study#
Optuna stores every trial in a SQLite database located at
studies/<project_name>.db. This means:
- You can interrupt a run (Ctrl-C, kernel restart, VM shutdown) at any point without losing progress.
- To resume, simply construct ForecasterConfig with the same project_name and call tune() again. Optuna will reopen the existing study, load all previously evaluated trials, and continue sampling from where it left off.
- The TPE surrogate is rebuilt from the stored trial history, so later trials benefit from all the information gathered in earlier runs.
This makes it practical to spread a large search (e.g., 200 trials) across multiple notebook sessions or overnight batch jobs without any manual bookkeeping.
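You can also inspect a persisted study programmatically (a sketch assuming the study name matches project_name; if unsure, list stored studies with optuna.get_all_study_summaries):

import optuna

storage = "sqlite:///studies/tuning-tutorial.db"
study = optuna.load_study(study_name="tuning-tutorial", storage=storage)
log.info("Trials so far: %d", len(study.trials))
log.info("Best value  : %.4f", study.best_value)
log.info("Best params : %s", study.best_params)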
8. Optuna Dashboard (optional)#
Optuna ships with an interactive web dashboard that lets you inspect trial history, parameter importances, and the objective landscape without writing any extra code.
Launch it from a terminal in the project root:
optuna-dashboard sqlite:///studies/tuning-tutorial.db
Or for the NN study:
optuna-dashboard sqlite:///studies/tuning-nn-tutorial.db
The dashboard will open at http://localhost:8080 and shows:
Trial history: objective value per trial, with best-so-far overlay
Parameter importances: which hyperparameters influenced the objective most
Parallel coordinate plot: relationships between parameter values and performance
Pareto front: for multi-objective studies (not used here)
The dashboard is provided by the optional optuna-dashboard package
(pip install optuna-dashboard); no further setup is needed.
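If you prefer staying inside the notebook, Optuna's plotly-based visualisation module renders the same core plots from a loaded study (e.g. the study object loaded in section 7):

from optuna.visualization import plot_optimization_history, plot_param_importances

plot_optimization_history(study).show()  # objective per trial, best-so-far overlay
plot_param_importances(study).show()     # which hyperparameters drove the objective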
9. Interpreting results: when to tune and when not to#
When tuning is worth the cost#
Large datasets with long training folds: each trial is expensive but the data provides enough signal to reliably distinguish good from bad configurations.
Production models: a one-time tuning investment amortises quickly across many inference calls.
Neural network architectures: the landscape is highly non-convex and defaults rarely perform well out of the box.
When to skip tuning#
Small datasets (fewer than ~100 samples per fold): cross-validation estimates of the objective are noisy; the optimizer will over-fit to noise in the fold splits rather than discover genuinely better hyperparameters.
Time-constrained deployments: if you need a model in production within minutes, default tree-model configs are already competitive and tuning adds latency without a guaranteed payoff.
Heavily regularised models: if the model is already underfitting, tuning hyperparameters without first improving feature engineering will have limited impact.
Trade-off rule of thumb#
Start with num_trials=20 for a quick sanity check. If the best trial improves
on the default by more than 3 - 5 % on your key metric, extend to 100+ trials.
If the improvement is marginal, invest the compute in richer features or a larger
ensemble instead.
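A quick way to apply this rule using the summary_df built in section 5:

default_rmse = summary_df.loc[summary_df["Configuration"] == "Default", "RMSE"].iloc[0]
tuned_rmse = summary_df.loc[summary_df["Configuration"] == "Tuned", "RMSE"].iloc[0]
improvement = 100 * (default_rmse - tuned_rmse) / default_rmse
log.info("RMSE improvement from tuning: %.1f%%", improvement)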
Wrapping up#
What you did
- Inspected the LightGBM typed search space - integer, float, and categorical ranges
- Added a project_name to ForecasterConfig to bind tuning to a SQLite study
- Ran forecaster.tune() with Bayesian (TPE) optimisation across CV folds
- Fit the final model with the best trial's parameters and evaluated on the held-out test set
- Compared tuned vs. default performance with a styled metric table
- Tuned an NN model using the same API (MLPGAMConfig)
- Understood how to resume a study by reusing project_name
Key takeaways
- Bayesian (TPE) optimisation learns where to sample next from prior trials - outperforming random search for the same compute budget after ~20 trials.
- Always use a separate validation set as the HPO objective; never optimise directly against the test set.
- project_name maps to a SQLite file at studies/<name>.db - tuning is interruptible and resumable at no cost.
- The same forecaster.tune() API works for ML and NN models; NN configs add architecture dimensions like hidden_dim and dropout to the search space.
- If tuned improvement is below 3 % on your key metric, invest compute in richer features or a larger ensemble instead.
What’s next?#
Notebook 12 shows how to combine several models - LightGBM, XGBoost, CatBoost - into mean, median, and weighted ensembles, and explains when ensembles outperform any single well-tuned model.
# ruff: noqa: E501, E701, E702
from IPython.display import HTML
_TEAL = "#107591"
_TEAL_MID = "#069fac"
_TEAL_LIGHT = "#e8f5f8"
_TEAL_BEST = "#d0ecf1"
_TEXT_DARK = "#2d3748"
_TEXT_MUTED = "#718096"
_WHITE = "#ffffff"
steps = [
{
"num": "06",
"title": "Backtesting & Evaluation",
"desc": "Rolling-window backtesting · fold-level metrics",
"tags": ["backtesting", "evaluation"],
"active": False,
},
{
"num": "10",
"title": "Conformal Prediction",
"desc": "CQR · CRC — coverage-guaranteed intervals",
"tags": ["conformal", "CQR", "coverage"],
"active": False,
},
{
"num": "11",
"title": "Hyperparameter Tuning",
"desc": "Optuna TPE · typed search spaces · resumable SQLite study",
"tags": ["optuna", "HPO", "Bayesian", "tuning"],
"active": True,
},
{
"num": "12",
"title": "Ensemble Strategies",
"desc": "Mean · median · weighted-mean ensembles",
"tags": ["ensemble", "weighted"],
"active": False,
},
{
"num": "13",
"title": "Custom Models",
"desc": "Register your own model class in the Twiga registry",
"tags": ["custom", "registry", "sklearn"],
"active": False,
},
]
track_name = "Advanced Track"
footer = 'Next: combine tuned models with <span style="color:#107591;font-weight:600;">Ensemble Strategies</span> (12) or extend the library with <span style="color:#107591;font-weight:600;">Custom Models</span> (13).'
def _b(t, bg, fg):
return f'<span style="display:inline-block;background:{bg};color:{fg};font-size:10px;font-weight:600;padding:2px 7px;border-radius:10px;margin:2px 2px 0 0;">{t}</span>'
ch = ""
for i, s in enumerate(steps):
a = s["active"]
cb = _TEAL if a else _WHITE
cbo = _TEAL if a else "#d1ecf1"
nb = _TEAL_MID if a else _TEAL_LIGHT
nf = _WHITE if a else _TEAL
tf = _WHITE if a else _TEXT_DARK
df = "#cce8ef" if a else _TEXT_MUTED
bb = "#0d5f75" if a else _TEAL_BEST
bf = "#b8e4ed" if a else _TEAL
yh = (
f'<span style="float:right;background:{_TEAL_MID};color:{_WHITE};font-size:10px;font-weight:700;padding:2px 10px;border-radius:12px;">★ you are here</span>'
if a
else ""
)
bdg = "".join(_b(t, bb, bf) for t in s["tags"])
ch += f'<div style="background:{cb};border:2px solid {cbo};border-radius:12px;padding:16px 20px;display:flex;align-items:flex-start;gap:16px;box-shadow:{"0 4px 14px rgba(16,117,145,.25)" if a else "0 1px 4px rgba(0,0,0,.06)"};"><div style="min-width:44px;height:44px;background:{nb};color:{nf};border-radius:50%;display:flex;align-items:center;justify-content:center;font-size:15px;font-weight:800;flex-shrink:0;">{s["num"]}</div><div style="flex:1;"><div style="font-size:15px;font-weight:700;color:{tf};margin-bottom:4px;">{s["title"]}{yh}</div><div style="font-size:12.5px;color:{df};margin-bottom:8px;line-height:1.5;">{s["desc"]}</div><div>{bdg}</div></div></div>'
if i < len(steps) - 1:
ch += f'<div style="display:flex;justify-content:center;height:32px;"><svg width="24" height="32" viewBox="0 0 24 32" fill="none"><line x1="12" y1="0" x2="12" y2="24" stroke="{_TEAL_MID}" stroke-width="2" stroke-dasharray="4 3"/><polygon points="6,20 18,20 12,30" fill="{_TEAL_MID}"/></svg></div>'
HTML(
f'<div style="font-family:Inter,\'Segoe UI\',sans-serif;max-width:640px;margin:8px 0;"><div style="background:linear-gradient(135deg,{_TEAL} 0%,{_TEAL_MID} 100%);border-radius:12px 12px 0 0;padding:14px 20px;display:flex;align-items:center;gap:10px;"><svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="{_WHITE}" stroke-width="2"><path d="M12 2L2 7l10 5 10-5-10-5z"/><path d="M2 17l10 5 10-5"/><path d="M2 12l10 5 10-5"/></svg><span style="color:{_WHITE};font-size:14px;font-weight:700;">Twiga Learning Path — {track_name}</span></div><div style="border:2px solid {_TEAL_LIGHT};border-top:none;border-radius:0 0 12px 12px;padding:20px 20px 16px;background:#f9fdfe;display:flex;flex-direction:column;">{ch}<div style="margin-top:16px;font-size:11.5px;color:{_TEXT_MUTED};text-align:center;border-top:1px solid {_TEAL_LIGHT};padding-top:12px;">{footer}</div></div></div>'
)