basic vibe code structure

2026-04-09 13:23:01 +02:00
commit e0d4788a11
4 changed files with 796 additions and 0 deletions
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,773 @@
+"""
+Urban Climate U-Net
+===================
+Predicts nocturnal T_air (2 m AGL) and daytime PET at 5 m resolution
+from morphological raster inputs, with aleatoric uncertainty estimation.
+
+Architecture overview
+---------------------
+  Shared U-Net encoder–decoder  (~5–8 M parameters)
+  Multi-scale input:
+    fine   — 256×256 patch at 5 m  (7 channels)
+    coarse — 64×64  patch at 25 m  (7 channels, broader context)
+  Encoder:   4 ConvBlock levels, Dropout2d on levels 2–4
+  Bottleneck: fuses fine enc-4 + coarse context, Dropout2d 0.2
+  Decoder:   4 UpBlock levels with skip connections, Dropout2d on level 3
+  Heads:
+    t_air_head     Conv 32→1   predicted mean T_air
+    pet_head       Conv 32→1   predicted mean PET
+    log_sigma_head Conv 32→2   log-σ for T_air and PET (aleatoric)
+
+Uncertainty
+-----------
+  Aleatoric (irreducible noise):
+    The model outputs σ per pixel alongside the mean prediction.
+    No label is provided — the Gaussian NLL loss teaches the model
+    to self-calibrate σ:
+
+        Loss = (y − μ)² / (2σ²)  +  ½ log(σ²)
+
+    Overclaiming certainty on a wrong prediction is penalised by the
+    first term; blanket hedging is penalised by the second. σ converges
+    to reflect genuine local predictability from morphology.
+
+  Epistemic (model ignorance, added in v1.1):
+    MC Dropout: keep Dropout2d active at inference, run N stochastic
+    forward passes, take std(predictions) as epistemic uncertainty.
+    σ_total = √(σ_aleatoric² + σ_epistemic²)
+
+Input channels (fine and coarse, identical schema)
+---------------------------------------------------
+  0  Building height      m
+  1  Impervious fraction  0–1
+  2  Vegetation cover     0–1
+  3  Albedo               0–1
+  4  DEM elevation        m (z-scored per city)
+  5  Slope                degrees (z-scored)
+  6  Distance to water/large green  m (z-scored)
+
+Labels (from FITNAH-3D or equivalent hi-fi simulation)
+-------------------------------------------------------
+  0  T_air  nocturnal air temperature at 2 m  [°C delta from city mean]
+  1  PET    daytime PET at 1.1 m              [°C delta from city mean]
+
+  Normalise labels city-by-city (subtract city mean, divide by city std)
+  before training to remove macro-climate offsets. Denormalise for output.
+
+Dataset layout
+--------------
+  root/
+    city_berlin/
+      fine/       0001.npy … (7, 256, 256) float32
+      coarse/     0001.npy … (7,  64,  64) float32
+      labels/     0001.npy … (2, 256, 256) float32
+      masks/      0001.npy … (1, 256, 256) bool      [optional]
+    city_munich/
+      ...
+
+Requirements
+------------
+  torch>=2.0  numpy  (training only)
+  onnxruntime                         (QGIS inference, no torch needed)
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import NamedTuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, Dataset
+
+
+# ---------------------------------------------------------------------------
+# Output container
+# ---------------------------------------------------------------------------
+
+class ModelOutput(NamedTuple):
+    """Typed output tuple — also valid as a plain tuple for ONNX export."""
+    t_air_mean:  torch.Tensor  # (B, 1, H, W)
+    t_air_sigma: torch.Tensor  # (B, 1, H, W)  aleatoric σ in °C
+    pet_mean:    torch.Tensor  # (B, 1, H, W)
+    pet_sigma:   torch.Tensor  # (B, 1, H, W)
+
+
+# ---------------------------------------------------------------------------
+# Building blocks
+# ---------------------------------------------------------------------------
+
+class ConvBlock(nn.Module):
+    """
+    Two Conv(3×3)–BN–ReLU layers with optional spatial dropout.
+
+    Dropout2d zeros entire feature maps rather than individual units.
+    This gives better spatial calibration and is standard for dense
+    prediction tasks.
+    """
+
+    def __init__(self, in_ch: int, out_ch: int, dropout_p: float = 0.0):
+        super().__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(out_ch),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1, bias=False),
+            nn.BatchNorm2d(out_ch),
+            nn.ReLU(inplace=True),
+        )
+        # Dropout2d after activation: drop whole feature maps stochastically.
+        # During MC Dropout inference we re-enable these with enable_dropout().
+        self.drop = nn.Dropout2d(p=dropout_p) if dropout_p > 0.0 else nn.Identity()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.drop(self.conv(x))
+
+
+class UpBlock(nn.Module):
+    """
+    Bilinear upsample (×2) followed by ConvBlock with skip concatenation.
+
+    Bilinear + 1×1 conv is preferred over ConvTranspose2d because it
+    avoids checkerboard artefacts common in urban grid-like rasters.
+    """
+
+    def __init__(
+        self,
+        in_ch: int,
+        skip_ch: int,
+        out_ch: int,
+        dropout_p: float = 0.0,
+    ):
+        super().__init__()
+        self.up = nn.Sequential(
+            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False),
+            nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False),
+        )
+        self.conv = ConvBlock(out_ch + skip_ch, out_ch, dropout_p=dropout_p)
+
+    def forward(self, x: torch.Tensor, skip: torch.Tensor) -> torch.Tensor:
+        x = self.up(x)
+        # Guard against ±1 pixel size mismatch from odd spatial dimensions.
+        if x.shape[-2:] != skip.shape[-2:]:
+            x = F.interpolate(
+                x, size=skip.shape[-2:], mode="bilinear", align_corners=False
+            )
+        return self.conv(torch.cat([x, skip], dim=1))
+
+
+class CoarseEncoder(nn.Module):
+    """
+    Lightweight encoder for the 25 m context patch (64×64 input).
+
+    Produces a 16×16 feature map that matches the bottleneck spatial
+    size so it can be directly concatenated before the bottleneck ConvBlock.
+
+    This implements the KLIMASCANNER principle of "decreasing information
+    density with distance": the coarse branch captures the macro-climate
+    context (e.g. a large park, urban fringe effect) that is outside the
+    fine patch's 1.28 km window but still influences local microclimate.
+    """
+
+    def __init__(self, in_ch: int, out_ch: int = 128):
+        super().__init__()
+        self.net = nn.Sequential(
+            ConvBlock(in_ch, 32),         # 64×64
+            nn.MaxPool2d(2),              # 32×32
+            ConvBlock(32, 64),            # 32×32
+            nn.MaxPool2d(2),              # 16×16
+            ConvBlock(64, out_ch),        # 16×16  ← same spatial as bottleneck
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net(x)
+
+
+# ---------------------------------------------------------------------------
+# Main model
+# ---------------------------------------------------------------------------
+
+class UrbanClimateUNet(nn.Module):
+    """
+    U-Net for urban climate prediction with aleatoric uncertainty.
+
+    Parameters
+    ----------
+    in_ch        : Input channels (same for fine and coarse patches).
+    base_ch      : Feature channels at Enc-1; doubles each encoder level.
+    coarse_ch    : Output channels of CoarseEncoder; fused at bottleneck.
+    dropout_enc  : Dropout2d probability for encoder levels 2 and 3.
+    dropout_bot  : Dropout2d probability for encoder level 4 and bottleneck.
+    dropout_dec  : Dropout2d probability for decoder level 3 (Dec-3).
+    """
+
+    def __init__(
+        self,
+        in_ch:       int   = 7,
+        base_ch:     int   = 64,
+        coarse_ch:   int   = 128,
+        dropout_enc: float = 0.1,
+        dropout_bot: float = 0.2,
+        dropout_dec: float = 0.1,
+    ):
+        super().__init__()
+        c = base_ch  # shorthand: 64
+
+        # ── Encoder ──────────────────────────────────────────────────────────
+        # Spatial dropout is deliberately absent at Enc-1: the first layer
+        # needs all raw input channels to avoid discarding morphological signal
+        # before any abstractions are built.
+        self.enc1 = ConvBlock(in_ch,    c,     dropout_p=0.0)        # 256²
+        self.enc2 = ConvBlock(c,    c * 2, dropout_p=dropout_enc)    # 128²
+        self.enc3 = ConvBlock(c * 2, c * 4, dropout_p=dropout_enc)   #  64²
+        self.enc4 = ConvBlock(c * 4, c * 8, dropout_p=dropout_bot)   #  32²
+        self.pool = nn.MaxPool2d(2)
+
+        # ── Coarse context ───────────────────────────────────────────────────
+        self.coarse_enc = CoarseEncoder(in_ch, coarse_ch)
+
+        # ── Bottleneck ───────────────────────────────────────────────────────
+        # Receives: enc4-pooled (c*8 = 512 ch) + coarse context (128 ch)
+        #           → 640 ch input, 512 ch output
+        self.bottleneck = ConvBlock(
+            c * 8 + coarse_ch, c * 8, dropout_p=dropout_bot
+        )
+
+        # ── Decoder ──────────────────────────────────────────────────────────
+        # UpBlock(in_ch, skip_ch, out_ch):
+        #   in_ch   = channels coming up from the previous decoder level
+        #   skip_ch = channels of the matching encoder skip connection
+        #   out_ch  = channels output by this decoder level
+        self.dec4 = UpBlock(c * 8, c * 8, c * 4)                      # 32²
+        self.dec3 = UpBlock(c * 4, c * 4, c * 2, dropout_p=dropout_dec)  # 64²
+        self.dec2 = UpBlock(c * 2, c * 2, c)                          # 128²
+        self.dec1 = UpBlock(c,     c,     c // 2)                     # 256²
+
+        # ── Output heads (1×1 convolutions) ─────────────────────────────────
+        self.t_air_head    = nn.Conv2d(c // 2, 1, kernel_size=1)
+        self.pet_head      = nn.Conv2d(c // 2, 1, kernel_size=1)
+        # Single head outputs 2 channels: [log_σ_t_air, log_σ_pet]
+        # Using log-σ for numerical stability; softplus converts to σ > 0.
+        self.log_sigma_head = nn.Conv2d(c // 2, 2, kernel_size=1)
+
+        self._init_weights()
+
+    def _init_weights(self) -> None:
+        """He initialisation for Conv layers; zero-init the σ head bias."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+        # Initialise σ head to predict moderate uncertainty (~1 °C) at start.
+        # Without this, early training can be unstable if σ starts near 0.
+        nn.init.zeros_(self.log_sigma_head.bias)
+
+    def forward(self, fine: torch.Tensor, coarse: torch.Tensor) -> ModelOutput:
+        """
+        Parameters
+        ----------
+        fine   : (B, in_ch, 256, 256)  5 m resolution morphology patch
+        coarse : (B, in_ch,  64,  64)  25 m resolution context patch
+
+        Returns
+        -------
+        ModelOutput with fields t_air_mean, t_air_sigma, pet_mean, pet_sigma,
+        each shaped (B, 1, 256, 256).
+        """
+        # ── Encoder ──────────────────────────────────────────────────────────
+        e1 = self.enc1(fine)               # (B,  64, 256, 256)
+        e2 = self.enc2(self.pool(e1))      # (B, 128, 128, 128)
+        e3 = self.enc3(self.pool(e2))      # (B, 256,  64,  64)
+        e4 = self.enc4(self.pool(e3))      # (B, 512,  32,  32)
+
+        # ── Coarse context ───────────────────────────────────────────────────
+        ctx = self.coarse_enc(coarse)      # (B, 128,  16,  16)
+
+        # ── Bottleneck ───────────────────────────────────────────────────────
+        x = self.pool(e4)                  # (B, 512,  16,  16)
+        x = torch.cat([x, ctx], dim=1)    # (B, 640,  16,  16)
+        x = self.bottleneck(x)             # (B, 512,  16,  16)
+
+        # ── Decoder ──────────────────────────────────────────────────────────
+        x = self.dec4(x, e4)   # (B, 256,  32,  32)
+        x = self.dec3(x, e3)   # (B, 128,  64,  64)
+        x = self.dec2(x, e2)   # (B,  64, 128, 128)
+        x = self.dec1(x, e1)   # (B,  32, 256, 256)
+
+        # ── Output heads ─────────────────────────────────────────────────────
+        t_air_mean = self.t_air_head(x)     # (B, 1, 256, 256)
+        pet_mean   = self.pet_head(x)       # (B, 1, 256, 256)
+
+        log_sigma  = self.log_sigma_head(x) # (B, 2, 256, 256)
+        # softplus is smooth and always positive; 1e-4 floor prevents σ → 0.
+        sigma = F.softplus(log_sigma) + 1e-4
+        t_air_sigma = sigma[:, 0:1]         # (B, 1, 256, 256)
+        pet_sigma   = sigma[:, 1:2]         # (B, 1, 256, 256)
+
+        return ModelOutput(t_air_mean, t_air_sigma, pet_mean, pet_sigma)
+
+
+# ---------------------------------------------------------------------------
+# Loss function — Gaussian NLL
+# ---------------------------------------------------------------------------
+
+def gaussian_nll_loss(
+    mean:   torch.Tensor,
+    sigma:  torch.Tensor,
+    target: torch.Tensor,
+    mask:   torch.Tensor | None = None,
+) -> torch.Tensor:
+    """
+    Gaussian negative log-likelihood.
+
+        L = (y − μ)² / (2σ²)  +  ½ log(σ²)
+
+    This is the mechanism by which the model learns σ without any label:
+      • First term: penalises wrong predictions; large σ softens the penalty.
+      • Second term: penalises large σ; prevents the model from always hedging.
+    The model converges to the σ that minimises the sum for each pixel type.
+
+    Parameters
+    ----------
+    mean   : Predicted mean     (B, 1, H, W)
+    sigma  : Predicted sigma    (B, 1, H, W), must be > 0
+    target : FITNAH label       (B, 1, H, W)
+    mask   : Optional valid-pixel mask (B, 1, H, W) bool.
+             Pixels outside the simulation extent should be masked out.
+    """
+    nll = (target - mean) ** 2 / (2.0 * sigma ** 2) + 0.5 * torch.log(sigma ** 2)
+    if mask is not None:
+        nll = nll[mask]
+    return nll.mean()
+
+
+def combined_loss(
+    preds:        ModelOutput,
+    t_air_target: torch.Tensor,
+    pet_target:   torch.Tensor,
+    mask:         torch.Tensor | None = None,
+    t_air_weight: float = 1.0,
+    pet_weight:   float = 1.0,
+) -> torch.Tensor:
+    """Weighted sum of NLL losses for T_air and PET."""
+    l_t = gaussian_nll_loss(preds.t_air_mean, preds.t_air_sigma, t_air_target, mask)
+    l_p = gaussian_nll_loss(preds.pet_mean,   preds.pet_sigma,   pet_target,   mask)
+    return t_air_weight * l_t + pet_weight * l_p
+
+
+# ---------------------------------------------------------------------------
+# MC Dropout inference
+# ---------------------------------------------------------------------------
+
+def enable_dropout(model: nn.Module) -> None:
+    """
+    Enable only Dropout2d layers for MC inference.
+
+    Keeps BatchNorm in eval mode (uses running statistics, not batch stats).
+    This is the correct approach: BN.train() during inference introduces
+    noise from the current batch's statistics, which would contaminate the
+    epistemic uncertainty estimate.
+    """
+    model.eval()
+    for m in model.modules():
+        if isinstance(m, (nn.Dropout, nn.Dropout2d)):
+            m.train()
+
+
+@torch.no_grad()
+def predict_with_uncertainty(
+    model:    UrbanClimateUNet,
+    fine:     torch.Tensor,
+    coarse:   torch.Tensor,
+    n_passes: int = 20,
+    device:   torch.device = torch.device("cpu"),
+) -> dict[str, object]:
+    """
+    Single-sample inference with full uncertainty decomposition.
+
+    Aleatoric σ   = mean of per-pass σ predictions
+                    (model's self-reported irreducible noise)
+    Epistemic σ   = std of per-pass mean predictions
+                    (model's disagreement = out-of-distribution signal)
+    Total σ       = √(σ_al² + σ_ep²)
+
+    The scene-level trust score mirrors the KLIMASCANNER Vertrauenswert:
+      gut            ≥ 80 % of pixels with total σ < 0.5 °C
+      befriedigend   ≥ 50 %
+      ausreichend    < 50 %
+
+    For the MVP (aleatoric only), set n_passes=1 and ignore the
+    epistemic fields — they will be zero.
+
+    Parameters
+    ----------
+    model    : Trained UrbanClimateUNet.
+    fine     : (1, in_ch, 256, 256)
+    coarse   : (1, in_ch,  64,  64)
+    n_passes : Stochastic forward passes. 1 = aleatoric only (fast).
+               20 = full decomposition (recommended for production).
+    """
+    enable_dropout(model)
+    fine   = fine.to(device)
+    coarse = coarse.to(device)
+
+    t_means, t_sigmas, p_means, p_sigmas = [], [], [], []
+    for _ in range(n_passes):
+        out = model(fine, coarse)
+        t_means.append(out.t_air_mean)
+        t_sigmas.append(out.t_air_sigma)
+        p_means.append(out.pet_mean)
+        p_sigmas.append(out.pet_sigma)
+
+    def decompose(
+        means: list[torch.Tensor], sigmas: list[torch.Tensor]
+    ) -> tuple[torch.Tensor, ...]:
+        m  = torch.stack(means)                      # (n_passes, B, 1, H, W)
+        s  = torch.stack(sigmas)
+        mu       = m.mean(0)                         # mean prediction
+        sigma_al = s.mean(0)                         # aleatoric: mean of σ
+        sigma_ep = m.std(0) if n_passes > 1 else torch.zeros_like(mu)
+        sigma_tot = (sigma_al ** 2 + sigma_ep ** 2).sqrt()
+        return mu, sigma_al, sigma_ep, sigma_tot
+
+    t_mu, t_al, t_ep, t_tot = decompose(t_means, t_sigmas)
+    p_mu, p_al, p_ep, p_tot = decompose(p_means, p_sigmas)
+
+    pct_good = (t_tot < 0.5).float().mean().item()
+    trust = (
+        "gut"          if pct_good >= 0.8 else
+        "befriedigend" if pct_good >= 0.5 else
+        "ausreichend"
+    )
+
+    model.eval()
+    return {
+        "t_air_mean":             t_mu,
+        "t_air_sigma_aleatoric":  t_al,
+        "t_air_sigma_epistemic":  t_ep,
+        "t_air_sigma_total":      t_tot,
+        "pet_mean":               p_mu,
+        "pet_sigma_aleatoric":    p_al,
+        "pet_sigma_epistemic":    p_ep,
+        "pet_sigma_total":        p_tot,
+        "trust_score":            trust,           # "gut" / "befriedigend" / "ausreichend"
+        "pct_pixels_good":        pct_good,        # fraction of pixels with σ < 0.5 °C
+    }
+
+
+# ---------------------------------------------------------------------------
+# Dataset
+# ---------------------------------------------------------------------------
+
+class UrbanClimateDataset(Dataset):
+    """
+    Loads pre-rasterised patch triplets (fine, coarse, labels) from disk.
+
+    All inputs must be normalised before saving (z-score per channel per city).
+    Labels must be city-normalised (subtract city mean T/PET, divide by std)
+    so the model learns local spatial patterns, not absolute temperatures.
+
+    See dataset layout in module docstring.
+    """
+
+    def __init__(self, root: str | Path, augment: bool = True):
+        self.augment = augment
+        self.samples: list[tuple[Path, Path, Path, Path | None]] = []
+
+        for city_dir in sorted(Path(root).iterdir()):
+            if not city_dir.is_dir():
+                continue
+            fine_dir   = city_dir / "fine"
+            coarse_dir = city_dir / "coarse"
+            label_dir  = city_dir / "labels"
+            mask_dir   = city_dir / "masks"
+
+            for fine_p in sorted(fine_dir.glob("*.npy")):
+                stem     = fine_p.stem
+                coarse_p = coarse_dir / f"{stem}.npy"
+                label_p  = label_dir  / f"{stem}.npy"
+                mask_p   = (mask_dir / f"{stem}.npy") if mask_dir.exists() else None
+                if coarse_p.exists() and label_p.exists():
+                    self.samples.append((fine_p, coarse_p, label_p, mask_p))
+
+        if not self.samples:
+            raise FileNotFoundError(f"No samples found under {root}")
+
+    def __len__(self) -> int:
+        return len(self.samples)
+
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        fine_p, coarse_p, label_p, mask_p = self.samples[idx]
+
+        fine   = torch.from_numpy(np.load(fine_p)).float()    # (7, 256, 256)
+        coarse = torch.from_numpy(np.load(coarse_p)).float()  # (7,  64,  64)
+        labels = torch.from_numpy(np.load(label_p)).float()   # (2, 256, 256)
+        mask   = (
+            torch.from_numpy(np.load(mask_p)).bool()
+            if mask_p and mask_p.exists()
+            else torch.ones(1, fine.shape[-2], fine.shape[-1], dtype=torch.bool)
+        )
+
+        if self.augment:
+            fine, coarse, labels, mask = self._random_rotate_flip(
+                fine, coarse, labels, mask
+            )
+
+        return {
+            "fine":   fine,
+            "coarse": coarse,
+            "t_air":  labels[0:1],   # (1, 256, 256)
+            "pet":    labels[1:2],   # (1, 256, 256)
+            "mask":   mask,          # (1, 256, 256) bool
+        }
+
+    @staticmethod
+    def _random_rotate_flip(
+        fine:   torch.Tensor,
+        coarse: torch.Tensor,
+        labels: torch.Tensor,
+        mask:   torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Random 90° rotation + horizontal flip.
+
+        Note: wind direction changes with rotation. This is intentional —
+        the model should learn morphological patterns, not wind-relative
+        ones (wind direction is a runtime input, not a training feature).
+        """
+        k = int(torch.randint(4, ()).item())
+        dims = (-2, -1)
+        fine, coarse, labels, mask = (
+            torch.rot90(t, k, dims) for t in (fine, coarse, labels, mask)
+        )
+        if torch.rand(()).item() > 0.5:
+            fine, coarse, labels, mask = (
+                t.flip(-1) for t in (fine, coarse, labels, mask)
+            )
+        return fine, coarse, labels, mask
+
+
+# ---------------------------------------------------------------------------
+# Training
+# ---------------------------------------------------------------------------
+
+def train_one_epoch(
+    model:     UrbanClimateUNet,
+    loader:    DataLoader,
+    optimizer: torch.optim.Optimizer,
+    device:    torch.device,
+    scaler:    torch.cuda.amp.GradScaler | None = None,
+) -> float:
+    model.train()
+    total = 0.0
+
+    for batch in loader:
+        fine   = batch["fine"].to(device)
+        coarse = batch["coarse"].to(device)
+        t_air  = batch["t_air"].to(device)
+        pet    = batch["pet"].to(device)
+        mask   = batch["mask"].to(device)
+
+        optimizer.zero_grad()
+
+        if scaler is not None:
+            with torch.autocast("cuda"):
+                preds = model(fine, coarse)
+                loss  = combined_loss(preds, t_air, pet, mask)
+            scaler.scale(loss).backward()
+            scaler.unscale_(optimizer)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+            scaler.step(optimizer)
+            scaler.update()
+        else:
+            preds = model(fine, coarse)
+            loss  = combined_loss(preds, t_air, pet, mask)
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+            optimizer.step()
+
+        total += loss.item()
+
+    return total / len(loader)
+
+
+def train(
+    data_root:      str,
+    epochs:         int   = 100,
+    batch_size:     int   = 8,
+    lr:             float = 1e-4,
+    weight_decay:   float = 1e-4,
+    checkpoint_dir: str   = "checkpoints",
+    device_str:     str   = "cuda" if torch.cuda.is_available() else "cpu",
+) -> UrbanClimateUNet:
+    """
+    Full training loop with cosine LR schedule and mixed-precision.
+
+    Recommended hyperparameters for MVP (10–15 training cities):
+      epochs=150, batch_size=8, lr=1e-4
+
+    For fine-tuning on a single new city:
+      epochs=50, batch_size=4, lr=3e-5 (load pretrained weights first)
+    """
+    device = torch.device(device_str)
+    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
+
+    dataset = UrbanClimateDataset(data_root, augment=True)
+    loader  = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=True,
+        num_workers=min(4, batch_size),
+        pin_memory=(device.type == "cuda"),
+        persistent_workers=True,
+    )
+
+    model     = UrbanClimateUNet().to(device)
+    optimizer = torch.optim.AdamW(
+        model.parameters(), lr=lr, weight_decay=weight_decay
+    )
+    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+        optimizer, T_max=epochs, eta_min=lr * 0.01
+    )
+    scaler = torch.cuda.amp.GradScaler() if device.type == "cuda" else None
+
+    n_params = sum(p.numel() for p in model.parameters())
+    print(f"Model  : {n_params:,} parameters")
+    print(f"Dataset: {len(dataset):,} patches")
+    print(f"Device : {device}")
+
+    for epoch in range(1, epochs + 1):
+        loss = train_one_epoch(model, loader, optimizer, device, scaler)
+        scheduler.step()
+
+        if epoch % 10 == 0 or epoch == epochs:
+            ckpt = Path(checkpoint_dir) / f"epoch_{epoch:04d}_loss{loss:.4f}.pt"
+            torch.save(
+                {
+                    "epoch":           epoch,
+                    "model_state":     model.state_dict(),
+                    "optimizer_state": optimizer.state_dict(),
+                    "loss":            loss,
+                    "hparams": {
+                        "epochs": epochs,
+                        "batch_size": batch_size,
+                        "lr": lr,
+                    },
+                },
+                ckpt,
+            )
+            print(f"Epoch {epoch:4d}/{epochs}  loss {loss:.4f}  → {ckpt.name}")
+
+    return model
+
+
+# ---------------------------------------------------------------------------
+# ONNX export for QGIS plugin deployment
+# ---------------------------------------------------------------------------
+
+class _OnnxWrapper(nn.Module):
+    """Thin wrapper so torch.onnx.export receives a plain tuple output."""
+
+    def __init__(self, model: UrbanClimateUNet):
+        super().__init__()
+        self.model = model
+
+    def forward(
+        self, fine: torch.Tensor, coarse: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        out = self.model(fine, coarse)
+        return out.t_air_mean, out.t_air_sigma, out.pet_mean, out.pet_sigma
+
+
+def export_onnx(
+    model:       UrbanClimateUNet,
+    output_path: str = "urban_climate_unet.onnx",
+    in_ch:       int = 7,
+    opset:       int = 17,
+) -> None:
+    """
+    Export trained model to ONNX for deployment in the QGIS plugin.
+
+    The plugin loads the .onnx file with onnxruntime (pure Python wheel,
+    ~15 MB, no PyTorch dependency at inference time).
+
+    After export, verify with:
+        import onnxruntime as ort
+        sess = ort.InferenceSession("urban_climate_unet.onnx")
+        print([o.name for o in sess.get_outputs()])
+    """
+    model.eval()
+    wrapper = _OnnxWrapper(model)
+
+    dummy_fine   = torch.zeros(1, in_ch, 256, 256)
+    dummy_coarse = torch.zeros(1, in_ch,  64,  64)
+
+    torch.onnx.export(
+        wrapper,
+        (dummy_fine, dummy_coarse),
+        output_path,
+        input_names  = ["fine_patch", "coarse_patch"],
+        output_names = ["t_air_mean", "t_air_sigma", "pet_mean", "pet_sigma"],
+        dynamic_axes = {
+            "fine_patch":   {0: "batch"},
+            "coarse_patch": {0: "batch"},
+        },
+        opset_version     = opset,
+        do_constant_folding = True,
+    )
+    print(f"ONNX model exported → {output_path}")
+    print("Load in QGIS plugin with:  ort.InferenceSession('urban_climate_unet.onnx')")
+
+
+# ---------------------------------------------------------------------------
+# Quick sanity check
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    torch.manual_seed(42)
+
+    model = UrbanClimateUNet()
+    model.eval()
+
+    B = 2
+    fine   = torch.randn(B, 7, 256, 256)
+    coarse = torch.randn(B, 7,  64,  64)
+
+    # ── Forward pass ──────────────────────────────────────────────────────
+    out = model(fine, coarse)
+    print("=== Forward pass (batch=2) ===")
+    for name, tensor in zip(out._fields, out):
+        print(f"  {name:<18} {tuple(tensor.shape)}")
+
+    # ── Loss ──────────────────────────────────────────────────────────────
+    t_tgt = torch.randn(B, 1, 256, 256)
+    p_tgt = torch.randn(B, 1, 256, 256)
+    loss  = combined_loss(out, t_tgt, p_tgt)
+    print(f"\nGaussian NLL loss : {loss.item():.4f}")
+
+    # ── Parameter count ───────────────────────────────────────────────────
+    n = sum(p.numel() for p in model.parameters())
+    print(f"Parameters        : {n:,}")
+
+    # ── MC Dropout uncertainty (fast, 5 passes) ───────────────────────────
+    result = predict_with_uncertainty(
+        model, fine[0:1], coarse[0:1], n_passes=5
+    )
+    print(f"\n=== MC Dropout (5 passes) ===")
+    print(f"  Trust score      : {result['trust_score']}")
+    print(f"  Pct pixels good  : {result['pct_pixels_good']:.1%}")
+    print(f"  T_air σ aleat.   : {result['t_air_sigma_aleatoric'].mean():.3f} °C")
+    print(f"  T_air σ epist.   : {result['t_air_sigma_epistemic'].mean():.3f} °C")
+    print(f"  T_air σ total    : {result['t_air_sigma_total'].mean():.3f} °C")
+
+    # ── ONNX export (requires onnx package) ───────────────────────────────
+    try:
+        export_onnx(model, "urban_climate_unet.onnx")
+    except Exception as e:
+        print(f"\nONNX export skipped ({e})")
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -0,0 +1,2 @@
+torch
+numpy