1.2.1.1-beta: notification + LXC + post-install fixes

- flask_notification_routes: PVE webhook X-Webhook-Secret written in standard base64 so PVE can decode it (GH #198)
- notification_channels: Gmail SMTP App Password handling — normalize tls_mode (None/empty → starttls), reject creds without host (false-positive sendmail delivery), surface "AUTH not advertised" hint
- notification_events: is_vzdump_active_on_host() reads /var/log/pve/tasks/active directly so backup_start fallback and vm_shutdown suppression survive a Monitor restart mid-backup
- notification_templates: extract --storage flag from vzdump log → "PBS-Cloud: vm/104/…" instead of generic "PBS:" prefix when multiple PBS endpoints exist
- health_monitor: pve_storage_capacity + zfs_pool_capacity respect per-item dismiss (don't keep category WARNING/CRITICAL after user dismisses); updates_check cache invalidated when /var/log/apt/history.log mtime advances
- lxc_mount_points: PVE volume size from subvol quota (df via /proc/<host_pid>/root/<target> + lxc.conf size=NNNG fallback); host_source_state detects "host detached" zombie binds; per-mount subprocess work parallelised via ThreadPoolExecutor so a CT with many bind mounts doesn't trip the Caddy 3s reverse-proxy timeout
- virtual-machines: "host detached" badge on bind mounts whose host source path disappeared
- auto/customizable_post_install: log2ram FUNC_VERSION 1.1 → 1.2; new log2ram-check.sh vacuums journal + truncates non-rotating logs (pveproxy/access.log, pveam.log) instead of only calling `log2ram write` (which leaves the tmpfs full); auto flow gains the missing SystemMaxUse in /etc/systemd/journald.conf

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 07:45:01 +00:00 · 2026-05-19 00:06:49 +02:00
parent 81844fa456
commit 6eb1312c61
11 changed files with 548 additions and 92 deletions
@@ -1 +1 @@
-6249ae8d51e0d7dbd3035ba49f4244ff035c2c6d97d5c55f69ab0dac6a4ea021  ProxMenux-1.2.1.1-beta.AppImage
+70a510025df81652319d16e0d36e77bea95a965163608232e9aca60ada9c9fbf  ProxMenux-1.2.1.1-beta.AppImage
@@ -170,6 +170,12 @@ interface LxcMountPoint {
  runtime_readonly?: boolean
  runtime_reachable?: boolean
  runtime_error?: string | null
+  // Sprint 14.x: host-side bind source state. Detects the case where the
+  // CT still reports a bind as mounted even though the host already
+  // umounted the source (Ignacio Seijo 11/05). Null = N/A (PVE volume,
+  // not a host path).
+  host_source_exists?: boolean | null
+  host_source_is_mountpoint?: boolean | null
 }

 const fetcher = async (url: string) => {
@@ -321,9 +327,18 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) {
  const isStale = mp.runtime_reachable === false
  const isReadonly = !isStale && mp.runtime_readonly === true
  const isDivergent = mp.runtime_mounted === false  // configured but not actually mounted
+  // "Zombie bind": the host removed the source (e.g. USB pulled, manual
+  // umount) but the CT mount namespace still shows the bind as mounted.
+  // Reported by Ignacio Seijo (11/05). Only flag host_bind /
+  // pve_storage_bind sources — PVE volume sources have no host path
+  // and `host_source_exists` comes back null for them.
+  const isHostDetached =
+    mp.runtime_mounted === true &&
+    (mp.type === "host_bind" || mp.type === "pve_storage_bind") &&
+    mp.host_source_exists === false
  const cardClasses = isStale
    ? "border-red-500/50 bg-red-500/5"
-    : isDivergent
+    : isDivergent || isHostDetached
      ? "border-amber-500/40 bg-amber-500/5"
      : isReadonly
        ? "border-amber-500/30 bg-amber-500/5"
@@ -395,7 +410,7 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) {
          className={
            isStale
              ? "bg-red-500/10 text-red-500 border-red-500/20"
-              : isDivergent
+              : isDivergent || isHostDetached
                ? "bg-amber-500/10 text-amber-500 border-amber-500/20"
                : isReadonly
                  ? "bg-amber-500/10 text-amber-500 border-amber-500/20"
@@ -408,11 +423,13 @@ function MountPointCard({ mp }: { mp: LxcMountPoint }) {
            ? "stale"
            : isDivergent
              ? "not mounted"
-              : isReadonly
-                ? "read-only"
-                : mp.runtime_mounted === null
-                  ? "stopped"
-                  : "mounted"}
+              : isHostDetached
+                ? "host detached"
+                : isReadonly
+                  ? "read-only"
+                  : mp.runtime_mounted === null
+                    ? "stopped"
+                    : "mounted"}
        </Badge>
      </div>

@@ -191,6 +191,24 @@ def _bad_request(msg: str):
    return jsonify({'error': msg}), 400


+def _is_loopback_addr(value: str) -> bool:
+    """Return True for IPv4, IPv6 and IPv4-mapped loopback addresses.
+
+    When Flask is bound to ``::`` for dual-stack support, an HTTP request
+    sent to ``127.0.0.1`` can be reported as ``::ffff:127.0.0.1``. Treat it
+    as local so the PVE webhook keeps the intended localhost trust path.
+    """
+    try:
+        import ipaddress
+        addr = ipaddress.ip_address(value)
+        if addr.is_loopback:
+            return True
+        ipv4_mapped = getattr(addr, 'ipv4_mapped', None)
+        return bool(ipv4_mapped and ipv4_mapped.is_loopback)
+    except ValueError:
+        return value == 'localhost'
+
+
 def _validate_event_type(value: str) -> bool:
    return isinstance(value, str) and bool(_EVENT_TYPE_RE.match(value))

@@ -983,9 +1001,15 @@ def setup_pve_webhook_core() -> dict:
        # endpoint depends entirely on the localhost-bypass and any move
        # to a non-loopback bind silently breaks auth. Audit Tier 3.1 —
        # `setup_pve_webhook_core` does not write the secret to the priv cfg.
+        #
+        # PVE stores `secret value=` in STANDARD base64 and decodes it
+        # before emitting the header. Writing the raw token here triggered
+        # `could not decode UTF8 string from base64, key 'X-Webhook-Secret' (500)`
+        # whenever `token_urlsafe` produced `-` or `_` chars (GH #198).
+        secret_b64 = base64.b64encode(secret.encode()).decode()
        priv_block = (
            f"webhook: {_PVE_ENDPOINT_ID}\n"
-            f"        secret name=X-Webhook-Secret,value={secret}\n"
+            f"        secret name=X-Webhook-Secret,value={secret_b64}\n"
        )
        
        if priv_text is not None:
@@ -1225,7 +1249,7 @@ def proxmox_webhook():
    _reject = lambda code, error, status: (jsonify({'accepted': False, 'error': error}), status)

    client_ip = request.remote_addr or ''
-    is_localhost = client_ip in ('127.0.0.1', '::1')
+    is_localhost = _is_loopback_addr(client_ip)

    # CSRF defence-in-depth: reject `application/x-www-form-urlencoded`
    # bodies. PVE always sends `application/json`; form-encoded bodies
@@ -4197,22 +4197,37 @@ class HealthMonitor:
        """
        cache_key = 'updates_check'
        current_time = time.time()
-        
-        # Cache for 10 minutes
-        if cache_key in self.last_check_times:
-            if current_time - self.last_check_times[cache_key] < 600:
-                return self.cached_results.get(cache_key)
-        
+        apt_history_path = '/var/log/apt/history.log'
+
+        # Detect a manual `apt install/upgrade` since the last check by
+        # comparing /var/log/apt/history.log's mtime against the cache
+        # timestamp. apt appends to this file on every transaction, so a
+        # newer mtime means the local package state changed and the cached
+        # pending-updates list is stale. Reported by Alberto (14/5): the
+        # dashboard tile kept showing pending updates ~hours after he ran
+        # `apt upgrade` manually. Cheap stat call; runs at most once per
+        # /api/health/full request.
+        history_mtime = None
+        try:
+            if os.path.exists(apt_history_path):
+                history_mtime = os.path.getmtime(apt_history_path)
+        except Exception:
+            history_mtime = None
+
+        if cache_key in self.last_check_times:
+            cache_ts = self.last_check_times[cache_key]
+            history_changed = (history_mtime is not None and history_mtime > cache_ts)
+            if not history_changed and current_time - cache_ts < 600:
+                return self.cached_results.get(cache_key)
+
        try:
-            apt_history_path = '/var/log/apt/history.log'
            last_update_days = None
            sec_result = None
            age_result = None
-            
-            if os.path.exists(apt_history_path):
+
+            if history_mtime is not None:
                try:
-                    mtime = os.path.getmtime(apt_history_path)
-                    days_since_update = (current_time - mtime) / 86400
+                    days_since_update = (current_time - history_mtime) / 86400
                    last_update_days = int(days_since_update)
                except Exception:
                    pass
@@ -5775,12 +5790,24 @@ class HealthMonitor:
                'used_bytes': used,
            }
            error_key = f'pve_storage_full_{name}'
+            # If the user already dismissed this exact error (within the
+            # suppression window), don't count it toward the category
+            # severity badge. Without this guard the storage section stayed
+            # WARNING/CRITICAL forever even after dismiss because the
+            # underlying % is unchanged — `record_error` correctly returned
+            # `skipped_acknowledged` to silence the notification side, but
+            # the dashboard counter ignored that signal and the user saw
+            # "Storage: 1 Warning" with no way to clear it. Reported on
+            # the community channel re: PBS-lleno (17-18/05).
+            is_dismissed = health_persistence.is_error_acknowledged(error_key)
            if pct >= crit_pct:
-                entry['status'] = 'CRITICAL'
+                entry['status'] = 'CRITICAL' if not is_dismissed else 'INFO'
                entry['error_key'] = error_key
                entry['dismissable'] = True
+                entry['dismissed'] = is_dismissed
                checks[label] = entry
-                critical_labels.append(label)
+                if not is_dismissed:
+                    critical_labels.append(label)
                emitted_keys.add(error_key)
                health_persistence.record_error(
                    error_key=error_key,
@@ -5790,11 +5817,13 @@ class HealthMonitor:
                    details=entry,
                )
            elif pct >= warn_pct:
-                entry['status'] = 'WARNING'
+                entry['status'] = 'WARNING' if not is_dismissed else 'INFO'
                entry['error_key'] = error_key
                entry['dismissable'] = True
+                entry['dismissed'] = is_dismissed
                checks[label] = entry
-                warning_labels.append(label)
+                if not is_dismissed:
+                    warning_labels.append(label)
                emitted_keys.add(error_key)
                health_persistence.record_error(
                    error_key=error_key,
@@ -5877,12 +5906,18 @@ class HealthMonitor:
                'pool_name': name,
            }
            error_key = f'zfs_pool_full_{name}'
+            # Same dismiss-respect as `_check_pve_storage_capacity`. A pool
+            # that the user dismissed keeps its underlying % but should no
+            # longer flip the category badge to WARNING/CRITICAL.
+            is_dismissed = health_persistence.is_error_acknowledged(error_key)
            if pct >= crit_pct:
-                entry['status'] = 'CRITICAL'
+                entry['status'] = 'CRITICAL' if not is_dismissed else 'INFO'
                entry['error_key'] = error_key
                entry['dismissable'] = True
+                entry['dismissed'] = is_dismissed
                checks[name] = entry
-                critical_labels.append(name)
+                if not is_dismissed:
+                    critical_labels.append(name)
                emitted_keys.add(error_key)
                health_persistence.record_error(
                    error_key=error_key,
@@ -5892,11 +5927,13 @@ class HealthMonitor:
                    details=entry,
                )
            elif pct >= warn_pct:
-                entry['status'] = 'WARNING'
+                entry['status'] = 'WARNING' if not is_dismissed else 'INFO'
                entry['error_key'] = error_key
                entry['dismissable'] = True
+                entry['dismissed'] = is_dismissed
                checks[name] = entry
-                warning_labels.append(name)
+                if not is_dismissed:
+                    warning_labels.append(name)
                emitted_keys.add(error_key)
                health_persistence.record_error(
                    error_key=error_key,
@@ -231,17 +231,134 @@ def _df_path(path: str) -> dict[str, Optional[int]]:
        return empty


+_SIZE_UNIT_TO_BYTES = {
+    "": 1, "B": 1,
+    "K": 1024, "KB": 1024, "KIB": 1024,
+    "M": 1024 ** 2, "MB": 1024 ** 2, "MIB": 1024 ** 2,
+    "G": 1024 ** 3, "GB": 1024 ** 3, "GIB": 1024 ** 3,
+    "T": 1024 ** 4, "TB": 1024 ** 4, "TIB": 1024 ** 4,
+}
+
+
+def _parse_pve_size(value: str) -> Optional[int]:
+    """Convert PVE-style sizes (``150G``, ``32M``, ``2T``) to bytes.
+
+    PVE stores volume sizes in lxc.conf as ``size=<num><unit>`` where
+    unit is a single letter from {K,M,G,T} (powers of 1024). Returns
+    None for empty/unparseable input — callers fall through to
+    pvesm-based totals.
+    """
+    if value is None:
+        return None
+    s = str(value).strip().upper()
+    if not s:
+        return None
+    m = re.match(r"^(\d+(?:\.\d+)?)\s*([KMGT]?I?B?)$", s)
+    if not m:
+        return None
+    try:
+        magnitude = float(m.group(1))
+    except ValueError:
+        return None
+    unit = m.group(2) or ""
+    multiplier = _SIZE_UNIT_TO_BYTES.get(unit)
+    if multiplier is None:
+        return None
+    return int(magnitude * multiplier)
+
+
+def _df_via_host_pid(host_pid: str, ct_target: str) -> dict[str, Optional[int]]:
+    """``df`` the CT-internal path via ``/proc/<pid>/root`` so we get
+    the filesystem as the container sees it, including ZFS dataset
+    quotas. Used for ``pve_volume`` mounts whose ``pvesm status``
+    numbers reflect the whole storage pool instead of the per-subvol
+    quota — without this the UI showed 851 GB total for a 150 GB ZFS
+    subvol because pvesm reports the rpool's free space.
+    """
+    empty = {"total_bytes": None, "used_bytes": None, "available_bytes": None}
+    if not host_pid or not ct_target:
+        return empty
+    full = f"/proc/{host_pid}/root{ct_target}"
+    try:
+        proc = subprocess.run(
+            ["df", "-B1", "--output=size,used,avail", full],
+            capture_output=True, text=True, timeout=_STAT_TIMEOUT,
+        )
+        if proc.returncode != 0:
+            return empty
+        lines = [ln for ln in proc.stdout.strip().splitlines() if ln.strip()]
+        if len(lines) < 2:
+            return empty
+        parts = lines[-1].split()
+        if len(parts) < 3:
+            return empty
+        return {
+            "total_bytes": int(parts[0]),
+            "used_bytes": int(parts[1]),
+            "available_bytes": int(parts[2]),
+        }
+    except (subprocess.TimeoutExpired, OSError, ValueError):
+        return empty
+
+
 def _capacity_for(source: str, classification: dict[str, Any],
-                  pve_storages: dict[str, dict[str, Any]]) -> dict[str, Optional[int]]:
+                  pve_storages: dict[str, dict[str, Any]],
+                  config_options: Optional[dict[str, Any]] = None,
+                  host_pid: str = "",
+                  target: str = "") -> dict[str, Optional[int]]:
    """Return total/used/available bytes for the *source* of a mount.

-    ``pve_volume`` and ``pve_storage_bind`` reuse the numbers from
-    ``pvesm status`` (already loaded once). ``host_bind`` falls back to
-    ``df`` of the host path. None values mean the lookup didn't
-    succeed and the UI will render n/a.
+    ``pve_volume`` quota handling (Sprint 14.x — Ignacio Seijo 10/05):
+      A ``mp6: local-zfs:subvol-310-disk-1,size=150G,...`` line carved
+      out a 150 GB subvol from a 1 TB pool. The previous code read
+      ``pvesm status local-zfs`` and reported 851 GB total / 19% used —
+      reflecting the whole pool, not the subvol. We now prefer, in
+      order:
+        1) ``df`` of ``/proc/<host_pid>/root/<target>`` when the CT is
+           up — gives the correct view-from-inside numbers including
+           the quota.
+        2) ``size=<N>`` from lxc.conf as the total; usage is unknown
+           when the CT isn't running, so the UI shows total only.
+        3) Fallback to ``pvesm status`` (pool numbers) when the entry
+           has no declared size — that's the legacy behaviour for
+           sizeless block volumes (lvm raw, rbd).
+
+    ``pve_storage_bind`` mounts (NFS, CIFS at ``/mnt/pve/...``) keep
+    the pvesm-based numbers because the storage IS the source of truth
+    for those.
+
+    ``host_bind`` falls back to ``df`` of the host path. None values
+    mean the lookup didn't succeed and the UI will render n/a.
    """
    ctype = classification.get("type")
-    if ctype in ("pve_volume", "pve_storage_bind"):
+    config_options = config_options or {}
+    declared_size_bytes = _parse_pve_size(config_options.get("size"))
+
+    if ctype == "pve_volume":
+        # 1) Live numbers from inside the CT (respects quota).
+        if host_pid and target:
+            live = _df_via_host_pid(host_pid, target)
+            if live.get("total_bytes") is not None:
+                return live
+        # 2) CT down (or df failed): expose declared quota as total.
+        if declared_size_bytes is not None:
+            return {
+                "total_bytes": declared_size_bytes,
+                "used_bytes": None,
+                "available_bytes": None,
+            }
+        # 3) No quota declared: legacy pool-level numbers.
+        sid = classification.get("origin_storage", "")
+        st = pve_storages.get(sid)
+        if not st:
+            return {"total_bytes": None, "used_bytes": None, "available_bytes": None}
+        return {
+            "total_bytes": st["total_kib"] * 1024 if st.get("total_kib") is not None else None,
+            "used_bytes": st["used_kib"] * 1024 if st.get("used_kib") is not None else None,
+            "available_bytes": st["avail_kib"] * 1024 if st.get("avail_kib") is not None else None,
+        }
+
+    if ctype == "pve_storage_bind":
        sid = classification.get("origin_storage", "")
        st = pve_storages.get(sid)
        if not st:
@@ -312,6 +429,45 @@ def _read_ct_proc_mounts(host_pid: str) -> list[dict[str, Any]]:
    return out


+def _host_source_state(source: str) -> dict[str, Any]:
+    """Inspect a host-side bind source to detect 'zombie' binds.
+
+    Reported by Ignacio Seijo (11/05): when the host unmounted
+    ``/mnt/nas1_con_backup`` the CT kept reporting it as ``mounted``
+    because the bind into the CT's mount namespace was still live —
+    the kernel doesn't propagate the host-side umount to the child
+    namespace. The CT's view becomes a frozen snapshot of whatever
+    was under the path at bind time (usually an empty dir).
+
+    Returns ``{exists, is_mountpoint, error}``. ``exists=False`` means
+    the source path is gone entirely (e.g. a USB drive that was
+    physically removed). ``is_mountpoint=False`` while ``exists=True``
+    means the path is still there but nothing is mounted on it.
+
+    Only meaningful for absolute host paths. Storage-id sources
+    (``local-zfs:subvol-...``) return all three keys as ``None`` since
+    there is no host path to inspect.
+    """
+    empty = {"exists": None, "is_mountpoint": None, "error": None}
+    if not source or not source.startswith("/"):
+        return empty
+    try:
+        st_exists = os.path.exists(source)
+    except OSError as e:
+        return {"exists": None, "is_mountpoint": None, "error": str(e)}
+    if not st_exists:
+        return {"exists": False, "is_mountpoint": False, "error": "path missing"}
+    try:
+        proc = subprocess.run(
+            ["mountpoint", "-q", source],
+            capture_output=True, text=True, timeout=_STAT_TIMEOUT,
+        )
+        is_mp = (proc.returncode == 0)
+        return {"exists": True, "is_mountpoint": is_mp, "error": None}
+    except (subprocess.TimeoutExpired, OSError) as e:
+        return {"exists": True, "is_mountpoint": None, "error": str(e)}
+
+
 def _stat_via_host(host_pid: str, ct_target: str,
                   timeout: int = _STAT_TIMEOUT) -> dict[str, Any]:
    """Stat the container-internal target through /proc/<pid>/root —
@@ -366,11 +522,37 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
    out: list[dict[str, Any]] = []
    matched_targets: set[str] = set()

-    for entry in config_entries:
+    # Pre-compute per-entry subprocess work in parallel so a CT with
+    # many mountpoints doesn't pay N×(_STAT_TIMEOUT + _STAT_TIMEOUT)
+    # serialised cost. The previous serial path tripped Caddy's 3s
+    # reverse-proxy timeout (Ignacio Seijo 11/05: "/api/lxc/210/
+    # mount-points → 502 (3.00s)") on hosts with 5+ binds. ThreadPool
+    # is the right primitive — these are all I/O-bound `df`/`stat`
+    # calls hitting independent paths.
+    from concurrent.futures import ThreadPoolExecutor
+
+    def _gather_one(entry):
+        src = entry.get("source", "")
+        tgt = entry.get("target", "")
+        classification = _classify(src, pve_storages)
+        capacity = _capacity_for(
+            src, classification, pve_storages,
+            config_options=entry.get("config_options", {}),
+            host_pid=host_pid if running else "",
+            target=tgt,
+        )
+        host_src = _host_source_state(src)
+        live_target = bool(running and tgt and tgt in rt_by_target)
+        health = _stat_via_host(host_pid, tgt) if live_target else None
+        return entry, classification, capacity, host_src, live_target, health
+
+    max_workers = max(2, min(8, len(config_entries) or 1))
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        gathered = list(pool.map(_gather_one, config_entries))
+
+    for entry, cls, cap, host_src, live_target, health in gathered:
        source = entry.get("source", "")
        target = entry.get("target", "")
-        cls = _classify(source, pve_storages)
-        cap = _capacity_for(source, cls, pve_storages)

        item: dict[str, Any] = {
            "mp_index": entry.get("mp_index", ""),
@@ -382,13 +564,14 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
            "origin_label": cls.get("origin_label", source),
            "config_options": entry.get("config_options", {}),
            "config_flags": entry.get("config_flags", []),
+            "host_source_exists": host_src["exists"],
+            "host_source_is_mountpoint": host_src["is_mountpoint"],
            **cap,
        }

        # Runtime enrichment when CT is up.
-        if running and target and target in rt_by_target:
+        if live_target:
            rt = rt_by_target[target]
-            health = _stat_via_host(host_pid, target)
            item.update({
                "runtime_mounted": True,
                "runtime_source": rt["rt_source"],
@@ -416,34 +599,42 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
    # original Sprint 13.24 issue revolves around catching them.
    ad_hoc: list[dict[str, Any]] = []
    if running:
-        for rt in rt_mounts:
-            target = rt["rt_target"]
-            if target in matched_targets:
-                continue
-            if not _REMOTE_FS_RE.match(rt["rt_fstype"]):
-                continue
-            health = _stat_via_host(host_pid, target)
-            ad_hoc.append({
-                "mp_index": "",
-                "source": rt["rt_source"],
-                "target": target,
-                "type": "ad_hoc",
-                "origin_storage": "",
-                "origin_storage_type": "",
-                "origin_label": rt["rt_source"],
-                "config_options": {},
-                "config_flags": [],
-                "total_bytes": None,
-                "used_bytes": None,
-                "available_bytes": None,
-                "runtime_mounted": True,
-                "runtime_source": rt["rt_source"],
-                "runtime_fstype": rt["rt_fstype"],
-                "runtime_options": rt["rt_options"],
-                "runtime_readonly": rt["rt_readonly"],
-                "runtime_reachable": health["reachable"],
-                "runtime_error": health["error"],
-            })
+        ad_hoc_candidates = [
+            rt for rt in rt_mounts
+            if rt["rt_target"] not in matched_targets
+            and _REMOTE_FS_RE.match(rt["rt_fstype"])
+        ]
+        # Same parallelisation as the configured-mp loop: stat'ing
+        # stale NFS exports serially can dominate the request and
+        # push it past the proxy timeout.
+        if ad_hoc_candidates:
+            with ThreadPoolExecutor(max_workers=max_workers) as pool:
+                healths = list(pool.map(
+                    lambda rt: _stat_via_host(host_pid, rt["rt_target"]),
+                    ad_hoc_candidates,
+                ))
+            for rt, health in zip(ad_hoc_candidates, healths):
+                ad_hoc.append({
+                    "mp_index": "",
+                    "source": rt["rt_source"],
+                    "target": rt["rt_target"],
+                    "type": "ad_hoc",
+                    "origin_storage": "",
+                    "origin_storage_type": "",
+                    "origin_label": rt["rt_source"],
+                    "config_options": {},
+                    "config_flags": [],
+                    "total_bytes": None,
+                    "used_bytes": None,
+                    "available_bytes": None,
+                    "runtime_mounted": True,
+                    "runtime_source": rt["rt_source"],
+                    "runtime_fstype": rt["rt_fstype"],
+                    "runtime_options": rt["rt_options"],
+                    "runtime_readonly": rt["rt_readonly"],
+                    "runtime_reachable": health["reachable"],
+                    "runtime_error": health["error"],
+                })

    return {
        "ok": True,
@@ -508,14 +508,22 @@ class EmailChannel(NotificationChannel):
    
    def __init__(self, config: Dict[str, str]):
        super().__init__()
-        self.host = config.get('host', '')
+        self.host = (config.get('host', '') or '').strip()
        self.port = int(config.get('port', 587) or 587)
-        self.username = config.get('username', '')
-        self.password = config.get('password', '')
-        self.tls_mode = config.get('tls_mode', 'starttls')  # none | starttls | ssl
-        self.from_address = config.get('from_address', '')
+        self.username = config.get('username', '') or ''
+        self.password = config.get('password', '') or ''
+        # `dict.get(k, default)` only returns default when the key is MISSING;
+        # if the user previously saved an empty string or null, we'd end up
+        # with `tls_mode=''` and silently skip STARTTLS — which causes
+        # `SMTPNotSupportedError: SMTP AUTH extension not supported by server`
+        # on Gmail/Outlook because they only advertise AUTH post-STARTTLS.
+        tls_raw = (config.get('tls_mode') or 'starttls').strip().lower()
+        if tls_raw not in ('none', 'starttls', 'ssl'):
+            tls_raw = 'starttls'
+        self.tls_mode = tls_raw
+        self.from_address = config.get('from_address', '') or ''
        self.to_addresses = self._parse_recipients(config.get('to_addresses', ''))
-        self.subject_prefix = config.get('subject_prefix', '[ProxMenux]')
+        self.subject_prefix = config.get('subject_prefix', '[ProxMenux]') or '[ProxMenux]'
        self.timeout = int(config.get('timeout', 10) or 10)
    
    @staticmethod
@@ -529,6 +537,17 @@ class EmailChannel(NotificationChannel):
            return False, 'No recipients configured'
        if not self.from_address:
            return False, 'No from address configured'
+        # Credentials without an explicit SMTP host would silently fall back to
+        # `/usr/sbin/sendmail`, which ignores username/password entirely — the
+        # test returns OK because Postfix queued the message, but the relay is
+        # never authenticated and the mail rots in the local mailq. Reported by
+        # Ignacio Seijo: "leaving host/port blank, the test passes but the
+        # mail never arrives".
+        if (self.username or self.password) and not self.host:
+            return False, ('SMTP credentials provided but no host configured. '
+                           'Set host (e.g. smtp.gmail.com) and port (587) — '
+                           'without a host the message goes to the local MTA '
+                           'and your username/password are ignored.')
        # Must have SMTP host OR local sendmail available
        if not self.host:
            import os
@@ -591,8 +610,33 @@ class EmailChannel(NotificationChannel):
                    server.ehlo()  # Re-identify after TLS -- server re-announces AUTH
            
            if self.username and self.password:
+                # If the server doesn't advertise AUTH after our EHLO sequence,
+                # smtplib's `login()` raises `SMTPNotSupportedError` with the
+                # opaque message "SMTP AUTH extension not supported by server".
+                # That fired for users who left tls_mode blank or pointed at
+                # port 587 without STARTTLS — Gmail only advertises AUTH after
+                # the TLS handshake. Surface the real reason here.
+                if not server.has_extn('auth'):
+                    hint = (
+                        f"server={self.host}:{self.port} tls_mode={self.tls_mode}"
+                    )
+                    if self.tls_mode == 'none':
+                        return 0, (
+                            'SMTP server did not advertise AUTH after EHLO. '
+                            'TLS is disabled — most providers (Gmail, Outlook, '
+                            'Office365) only allow login after STARTTLS or SSL. '
+                            f'Switch TLS Mode to STARTTLS (port 587) or SSL/TLS '
+                            f'(port 465). [{hint}]'
+                        )
+                    return 0, (
+                        'SMTP server did not advertise AUTH after EHLO. '
+                        'Verify the host/port/TLS combination. For Gmail use '
+                        'smtp.gmail.com:587 with STARTTLS and an App Password '
+                        '(https://myaccount.google.com/apppasswords); for '
+                        f'Outlook use smtp.office365.com:587 with STARTTLS. [{hint}]'
+                    )
                server.login(self.username, self.password)
-            
+
            server.send_message(msg)
            server.quit()
            server = None
@@ -601,8 +645,10 @@ class EmailChannel(NotificationChannel):
            return 0, f'SMTP authentication failed (check username/password or app-specific password): {e}'
        except smtplib.SMTPNotSupportedError as e:
            return 0, (f'SMTP AUTH not supported by server. '
-                       f'This may mean the server requires OAuth2 or an App Password '
-                       f'instead of regular credentials: {e}')
+                       f'TLS mode: {self.tls_mode}, port: {self.port}. '
+                       f'Gmail/Outlook require STARTTLS on 587 or SSL/TLS on 465. '
+                       f'For Gmail, generate an App Password at '
+                       f'https://myaccount.google.com/apppasswords. Detail: {e}')
        except smtplib.SMTPConnectError as e:
            return 0, f'SMTP connection failed: {e}'
        except smtplib.SMTPException as e:
@@ -292,6 +292,61 @@ def _record_smartd_observation_impl(title: str, message: str):
        print(f"[smartd_observation] Error recording smartd observation: {e}")


+# ─── Vzdump activity detector (shared, restart-tolerant) ─────────
+#
+# A single source of truth for "is a vzdump backup job running on this
+# host RIGHT NOW", consultable from any watcher and surviving Monitor
+# restarts. Reads `/var/log/pve/tasks/active` directly — PVE writes the
+# active UPID there at backup start and removes it on completion, so
+# it persists across our process restarts.
+#
+# Without this, JournalWatcher's in-memory `_last_backup_job_ts` got
+# reset by every Monitor restart, and any `Starting Backup of VM X`
+# log lines arriving after that point were treated as standalone
+# backups — emitting one `backup_start` per guest with `storage=local`
+# (the fallback path that doesn't see the parent job's --storage flag).
+# Reported by JC Miñarro 18/05 after a Monitor redeploy mid-job.
+# Path that is_vzdump_active_on_host() scans for vzdump UPID lines.
+_VZDUMP_ACTIVE_FILE = '/var/log/pve/tasks/active'
+# time.time() of the last evaluation; the initial 0 forces the first
+# call to read the file instead of serving a cached value.
+_vzdump_active_cache_ts: float = 0
+# Result of the last evaluation, returned while the cache is fresh.
+_vzdump_active_cache_value: bool = False
+# Maximum age of a cached result before the file is read again.
+_VZDUMP_ACTIVE_CACHE_TTL = 5  # seconds
+
+
+def is_vzdump_active_on_host() -> bool:
+    """Return True if `/var/log/pve/tasks/active` lists a vzdump UPID
+    whose worker process is still alive (i.e. backup currently running).
+
+    The result is cached for `_VZDUMP_ACTIVE_CACHE_TTL` seconds to avoid
+    hammering the file on every notification.
+
+    A UPID line alone is not trusted: its hex PID field is probed with
+    signal 0. A PermissionError from that probe means the process
+    exists but we are not allowed to signal it, so it counts as alive.
+
+    Caller-safe: returns False on any I/O / parse error.
+    """
+    global _vzdump_active_cache_ts, _vzdump_active_cache_value
+    now = time.time()
+    if now - _vzdump_active_cache_ts < _VZDUMP_ACTIVE_CACHE_TTL:
+        return _vzdump_active_cache_value
+    found = False
+    try:
+        # errors='replace': a stray undecodable byte must not raise
+        # UnicodeDecodeError out of this "caller-safe" helper.
+        with open(_VZDUMP_ACTIVE_FILE, 'r', errors='replace') as f:
+            for line in f:
+                # UPID format: UPID:node:pid:pstart:starttime:type:id:user:
+                if ':vzdump:' not in line:
+                    continue
+                parts = line.strip().split(':')
+                if len(parts) < 3:
+                    continue
+                try:
+                    pid = int(parts[2], 16)  # PID in UPID is hex
+                    if pid <= 0:
+                        # kill(0, sig) targets our own process group and
+                        # always succeeds; it is never a worker PID.
+                        continue
+                    os.kill(pid, 0)  # signal 0 = existence probe only
+                except PermissionError:
+                    # Process exists but we may not signal it: alive.
+                    pass
+                except (ValueError, OverflowError, ProcessLookupError):
+                    # Malformed or out-of-range PID, or worker is gone.
+                    continue
+                found = True
+                break
+    except (OSError, IOError):
+        pass
+    _vzdump_active_cache_ts = now
+    _vzdump_active_cache_value = found
+    return found
+
+
 # ─── Journal Watcher (Real-time) ─────────────────────────────────

 class JournalWatcher:
@@ -1238,6 +1293,14 @@ class JournalWatcher:
                now = time.time()
                if now - self._last_backup_job_ts < self._BACKUP_JOB_SUPPRESS_WINDOW:
                    return  # Part of an active job -- already notified
+                # Restart-tolerant fallback: if the in-memory timestamp was
+                # cleared (Monitor restarted mid-job) but PVE still has an
+                # active vzdump UPID, this per-guest line is part of that
+                # job — drop it instead of emitting a wrong "Backup started
+                # on local" with storage default. Reported by JC Miñarro 18/05
+                # after a Monitor redeploy during an active PBS backup.
+                if is_vzdump_active_on_host():
+                    return
                fallback_guest = fb.group(1)
            else:
                return
@@ -1893,10 +1956,15 @@ class TaskWatcher:
        # Suppress VM/CT start/stop/shutdown while a vzdump is active.
        # These are backup-induced operations (mode=stop), not user actions.
        # Exception: if a VM/CT FAILS or has WARNINGS, that IS important.
+        # We check BOTH our in-memory tracking (`_is_vzdump_active`) AND
+        # `tasks/active` on disk (`is_vzdump_active_on_host`). The disk
+        # check survives Monitor restarts mid-backup, which otherwise
+        # cleared `_vzdump_running_since` and exposed the post-restart
+        # shutdown notifications to the user (JC Miñarro 18/05).
        _BACKUP_NOISE = {'vm_start', 'vm_stop', 'vm_shutdown', 'vm_restart',
                         'ct_start', 'ct_stop', 'ct_shutdown', 'ct_restart'}
        if event_type in _BACKUP_NOISE and not is_error and not is_warning:
-            if self._is_vzdump_active():
+            if self._is_vzdump_active() or is_vzdump_active_on_host():
                return
        
        # Suppress VM/CT stop/shutdown during host shutdown/reboot.
@@ -223,14 +223,28 @@ def _parse_vzdump_message(message: str) -> Optional[Dict[str, Any]]:
            else:
                total_time = f"{secs}s"
    
+    # ── Extract the storage target name (PBS, PBS-Cloud, local, …) ──
+    # PVE logs the full command on the first line:
+    #   "INFO: starting new backup job: vzdump 104 105 --storage PBS-Cloud --mode stop"
+    # We surface it so the notification body can say "PBS-Cloud: vm/104/…"
+    # instead of the generic "PBS:" prefix when multiple PBS endpoints
+    # are configured. Reported by JC Miñarro 18/05.
+    storage_name = ''
+    for line in lines:
+        m_storage = re.search(r'--storage\s+(\S+)', line)
+        if m_storage:
+            storage_name = m_storage.group(1).strip()
+            break
+
    if not vms and not total_size:
        return None
-    
+
    return {
        'vms': vms,
        'total_time': total_time,
        'total_size': total_size,
        'vm_count': len(vms),
+        'storage_name': storage_name,
    }


@@ -277,13 +291,19 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
        if detail_line:
            parts.append(' | '.join(detail_line))
        
-        # PBS/File on separate line with icon
+        # PBS/File on separate line with icon. When we know the
+        # storage name (e.g. "PBS-Cloud", "PBS-Office") prefix it so
+            # the user can tell which destination this archive lives in —
+        # critical when there are multiple PBS endpoints configured.
        if vm.get('filename'):
            fname = vm['filename']
+            storage_name = parsed.get('storage_name', '') or ''
            if re.match(r'^(?:ct|vm)/\d+/', fname):
-                parts.append(f"\U0001F5C4\uFE0F PBS: {fname}")
+                label = storage_name if storage_name else 'PBS'
+                parts.append(f"\U0001F5C4\uFE0F {label}: {fname}")
            else:
-                parts.append(f"\U0001F4C1 File: {fname}")
+                label = storage_name if storage_name else 'File'
+                parts.append(f"\U0001F4C1 {label}: {fname}")
        
        # Error reason if failed
        if status != 'ok' and vm.get('error'):