Update AppImage

2026-05-22 00:24:48 +00:00 · 2026-05-20 18:14:32 +02:00
parent 1087a87ea2
commit 4112323961
20 changed files with 1638 additions and 261 deletions
@@ -1019,10 +1019,16 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str:
        if not pattern:
            return ""
        
-        # Capture recent journal entries matching keywords
-        # Use -b 0 to only include logs from the current boot
+        # Capture recent journal entries matching keywords.
+        # Use -b 0 to only include logs from the current boot.
+        # Filter out the Monitor's own stdout (AppRun, [HealthPersistence],
+        # proxmenux-auth, etc.) BEFORE keyword matching — otherwise a startup
+        # line like "[HealthPersistence] Database initialized with 13 tables"
+        # leaks into the AI context because grep -iE 'ata' matches the
+        # substring "ata" in "dATAbase". Self-logs are never system evidence.
        cmd = (
            f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | "
+            f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]' | "
            f"grep -iE '{pattern}' | tail -n 30"
        )
        
@@ -1131,12 +1137,28 @@ def _health_collector_loop():
                'updates': 'update_summary',
            }

+            # Sub-categories already rolled up into details['storage']
+            # by _check_proxmox_storage_status. Emitting them as their
+            # own health_degraded entries duplicates the same warning
+            # (e.g. "Storage Mounts & Space" + "PVE Storage Capacity"
+            # both saying "PBS-Cloud (pbs) usage ≥70%"). Skip them at
+            # the notification layer — they still update _prev_statuses
+            # so a future degradation transition is detected normally.
+            _STORAGE_SUBCATEGORIES = {
+                'pve_storage_capacity', 'zfs_pool_capacity',
+                'lxc_disk', 'lxc_mounts', 'remote_mounts',
+            }
+
            for cat_key, cat_data in details.items():
                cur_status = cat_data.get('status', 'OK')
                prev_status = _prev_statuses.get(cat_key, 'OK')
                cur_rank = _SEV_RANK.get(cur_status, 0)
                prev_rank = _SEV_RANK.get(prev_status, 0)

+                if cat_key in _STORAGE_SUBCATEGORIES:
+                    _prev_statuses[cat_key] = cur_status
+                    continue
+
                if cur_rank > prev_rank and cur_rank >= 2:  # WARNING or CRITICAL
                    reason = cat_data.get('reason', f'{cat_key} status changed to {cur_status}')
                    reason_lower = reason.lower()
@@ -4676,16 +4698,56 @@ def get_network_info():
            'vm_lxc_total_count': 0
        }

+def _get_lxc_update_status_map() -> dict:
+    """Project the LXC update state held by the managed_installs registry.
+
+    Builds a quick lookup ``{vmid: {available, count, security_count,
+    last_check, latest, error, packages[]}}`` used to decorate
+    ``/api/vms`` output without forcing the frontend to fetch a second
+    endpoint.
+
+    Returns:
+        A dict keyed by the VMID as a string. Empty when the registry
+        module cannot be imported, reading it raises, or no LXC entry is
+        registered — callers must treat absence as "no info".
+
+    This helper only decorates rows, so a malformed registry entry must
+    never raise into the caller: non-dict items are skipped and
+    non-numeric counters are coerced to 0.
+    """
+    try:
+        import managed_installs
+        active = managed_installs.get_active_items() or []
+    except Exception:
+        return {}
+    if not isinstance(active, (list, tuple)):
+        return {}
+
+    def _as_int(value) -> int:
+        # Counters are read back from a persisted blob; a bad type or a
+        # non-numeric string degrades to 0 instead of raising.
+        try:
+            return int(value or 0)
+        except (TypeError, ValueError):
+            return 0
+
+    out: dict = {}
+    for it in active:
+        if not isinstance(it, dict) or it.get('type') != 'lxc':
+            continue
+        # `id` may be present but None; str() keeps removeprefix() safe.
+        vmid = it.get('_vmid') or str(it.get('id') or '').removeprefix('lxc:')
+        if not vmid:
+            continue
+        update = it.get('update_check')
+        if not isinstance(update, dict):
+            update = {}
+        packages = update.get('_packages')
+        if not isinstance(packages, (list, tuple)):
+            packages = []
+        out[str(vmid)] = {
+            'available': bool(update.get('available')),
+            'count': _as_int(update.get('_count')),
+            'security_count': _as_int(update.get('_security_count')),
+            'last_check': update.get('last_check'),
+            'latest': update.get('latest'),
+            'error': update.get('error'),
+            # Cap packages list shipped to UI — modal uses first 30 max
+            'packages': list(packages[:30]),
+        }
+    return out
+
+
 def get_proxmox_vms():
    """Get Proxmox VM and LXC information (requires pvesh command) - only from local node"""
    try:
        all_vms = []
-        
+        lxc_updates_map = _get_lxc_update_status_map()
+
        try:
            # local_node = socket.gethostname()
            local_node = get_proxmox_node_name()
            # print(f"[v0] Local node detected: {local_node}")
-        
+
            resources = get_cached_pvesh_cluster_resources_vm()
            if resources:
                for resource in resources:
@@ -4693,12 +4755,13 @@ def get_proxmox_vms():
                    if node != local_node:
                        # print(f"[v0] Skipping VM {resource.get('vmid')} from remote node: {node}")
                        continue
-                    
+
+                    vm_type = 'lxc' if resource.get('type') == 'lxc' else 'qemu'
                    vm_data = {
                        'vmid': resource.get('vmid'),
                        'name': resource.get('name', f"VM-{resource.get('vmid')}"),
                        'status': resource.get('status', 'unknown'),
-                        'type': 'lxc' if resource.get('type') == 'lxc' else 'qemu',
+                        'type': vm_type,
                        'cpu': resource.get('cpu', 0),
                        'mem': resource.get('mem', 0),
                        'maxmem': resource.get('maxmem', 0),
@@ -4710,6 +4773,14 @@ def get_proxmox_vms():
                        'diskread': resource.get('diskread', 0),
                        'diskwrite': resource.get('diskwrite', 0)
                    }
+                    # Decorate LXC rows with the apt update status if the
+                    # managed_installs registry has it. Absent key means
+                    # either the user hasn't enabled the feature or the
+                    # CT isn't running / isn't Debian/Ubuntu.
+                    if vm_type == 'lxc':
+                        upd = lxc_updates_map.get(str(resource.get('vmid')))
+                        if upd is not None:
+                            vm_data['update_check'] = upd
                    all_vms.append(vm_data)

                return all_vms
@@ -11035,9 +11106,53 @@ def api_vm_control(vmid):
                    'message': f'Successfully executed {action} on {vm_info.get("name")}'
                })
            else:
+                # `pvesh` failed → fire the matching vm_fail / ct_fail
+                # notification so the user gets paged on their channels
+                # too, not just an in-dashboard alert. Previously this
+                # path silently returned a 500 to the browser and lost
+                # the event entirely (reported on .1.10: tried to start
+                # VM 106 while log2ram tmpfs was full → 500 in the UI
+                # but no Telegram message). The stderr is the most
+                # useful single line we have — `pvesh` reliably prints
+                # the underlying daemon failure there (e.g.
+                # "start failed: command '/usr/bin/kvm …' failed with
+                # exit code 1: no space left on device").
+                err_text = (control_result.stderr or '').strip() \
+                    or (control_result.stdout or '').strip() \
+                    or f'{action} returned exit code {control_result.returncode}'
+                # Truncate runaway stderr (some pvesh failures dump
+                # multi-KB tracebacks) — keep the notification readable.
+                if len(err_text) > 500:
+                    err_text = err_text[:500] + ' …'
+
+                try:
+                    from notification_manager import notification_manager as _nm
+                    import socket as _sock
+                    _host = _sock.gethostname()
+                    event_type = 'ct_fail' if vm_type == 'lxc' else 'vm_fail'
+                    _nm.emit_event(
+                        event_type=event_type,
+                        severity='CRITICAL',
+                        data={
+                            'hostname': _host,
+                            'vmid': str(vmid),
+                            'vmname': vm_info.get('name') or f'{vm_type}-{vmid}',
+                            'reason': f'{action} failed: {err_text}',
+                            'action': action,
+                        },
+                        source='dashboard',
+                        entity='vm',
+                        entity_id=str(vmid),
+                    )
+                except Exception as _emit_err:
+                    print(f"[api_vm_control] failed to emit {vm_type}_fail "
+                          f"notification: {type(_emit_err).__name__}: {_emit_err}")
+
                return jsonify({
                    'success': False,
-                    'error': control_result.stderr
+                    'vmid': vmid,
+                    'action': action,
+                    'error': err_text,
                }), 500
        else:
            return jsonify({'error': 'Failed to get VM details'}), 500
@@ -92,7 +92,15 @@ class HealthPersistence:
        self.data_dir.mkdir(parents=True, exist_ok=True)
        
        self.db_path = self.data_dir / 'health_monitor.db'
-        self._db_lock = threading.Lock()
+        # Reentrant lock: `record_disk_observation` acquires this and then
+        # calls `register_disk` which acquires it again on the same thread.
+        # With a plain `threading.Lock` that second acquire deadlocks and the
+        # caller hangs forever — visible symptom on RimegraVE (Pedro Rico
+        # 19/05): no disk_observation update since the day a thread first
+        # walked that path. `RLock` allows re-entry from the same thread
+        # while still serialising cross-thread writes, which is what the
+        # serialisation rationale (race-free UPSERT dedup) actually wants.
+        self._db_lock = threading.RLock()
        self._init_database()
    
    def _get_conn(self) -> sqlite3.Connection:
@@ -228,6 +236,29 @@ class HealthPersistence:
            'CREATE INDEX IF NOT EXISTS idx_digest_pending_channel '
            'ON digest_pending(channel, ts)'
        )
+
+        # Sibling table for events buffered DURING Quiet Hours. Same
+        # shape as digest_pending so the existing summary renderer can
+        # be reused. Kept separate because the lifecycle is different:
+        # digest_pending flushes once per day at digest_time, while
+        # quiet_pending flushes once per Quiet Hours close (an arbitrary
+        # time that depends on the user's window settings).
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS quiet_pending (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                channel TEXT NOT NULL,
+                event_type TEXT NOT NULL,
+                event_group TEXT NOT NULL,
+                severity TEXT NOT NULL,
+                ts INTEGER NOT NULL,
+                title TEXT NOT NULL,
+                body TEXT NOT NULL
+            )
+        ''')
+        cursor.execute(
+            'CREATE INDEX IF NOT EXISTS idx_quiet_pending_channel '
+            'ON quiet_pending(channel, ts)'
+        )
        
        # Migration: add missing columns to errors table for existing DBs
        cursor.execute("PRAGMA table_info(errors)")
@@ -2289,11 +2320,15 @@ class HealthPersistence:

                # Upsert observation: if same (disk, type, signature), bump count + update last timestamp.
                # IMPORTANT: Do NOT reset dismissed — if the user dismissed this observation,
-                # re-detecting the same journal entry must not un-dismiss it. Also do not
-                # increment the occurrence_count on dismissed rows (audit Tier 5 — once
-                # the user has dismissed, we don't want the counter to keep growing for
-                # journal events that no longer interest them; this also stops the badge
-                # from drifting upward for dismissed conditions).
+                # re-detecting the same journal entry must not un-dismiss it. BUT we DO
+                # keep counting + updating last_occurrence even when dismissed, because the
+                # responsible-monitoring contract is: every error counts toward the
+                # accumulated total shown in the disk modal ("324 connection errors"),
+                # even errors of the same signature the user already saw once. Dismissed
+                # only mutes notifications, NOT the per-disk error history surfaced in the
+                # UI. Reverting the earlier "WHERE dismissed=0" gate that froze the
+                # counter and last_occurrence for /dev/sdh on 2026-05-09, leaving 10
+                # silent days of unreported ATA errors (Pedro Rico, 19/05).
                cursor.execute(f'''
                    INSERT INTO disk_observations
                        (disk_registry_id, {type_col}, error_signature, {first_col},
@@ -2303,7 +2338,6 @@ class HealthPersistence:
                        {last_col} = excluded.{last_col},
                        occurrence_count = occurrence_count + 1,
                        severity = CASE WHEN excluded.severity = 'critical' THEN 'critical' ELSE severity END
-                    WHERE dismissed = 0
                ''', (disk_id, error_type, error_signature, now, now, raw_message, severity))

                conn.commit()
@@ -274,6 +274,12 @@ def _df_via_host_pid(host_pid: str, ct_target: str) -> dict[str, Optional[int]]:
    numbers reflect the whole storage pool instead of the per-subvol
    quota — without this the UI showed 851 GB total for a 150 GB ZFS
    subvol because pvesm reports the rpool's free space.
+
+    Note: this path does NOT measure NFS/CIFS mounts that were set up
+    from INSIDE the CT (`mount -t nfs` / `/etc/fstab` inside the
+    container). Those live in the CT's own mount namespace and aren't
+    visible to the host's `df` even through `/proc/<pid>/root`. Use
+    `_df_via_pct_exec` for ad-hoc mounts.
    """
    empty = {"total_bytes": None, "used_bytes": None, "available_bytes": None}
    if not host_pid or not ct_target:
@@ -301,6 +307,44 @@ def _df_via_host_pid(host_pid: str, ct_target: str) -> dict[str, Optional[int]]:
        return empty


+def _df_via_pct_exec(vmid: str, ct_target: str,
+                     timeout: int = 6) -> dict[str, Optional[int]]:
+    """Measure a path's capacity by running ``df`` inside the CT.
+
+    Runs ``pct exec <vmid> -- df -B1 --output=size,used,avail <path>``
+    and reads the last output row. Intended for ad-hoc NFS/CIFS mounts
+    created inside the container, which the host-side
+    `_df_via_host_pid` cannot measure.
+
+    Args:
+        vmid: Container ID handed to ``pct exec``.
+        ct_target: Path as seen from inside the container.
+        timeout: Seconds to wait for the ``pct exec`` round-trip.
+
+    Returns:
+        ``{"total_bytes", "used_bytes", "available_bytes"}``. All three
+        values are ``None`` when an argument is empty, the command
+        fails or times out, or the output cannot be parsed.
+    """
+    no_data = {"total_bytes": None, "used_bytes": None, "available_bytes": None}
+    if not (vmid and ct_target):
+        return no_data
+    try:
+        result = subprocess.run(
+            [_PCT, "exec", vmid, "--", "df", "-B1",
+             "--output=size,used,avail", ct_target],
+            capture_output=True, text=True, timeout=timeout,
+        )
+    except (subprocess.TimeoutExpired, OSError, ValueError):
+        return no_data
+    if result.returncode != 0:
+        return no_data
+
+    # Header row plus at least one data row; the last row is the answer.
+    rows = [row for row in result.stdout.strip().splitlines() if row.strip()]
+    if len(rows) < 2:
+        return no_data
+    fields = rows[-1].split()
+    if len(fields) < 3:
+        return no_data
+    try:
+        total, used, avail = (int(tok) for tok in fields[:3])
+    except ValueError:
+        return no_data
+    return {
+        "total_bytes": total,
+        "used_bytes": used,
+        "available_bytes": avail,
+    }
+
+
 def _capacity_for(source: str, classification: dict[str, Any],
                  pve_storages: dict[str, dict[str, Any]],
                  config_options: Optional[dict[str, Any]] = None,
@@ -606,14 +650,29 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
        ]
        # Same parallelisation as the configured-mp loop: stat'ing
        # stale NFS exports serially can dominate the request and
-        # push it past the proxy timeout.
+        # push it past the proxy timeout. Capacity (`df`) is fetched
+        # in the SAME pool so the UI can render the usage bar for
+        # ad-hoc NFS/CIFS mounts too — null capacity was a regression
+        # spotted on CT 103 /mnt/Media. Skip df when stat already
+        # showed the mount as unreachable, otherwise the df subprocess
+        # blocks on the same broken export.
        if ad_hoc_candidates:
            with ThreadPoolExecutor(max_workers=max_workers) as pool:
-                healths = list(pool.map(
-                    lambda rt: _stat_via_host(host_pid, rt["rt_target"]),
-                    ad_hoc_candidates,
-                ))
-            for rt, health in zip(ad_hoc_candidates, healths):
+                def _gather_adhoc(rt):
+                    h = _stat_via_host(host_pid, rt["rt_target"])
+                    if h.get("reachable"):
+                        # NFS/CIFS mounts done inside the CT live in the
+                        # container's own mount namespace and aren't
+                        # visible to `df` from the host even via
+                        # /proc/<pid>/root — use `pct exec df` instead.
+                        cap = _df_via_pct_exec(vmid, rt["rt_target"])
+                    else:
+                        cap = {"total_bytes": None, "used_bytes": None,
+                               "available_bytes": None}
+                    return rt, h, cap
+
+                results = list(pool.map(_gather_adhoc, ad_hoc_candidates))
+            for rt, health, cap in results:
                ad_hoc.append({
                    "mp_index": "",
                    "source": rt["rt_source"],
@@ -624,9 +683,9 @@ def get_lxc_mount_points(vmid: str) -> dict[str, Any]:
                    "origin_label": rt["rt_source"],
                    "config_options": {},
                    "config_flags": [],
-                    "total_bytes": None,
-                    "used_bytes": None,
-                    "available_bytes": None,
+                    "total_bytes": cap["total_bytes"],
+                    "used_bytes": cap["used_bytes"],
+                    "available_bytes": cap["available_bytes"],
                    "runtime_mounted": True,
                    "runtime_source": rt["rt_source"],
                    "runtime_fstype": rt["rt_fstype"],
@@ -189,12 +189,169 @@ def _detect_oci_apps() -> list[dict]:
    return out


+# ── LXC containers (Phase 1: apt-based update detection) ────────────
+#
+# Each running Debian/Ubuntu CT becomes a registry entry of type "lxc".
+# Detection is opt-in: gated on the `lxc_updates_available` notification
+# being enabled somewhere, so the heavy `pct exec` work doesn't run on
+# hosts where the user hasn't asked for this.
+#
+# Phase 2 hook: once helper-scripts metadata is integrated, entries can
+# carry `_helper_script_app` so the checker swaps generic apt counting
+# for app-specific upstream-release tracking (Vaultwarden, Jellyfin,
+# etc.). For now every LXC uses the generic apt path.
+
+# Executable used for the `pct list` / `pct exec` calls in this section.
+_PCT_BIN = "/usr/sbin/pct"
+# Timeout, in seconds, for the package-listing command run inside a CT.
+_LXC_EXEC_TIMEOUT_SEC = 10
+# Timeout, in seconds, for reading /etc/os-release inside a CT.
+_LXC_OS_PROBE_TIMEOUT_SEC = 5
+
+
+def _lxc_updates_notification_enabled() -> bool:
+    """Tell whether the `lxc_updates_available` event is enabled.
+
+    Asks the notification manager via ``is_event_enabled``. The result
+    gates the heavy detection and checker work: when this is False no
+    CT is touched at all. Any failure (module missing, manager raising)
+    counts as "disabled".
+    """
+    try:
+        from notification_manager import notification_manager as manager
+        return bool(manager.is_event_enabled("lxc_updates_available"))
+    except Exception:
+        return False
+
+
+def _list_pve_lxcs() -> list[dict]:
+    """List the LXCs reported by ``pct list`` as ``{vmid, status, name}``.
+
+    Never raises for a missing binary, a timeout or a non-zero exit:
+    those all yield an empty list so the detector caller can continue.
+    """
+    try:
+        listing = subprocess.run(
+            [_PCT_BIN, "list"],
+            capture_output=True, text=True, timeout=5,
+        )
+    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+        return []
+    if listing.returncode != 0:
+        return []
+
+    containers: list[dict] = []
+    rows = listing.stdout.splitlines()[1:]  # first row is the header
+    for row in rows:
+        # Columns are VMID, Status, Lock, Name. Lock is usually empty,
+        # so a row splits into three or four whitespace-separated cells.
+        cols = row.split(None, 3)
+        if len(cols) < 2 or not cols[0].isdigit():
+            continue
+        # The name is always the last cell; with only VMID and Status
+        # present there is no name to report.
+        containers.append({
+            "vmid": cols[0],
+            "status": cols[1],
+            "name": cols[-1] if len(cols) >= 3 else "",
+        })
+    return containers
+
+
+# OS families accepted from `_probe_lxc_os`; a CT whose probed family is
+# not listed here is skipped by `_detect_lxc_containers`.
+_SUPPORTED_OS_FAMILIES = ("debian", "ubuntu", "alpine")
+
+
+def _probe_lxc_os(vmid: str) -> Optional[str]:
+    """Return the OS family of a running CT from its ``/etc/os-release``.
+
+    Args:
+        vmid: Container ID handed to ``pct exec``.
+
+    Returns:
+        ``debian``, ``ubuntu`` or ``alpine``. ``None`` when the file
+        cannot be read (timeout, missing binary, non-zero exit) or when
+        the distribution uses a package manager we don't yet speak —
+        the detector skips those CTs.
+
+    The detector caches the result in the registry entry
+    (``_os_family``) and only probes again when that field is missing.
+    """
+    try:
+        r = subprocess.run(
+            [_PCT_BIN, "exec", vmid, "--", "cat", "/etc/os-release"],
+            capture_output=True, text=True,
+            timeout=_LXC_OS_PROBE_TIMEOUT_SEC,
+        )
+    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
+        return None
+    if r.returncode != 0:
+        return None
+    return _os_family_from_os_release(r.stdout)
+
+
+def _os_family_from_os_release(text: str) -> Optional[str]:
+    """Map the content of an os-release file to a supported family.
+
+    Parses real ``KEY=VALUE`` assignments instead of substring-matching
+    the whole file, so quoted values (``ID="ubuntu"``,
+    ``ID_LIKE="ubuntu debian"``) are recognised and unrelated keys such
+    as ``VARIANT_ID`` cannot produce a false match. ``ID`` is tried
+    first, then each ``ID_LIKE`` entry in the order written.
+    """
+    fields: dict[str, str] = {}
+    for raw in text.splitlines():
+        line = raw.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, _, value = line.partition("=")
+        # Values may be wrapped in single or double quotes.
+        fields[key.strip().lower()] = value.strip().strip("\"'").lower()
+
+    candidates = [fields.get("id", "")] + fields.get("id_like", "").split()
+    for name in candidates:
+        if name in _SUPPORTED_OS_FAMILIES:
+            return name
+    # Future Phase 1.5: CentOS/Rocky/Alma (dnf check-update), Arch
+    # (checkupdates), openSUSE (zypper list-updates). Each needs a
+    # parser similar to apt/apk — skip silently for now.
+    return None
+
+
+def _detect_lxc_containers() -> list[dict]:
+    """Build one registry entry per running CT of a supported OS family.
+
+    Supported families are those in `_SUPPORTED_OS_FAMILIES` (Debian,
+    Ubuntu, Alpine). Returns an empty list when the
+    `lxc_updates_available` notification is not enabled.
+
+    The family is cached on the registry entry as ``_os_family``, so the
+    ``pct exec cat /etc/os-release`` probe only runs for CTs that have
+    no cached value yet. A CT reinstalled with a different OS keeps the
+    old family until the user resets the registry — an accepted
+    trade-off against paying the probe cost every cycle.
+    """
+    if not _lxc_updates_notification_enabled():
+        return []
+
+    # Read-only peek at the registry to recover cached families. No lock
+    # is taken here: the framework holds the write lock when it merges
+    # our results back in detect_and_register.
+    try:
+        registry_items = _read_registry().get("items", [])
+    except Exception:
+        registry_items = []
+    cached_family: dict = {}
+    for item in registry_items:
+        if isinstance(item, dict) and item.get("type") == "lxc":
+            cached_family[item.get("id")] = item.get("_os_family")
+
+    entries: list[dict] = []
+    for ct in _list_pve_lxcs():
+        if ct["status"] != "running":
+            continue
+        vmid = ct["vmid"]
+        entry_id = f"lxc:{vmid}"
+        family = cached_family.get(entry_id)
+        if not family:
+            family = _probe_lxc_os(vmid)
+            if family not in _SUPPORTED_OS_FAMILIES:
+                # No package-manager parser for this distribution yet.
+                # Skip silently. The framework marks any existing entry
+                # as removed_at once it stops appearing in our output.
+                continue
+        entries.append({
+            "id": entry_id,
+            "type": "lxc",
+            "name": ct.get("name") or f"CT-{vmid}",
+            "current_version": None,  # apt has no single version
+            "menu_label": None,        # user upgrades inside the CT
+            "menu_script": None,
+            "_vmid": vmid,
+            "_os_family": family,
+            # Phase 2 hook: populate `_helper_script_app` here once we
+            # learn how to read the community-scripts marker.
+        })
+    return entries
+
+
 # Detectors registered here. Each returns either a single entry dict
 # or a list (for sources that yield multiple items, like OCI). The
 # framework normalises both shapes.
 _DETECTORS: list[Callable[[], Any]] = [
    _detect_nvidia_xfree86,
    _detect_oci_apps,
+    _detect_lxc_containers,
 ]


@@ -514,9 +671,173 @@ def _check_nvidia_xfree86(entry: dict) -> dict:
    }


+def _parse_apt_list_upgradable(text: str) -> list[dict]:
+    """Turn ``apt list --upgradable`` output into structured rows.
+
+    An upgradable line has the shape::
+
+        package/release version arch [upgradable from: oldversion]
+
+    Returns:
+        One ``{name, current, latest, security}`` dict per upgradable
+        line, in input order. Lines without the ``[upgradable`` marker
+        (including the ``Listing...`` header), without a ``/``, or with
+        nothing after the first space are skipped.
+
+    ``security`` is derived from the release/suite name only (e.g.
+    ``bookworm-security``, ``jammy-security``). Derivatives that name
+    their suites differently report ``security=False`` even when the
+    update is a security patch — acceptable for Phase 1.
+    """
+    parsed: list[dict] = []
+    for raw in text.splitlines():
+        entry = raw.strip()
+        if not ("[upgradable" in entry and "/" in entry):
+            continue
+        pkg_and_suite, _, remainder = entry.partition(" ")
+        fields = remainder.split()
+        if not fields:
+            continue
+        pkg, _, suite = pkg_and_suite.partition("/")
+        _, marker, after = entry.partition("from:")
+        previous = after.strip().rstrip("]").strip() if marker else ""
+        suite = suite.lower()
+        parsed.append({
+            "name": pkg,
+            "current": previous,
+            "latest": fields[0],
+            "security": "-security" in suite or "/security" in suite,
+        })
+    return parsed
+
+
+def _parse_apk_list_upgradable(text: str) -> list[dict]:
+    """Turn ``apk list -u`` output into structured rows.
+
+    An upgradable line has the shape::
+
+        busybox-1.36.1-r29 x86_64 {busybox} (GPL-2.0-only) [upgradable from: busybox-1.36.1-r28]
+
+    apk fuses name and version into the leading token. The badge and
+    the notification only need a count and a representative sample, so
+    the parser stays tolerant: the raw leading token is reported as
+    both ``name`` and ``latest``, and the token after ``from:`` as
+    ``current``. Every row is reported with ``security=False``.
+
+    Returns:
+        One ``{name, current, latest, security}`` dict per line that
+        carries the ``[upgradable`` marker, in input order.
+    """
+    parsed: list[dict] = []
+    for raw in text.splitlines():
+        entry = raw.strip()
+        if "[upgradable" not in entry:
+            continue
+        token = entry.split(" ", 1)[0]
+        _, marker, after = entry.partition("from:")
+        previous = after.strip().rstrip("]").strip() if marker else ""
+        parsed.append({
+            "name": token,
+            "current": previous,
+            "latest": token,
+            "security": False,
+        })
+    return parsed
+
+
+def _run_pct_pkg_listing(vmid: str, cmd: str) -> tuple[bool, str, str]:
+    """Run a package-listing command inside ``vmid`` via ``pct exec``.
+
+    Centralises the timeout and stderr handling so the apt and apk
+    callers stay symmetric.
+
+    Args:
+        vmid: Container ID handed to ``pct exec``.
+        cmd: Shell command line, executed through ``sh -c``.
+
+    Returns:
+        ``(ok, stdout, error_message)``. On success ``error_message``
+        is empty; on failure ``stdout`` is empty and ``error_message``
+        is a non-empty description capped at 200 characters for the
+        non-zero-exit case.
+    """
+    # The parsers key on the English "[upgradable" marker, so pin the C
+    # locale: a localized environment inside the CT must not be able to
+    # translate it and make every update silently disappear.
+    shell_cmd = f"export LC_ALL=C; {cmd}"
+    try:
+        r = subprocess.run(
+            [_PCT_BIN, "exec", vmid, "--", "sh", "-c", shell_cmd],
+            capture_output=True, text=True,
+            timeout=_LXC_EXEC_TIMEOUT_SEC,
+        )
+    except subprocess.TimeoutExpired:
+        # A blank `cmd` must not raise IndexError from inside the handler.
+        tool = (cmd.split() or ["package"])[0]
+        return False, "", f"{tool} listing timed out"
+    except (FileNotFoundError, OSError) as e:
+        return False, "", str(e)
+    if r.returncode != 0:
+        # Strip before falling back so whitespace-only stderr still
+        # produces a meaningful message.
+        detail = (r.stderr or "").strip() or "package listing failed"
+        return False, "", detail[:200]
+    return True, r.stdout, ""
+
+
+def _check_lxc_updates(entry: dict) -> dict:
+    """Report the package updates pending inside one LXC.
+
+    Picks the listing command and parser from the entry's cached
+    ``_os_family``: apt for ``debian`` / ``ubuntu``, apk for
+    ``alpine``. Only the listing commands are run — no ``apt update``
+    or ``apk update`` — so the user's own update cadence
+    (unattended-upgrades, cron) is preserved.
+
+    Args:
+        entry: Registry entry carrying ``_vmid`` and ``_os_family``.
+
+    Returns:
+        A dict with ``available``, ``latest``, ``last_check`` and
+        ``error``. On success it also carries ``_count``,
+        ``_security_count`` and ``_packages`` (capped at 30). On
+        failure ``available`` is False and ``error`` explains why.
+
+    ``latest`` doubles as the dedup fingerprint: it combines the count,
+    the security count and the first five sorted package names, so a
+    stable set of pending updates doesn't re-notify daily while a
+    meaningfully different set does.
+    """
+    def _failure(message) -> dict:
+        return {
+            "available": False, "latest": None,
+            "last_check": _now_iso(), "error": message,
+        }
+
+    vmid = entry.get("_vmid")
+    family = (entry.get("_os_family") or "").lower()
+    if not vmid:
+        return _failure("no vmid in entry")
+
+    if family in ("debian", "ubuntu"):
+        listing_cmd = "apt list --upgradable 2>/dev/null"
+        parse = _parse_apt_list_upgradable
+    elif family == "alpine":
+        listing_cmd = "apk list -u 2>/dev/null"
+        parse = _parse_apk_list_upgradable
+    else:
+        return _failure(f"unsupported family: {family}")
+
+    ok, stdout, err = _run_pct_pkg_listing(vmid, listing_cmd)
+    if not ok:
+        return _failure(err)
+
+    packages = parse(stdout)
+    count = len(packages)
+    sec_count = len([pkg for pkg in packages if pkg.get("security")])
+    has_updates = count > 0
+    if has_updates:
+        leading_names = sorted(pkg["name"] for pkg in packages)[:5]
+        fingerprint = f"{count}:{sec_count}:{','.join(leading_names)}"
+    else:
+        fingerprint = None
+
+    return {
+        "available": has_updates,
+        "latest": fingerprint,
+        "last_check": _now_iso(),
+        "error": None,
+        "_count": count,
+        "_security_count": sec_count,
+        "_packages": packages[:30],  # cap to keep the registry compact
+    }
+
+
 _CHECKERS: dict[str, Callable[[dict], dict]] = {
    "oci_app": _check_oci_app,
    "nvidia_xfree86": _check_nvidia_xfree86,
+    "lxc": _check_lxc_updates,
 }


@@ -562,8 +883,14 @@ def check_for_updates(force: bool = False) -> list[dict]:
            }
            if result.get("current") and not it.get("current_version"):
                it["current_version"] = result["current"]
+            # Per-checker extras carried through into the persisted
+            # `update_check` blob. Add new keys here when a future
+            # checker needs to surface fields beyond available/latest.
+            # `_count` + `_security_count` were missing originally, so
+            # the LXC checker's counts dropped on the floor and the
+            # frontend badge couldn't render.
            for extra_key in ("_packages", "_upgrade_kind", "_kernel",
-                              "_kernel_note"):
+                              "_kernel_note", "_count", "_security_count"):
                if extra_key in result:
                    it["update_check"][extra_key] = result[extra_key]

@@ -327,14 +327,27 @@ def is_vzdump_active_on_host() -> bool:
    try:
        with open(_VZDUMP_ACTIVE_FILE, 'r') as f:
            for line in f:
-                # UPID format: UPID:node:pid:pstart:starttime:type:id:user:
+                # tasks/active row layout (whitespace separated):
+                #   "<UPID> 1"                                ← running
+                #   "<UPID> 1 <endtime_hex> <STATUS>"         ← finished
+                # PVE leaves finished rows lingering for hours
+                # sometimes — without the field-count check below the
+                # PID-recycling case fires a false positive (an
+                # unrelated process inherited the old vzdump's PID
+                # and `os.kill(pid, 0)` succeeds).
                if ':vzdump:' not in line:
                    continue
-                parts = line.strip().split(':')
-                if len(parts) < 3:
+                fields = line.split()
+                if not fields:
+                    continue
+                # >2 fields means endtime + status are written → terminated.
+                if len(fields) > 2:
+                    continue
+                upid_parts = fields[0].split(':')
+                if len(upid_parts) < 3:
                    continue
                try:
-                    pid = int(parts[2], 16)  # PID in UPID is hex
+                    pid = int(upid_parts[2], 16)  # PID in UPID is hex
                    os.kill(pid, 0)
                    found = True
                    break
@@ -1033,21 +1046,28 @@ class JournalWatcher:
            else:
                resolved = re.sub(r'\d+$', '', raw_device) if raw_device.startswith('sd') else raw_device
            
-            # ── Gate 1: SMART must confirm disk failure ──
-            # If the disk is healthy (PASSED) or we can't verify
-            # (UNKNOWN / unresolvable ATA port), do NOT notify.
+            # ── ALWAYS persist the observation, regardless of SMART ──
+            # The disk_observation_contract is explicit (memory note
+            # disk-observation-contract): every kernel-surfaced disk
+            # error must be recorded in disk_observations *even when
+            # SMART reports PASSED*. Silent errors on a "healthy" disk
+            # are exactly the early-warning signal the modal histogram
+            # exists to surface ("324 connection errors on this disk").
+            # Previously this line lived AFTER a `return` gate keyed on
+            # smart_health != 'FAILED', so the 3162 ata8 errors on
+            # .1.10 (PASSED SMART) all dropped on the floor instead of
+            # accumulating in the per-disk audit history.
+            self._record_disk_io_observation(resolved, msg)
+
+            # ── Gate 1: only NOTIFY when SMART reports FAILED ──
+            # Observation is already saved above. We avoid spamming a
+            # CRITICAL notification for transient ATA/SCSI noise on
+            # otherwise-healthy disks — the modal histogram surfaces
+            # those without paging the user at 3 AM.
            smart_health = self._quick_smart_health(resolved)
            if smart_health != 'FAILED':
                return

-            # ── Persist observation (before the cooldown gate) ──
-            # The 24h cooldown below only suppresses RE-notification; the
-            # per-disk observations history must reflect every genuine
-            # detection. The DB UPSERT dedups same-signature events via
-            # occurrence_count, so calling this on every match is safe.
-            # Aligns with the parallel path in HealthMonitor._check_disks_optimized.
-            self._record_disk_io_observation(resolved, msg)
-
            # ── Gate 2: 24-hour dedup per device ──
            # Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
            # If user dismissed the error, _clear_disk_io_cooldown() removed the DB
@@ -1814,12 +1834,31 @@ class TaskWatcher:
                    line = line.strip()
                    if not line:
                        continue
-                    upid = line.split()[0] if line.split() else line
+                    parts = line.split()
+                    if not parts:
+                        continue
+                    upid = parts[0]
                    current_upids.add(upid)
-                    
-                    if ':vzdump:' in upid:
+
+                    if ':vzdump:' not in upid:
+                        continue
+
+                    # PVE writes each line in tasks/active as:
+                    #   "<UPID> 1"                                    ← task still running
+                    #   "<UPID> 1 <endtime_hex> <STATUS>"             ← task already finished
+                    # PVE doesn't always prune finished rows from this
+                    # file (observed on RimegraVE 19/05: 25 OK/error
+                    # entries lingering for hours after job end). Just
+                    # matching ':vzdump:' kept `_vzdump_running_since`
+                    # permanently fresh, which then made
+                    # `_is_vzdump_active()` return True forever and
+                    # silenced every vm_start / vm_stop / vm_shutdown
+                    # via the _BACKUP_NOISE filter. Only treat the row
+                    # as a live vzdump when no end-time / status has
+                    # been written yet (≤ 2 fields: UPID + version).
+                    if len(parts) <= 2:
                        found_vzdump = True
-            
+
            # Keep _vzdump_running_since fresh as long as vzdump is in active
            if found_vzdump:
                self._vzdump_running_since = time.time()
@@ -2175,6 +2214,16 @@ class PollingCollector:
        # has an update".
        self._last_managed_check = 0
        self._notified_managed_updates: dict[str, str] = {}
+        # LXC notifications are grouped — one event per polling cycle
+        # covering every running CT with pending apt/apk updates. The
+        # fingerprint encodes the per-CT state so a stable batch doesn't
+        # re-notify while a meaningful change does.
+        self._notified_lxc_batch: str | None = None
+        # Track previous state of the LXC-updates notification toggle
+        # so a user enabling it post-startup bypasses the 24h gate
+        # ONCE — the next polling cycle runs a fresh detection without
+        # waiting up to a day. Cleared after the forced run completes.
+        self._lxc_was_enabled: bool = False
        # Track notified ProxMenux versions to avoid duplicates
        self._notified_proxmenux_version: str | None = None
        self._notified_proxmenux_beta_version: str | None = None
@@ -3101,7 +3150,24 @@ class PollingCollector:
        NVIDIA driver → ``nvidia_driver_update_available``, etc.).
        """
        now = time.time()
-        if now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL:
+
+        # Detect OFF→ON transition of the LXC update toggle. Without
+        # this, the first polling cycle after service start always sets
+        # the 24h gate — so a user who enables the toggle later (which
+        # is the normal flow, since the toggle defaults to OFF) would
+        # have to wait up to 24h or restart the service before the
+        # detector ran. A one-shot bypass on the transition fixes that
+        # without weakening the 24h cadence in steady state.
+        try:
+            import managed_installs as _mi
+            lxc_enabled_now = _mi._lxc_updates_notification_enabled()
+        except Exception:
+            lxc_enabled_now = False
+        lxc_just_enabled = lxc_enabled_now and not self._lxc_was_enabled
+        self._lxc_was_enabled = lxc_enabled_now
+
+        if (not lxc_just_enabled
+                and now - self._last_managed_check < self.UPDATE_CHECK_INTERVAL):
            return
        self._last_managed_check = now

@@ -3117,8 +3183,15 @@ class PollingCollector:
            print(f"[PollingCollector] managed_installs update run failed: {e}")
            return

+        # Split LXC updates out of the per-item event stream — they get
+        # one grouped notification per cycle instead of one per CT, to
+        # avoid spamming the user when 15 CTs have pending updates the
+        # same day. Non-LXC types keep their existing per-item flow.
+        lxc_updates = [u for u in updates if u.get('type') == 'lxc']
+        other_updates = [u for u in updates if u.get('type') != 'lxc']
+
        seen_ids: set[str] = set()
-        for item in updates:
+        for item in other_updates:
            item_id = item.get('id', '')
            if not item_id:
                continue
@@ -3143,6 +3216,17 @@ class PollingCollector:
                entity_id=f'managed_{item_id}',
            ))

+        # LXC: emit one grouped event with all CTs that have pending
+        # updates. The batch fingerprint is recomputed every cycle and
+        # compared with the last notified one — if the set of CTs or
+        # their per-CT fingerprints changed, we notify again.
+        if lxc_updates:
+            self._emit_lxc_updates_batch(lxc_updates)
+        else:
+            # Empty batch — clear the dedup so a fresh batch later fires
+            # a new notification even with the same CTs/versions.
+            self._notified_lxc_batch = None
+
        # Forget items that no longer have an update available. If
        # the user installs the update and then a later release lands,
        # the dedup state is already cleared so the next notification
@@ -3159,6 +3243,67 @@ class PollingCollector:
            if stale_id not in active_with_update:
                self._notified_managed_updates.pop(stale_id, None)

+    def _emit_lxc_updates_batch(self, items: list[dict]) -> None:
+        """Build and queue a single ``lxc_updates_available`` event
+        covering every CT in ``items`` (apt- or apk-based pending
+        package updates).
+
+        Args:
+            items: registry items of type ``lxc``. Each is read for
+                ``id``, ``name``, ``_vmid`` and its ``update_check``
+                blob (``_count``, ``_security_count``, ``latest``).
+
+        The batch fingerprint joins every CT's ``id=latest`` pair in id
+        order. A new CT entering the set OR an existing CT changing its
+        per-CT fingerprint produces a new batch fingerprint, so the
+        event fires again. A truly stable batch is silenced via the
+        equality check below and nothing is queued.
+        """
+        import hashlib
+
+        # Stable order so the fingerprint is deterministic. `or ''`
+        # keeps the sort from raising when an entry carries id=None.
+        items_sorted = sorted(items, key=lambda x: x.get('id') or '')
+
+        ct_lines: list[str] = []
+        per_ct_fps: list[str] = []
+        total_packages = 0
+        total_security = 0
+
+        for idx, it in enumerate(items_sorted):
+            update = it.get('update_check', {}) or {}
+            count = int(update.get('_count') or 0)
+            sec_count = int(update.get('_security_count') or 0)
+            total_packages += count
+            total_security += sec_count
+
+            # Fall back to the registry id (minus its "lxc:" prefix)
+            # when the entry has no explicit _vmid.
+            vmid = (it.get('_vmid')
+                    or (it.get('id') or '').removeprefix('lxc:')
+                    or '?')
+            name = it.get('name') or f'CT {vmid}'
+            # Each CT renders across two/three lines so the count and the
+            # security count don't compete with the CT label on the same
+            # row — much easier to read in Telegram/Discord at a glance.
+            # A blank line before every CT except the first separates
+            # entries cleanly without a trailing blank at the end.
+            if idx > 0:
+                ct_lines.append("")
+            ct_lines.append(f"🏷️ CT {vmid} ({name}):")
+            ct_lines.append(f"    📦 {count} update(s)")
+            if sec_count:
+                ct_lines.append(f"    🔒 {sec_count} security")
+            per_ct_fps.append(f"{it.get('id', '')}={update.get('latest', '')}")
+
+        batch_fingerprint = '|'.join(per_ct_fps)
+        if self._notified_lxc_batch == batch_fingerprint:
+            return  # same batch as last time — silent
+        self._notified_lxc_batch = batch_fingerprint
+
+        # The built-in hash() of a str is salted per process
+        # (PYTHONHASHSEED), so it would hand the same batch a different
+        # key after every service restart. A SHA-256 digest reduced to
+        # the same 10-digit shape keeps the key stable across runs.
+        digest = hashlib.sha256(batch_fingerprint.encode('utf-8')).hexdigest()
+        batch_key = int(digest, 16) % 10**10
+
+        data = {
+            'hostname': self._hostname,
+            'count': len(items_sorted),
+            'total_packages': total_packages,
+            'security_count': total_security,
+            'ct_list': '\n'.join(ct_lines),
+        }
+        self._queue.put(NotificationEvent(
+            'lxc_updates_available', 'INFO', data,
+            source='polling',
+            entity='node',
+            # Digest-derived so different batches get distinct cooldown keys
+            entity_id=f'lxc_batch_{batch_key}',
+        ))
+
    def _build_managed_install_event(self, item: dict) -> tuple[str, dict]:
        """Translate a registry item into a (event_type, template_data)
        pair. Per-type bodies live here so the registry stays
@@ -973,7 +973,9 @@ class NotificationManager:
        cleanup_interval = 3600  # Cleanup cooldowns every hour
        flush_interval = 5       # Flush aggregation buckets every 5s
        digest_check_interval = 60  # Re-evaluate digest schedule every minute
-        
+        last_quiet_check = 0.0
+        quiet_check_interval = 60   # Re-evaluate per-channel quiet window every minute
+
        while self._running:
            try:
                event = self._event_queue.get(timeout=2)
@@ -990,17 +992,36 @@ class NotificationManager:
                if now_mono - last_digest_check > digest_check_interval:
                    self._maybe_flush_digests()
                    last_digest_check = now_mono
+                # Quiet Hours close → flush buffered sub-CRITICAL events
+                # as a single grouped summary. Has to run even when the
+                # queue is idle, otherwise users who don't generate any
+                # events post-window would never see their summary.
+                if now_mono - last_quiet_check > quiet_check_interval:
+                    self._maybe_flush_quiet_hours()
+                    last_quiet_check = now_mono
                continue
            
            try:
                self._process_event(event)
            except Exception as e:
                print(f"[NotificationManager] Dispatch error: {e}")
-            
+
            # Also flush aggregation after each event
-            if time.monotonic() - last_flush > flush_interval:
+            now_mono = time.monotonic()
+            if now_mono - last_flush > flush_interval:
                self._flush_aggregation()
-                last_flush = time.monotonic()
+                last_flush = now_mono
+            # Re-check digest schedule after each event too. The idle-only
+            # check above misses the daily flush window when the queue stays
+            # busy through the digest_time minute (rare but real: a burst of
+            # journal events arriving at the same minute as the target). The
+            # 23h guard inside _maybe_flush_digests keeps it idempotent.
+            if now_mono - last_digest_check > digest_check_interval:
+                self._maybe_flush_digests()
+                last_digest_check = now_mono
+            if now_mono - last_quiet_check > quiet_check_interval:
+                self._maybe_flush_quiet_hours()
+                last_quiet_check = now_mono
    
    def _flush_aggregation(self):
        """Flush expired aggregation buckets and dispatch summaries."""
@@ -1171,20 +1192,20 @@ class NotificationManager:

            # ── Per-channel quiet hours ──
            # The user marks a window (e.g. 22:00 → 06:00) during which only
-            # CRITICAL events reach this channel. Anything below CRITICAL is
-            # dropped silently — not buffered, not retried — because the
-            # whole point is "don't wake me up at 3 AM unless the disk
-            # exploded". CRITICAL always wins. The window is configured
-            # per-channel; same channel can have different rules from
-            # another. See _in_quiet_hours() for boundary semantics.
+            # CRITICAL events reach this channel. Sub-CRITICAL events are
+            # **buffered** to `quiet_pending` and flushed as a SINGLE grouped
+            # summary when the window closes — so the user doesn't get
+            # paged at 3 AM but also doesn't lose 8h of activity overnight.
+            # CRITICAL always wins. The window is configured per-channel.
+            # See _in_quiet_hours() for boundary semantics.
            # `_dispatch_to_channels` does NOT receive the NotificationEvent
            # object — only the rendered primitives. Using `event.X` here
-            # raised `NameError: name 'event' is not defined` for every
-            # event passing through (silenced by the dispatch loop's broad
-            # except → no notifications EVER delivered after Quiet Hours +
-            # Daily Digest were merged). All community-reported "stopped
-            # receiving notifications after update" cases trace back here.
+            # raised `NameError` for every event passing through, silenced
+            # by the dispatch loop's broad except → no notifications EVER
+            # delivered after Quiet Hours + Daily Digest were merged.
            if severity != 'CRITICAL' and self._in_quiet_hours(ch_name):
+                self._buffer_quiet_event(ch_name, event_type, event_group,
+                                          severity, title, body)
                continue

            # ── Per-channel daily digest ──
@@ -1537,6 +1558,126 @@ class NotificationManager:
        )
        return '\n'.join(lines).rstrip() + '\n'

+    # ─── Quiet Hours buffer + flush ────────────────────────────
+    # Reused infrastructure: `quiet_pending` table (created in
+    # health_persistence) has the same shape as `digest_pending`, so
+    # `_compose_digest_body` renders the summary unchanged. What
+    # differs is the lifecycle — quiet_pending flushes when each
+    # channel's window CLOSES, not at a fixed daily time. We track
+    # that transition via `self._was_in_quiet_hours[ch_name]`.
+
+    def _buffer_quiet_event(self, ch_name: str, event_type: str,
+                            event_group: str, severity: str,
+                            title: str, body: str) -> None:
+        """Append a sub-CRITICAL event to the channel's quiet-hours
+        buffer (``quiet_pending`` table) in SQLite.
+
+        Args:
+            ch_name: channel the event was held back from.
+            event_type: event type identifier stored with the row.
+            event_group: event group stored with the row.
+            severity: severity label stored with the row.
+            title: rendered notification title.
+            body: rendered notification body.
+
+        Best-effort: any failure is printed and swallowed so a DB
+        problem never propagates to the caller. The row timestamp is
+        the current wall-clock time in whole seconds.
+        """
+        conn = None
+        try:
+            conn = sqlite3.connect(str(DB_PATH), timeout=10)
+            conn.execute('PRAGMA journal_mode=WAL')
+            conn.execute('PRAGMA busy_timeout=5000')
+            conn.execute(
+                'INSERT INTO quiet_pending '
+                '(channel, event_type, event_group, severity, ts, title, body) '
+                'VALUES (?, ?, ?, ?, ?, ?, ?)',
+                (ch_name, event_type, event_group, severity,
+                 int(time.time()), title, body),
+            )
+            conn.commit()
+        except Exception as e:
+            print(f"[NotificationManager] quiet_pending write failed: {e}")
+        finally:
+            # Close on the failure path too — previously a failed
+            # execute/commit skipped close() and leaked the connection.
+            if conn is not None:
+                try:
+                    conn.close()
+                except sqlite3.Error as e:
+                    print(f"[NotificationManager] quiet_pending write failed: {e}")
+
+    def _maybe_flush_quiet_hours(self) -> None:
+        """Flush a channel's quiet-hours buffer when its window closes.
+
+        Invoked periodically (~60s) from the dispatch loop. For every
+        configured channel the current window status is compared with
+        the status remembered in ``self._was_in_quiet_hours``; only an
+        in→out transition triggers ``_flush_quiet_for_channel``.
+
+        The remembered state lives in memory only. A channel with no
+        remembered state (first pass after a restart) is merely seeded
+        with its current status, so restarting the Monitor in the
+        middle of a quiet window never produces a bogus end-of-window
+        summary. Flush errors are printed and do not stop the loop.
+        """
+        try:
+            window_state = self._was_in_quiet_hours
+        except AttributeError:
+            window_state = self._was_in_quiet_hours = {}
+
+        for ch_name, channel in list(self._channels.items()):
+            in_window_now = self._in_quiet_hours(ch_name)
+            in_window_before = window_state.get(ch_name)
+            window_state[ch_name] = in_window_now
+
+            # A close transition needs a known previous state that was
+            # "inside" and a current state that is "outside". Seed runs
+            # and still-open windows fall through without flushing.
+            window_just_closed = (
+                in_window_before is not None
+                and not in_window_now
+                and bool(in_window_before)
+            )
+            if not window_just_closed:
+                continue
+
+            try:
+                self._flush_quiet_for_channel(ch_name, channel)
+            except Exception as e:
+                print(f"[NotificationManager] quiet flush failed for "
+                      f"{ch_name}: {e}")
+
+    def _flush_quiet_for_channel(self, ch_name: str, channel: Any) -> None:
+        """Send a single grouped summary of everything buffered for
+        `ch_name` during the just-closed quiet window, then drop the
+        buffer rows.
+
+        Args:
+            ch_name: channel whose ``quiet_pending`` rows are flushed.
+            channel: channel object; its ``send(title, body, severity=,
+                data=)`` is called once with the summary.
+
+        Rows are read oldest-first and rendered with
+        `_compose_digest_body`. Nothing is sent when the buffer is
+        empty. Read, send and cleanup failures are printed; read and
+        send failures return early with the rows left in place.
+        """
+        conn = None
+        try:
+            conn = sqlite3.connect(str(DB_PATH), timeout=10)
+            conn.execute('PRAGMA journal_mode=WAL')
+            cursor = conn.cursor()
+            cursor.execute(
+                'SELECT id, event_type, event_group, ts, title, body '
+                'FROM quiet_pending WHERE channel = ? ORDER BY ts ASC',
+                (ch_name,),
+            )
+            rows = cursor.fetchall()
+        except Exception as e:
+            print(f"[NotificationManager] quiet read failed for {ch_name}: {e}")
+            return
+        finally:
+            # Release the handle on the error path as well.
+            if conn is not None:
+                conn.close()
+
+        if not rows:
+            return
+
+        host = _hostname(self._config)
+        summary_title = (
+            f"{host}: {len(rows)} events buffered during Quiet Hours"
+        )
+        summary_body = self._compose_digest_body(rows)
+
+        try:
+            channel.send(summary_title, summary_body, severity='INFO',
+                         data={'_quiet_hours_summary': True, '_count': len(rows)})
+        except Exception as e:
+            print(f"[NotificationManager] quiet send failed for "
+                  f"{ch_name}: {e}")
+            return
+
+        # Only drop the rows after a successful send so a transient
+        # transport failure (Telegram timeout, SMTP outage) doesn't
+        # lose the user's overnight context. Deleting by id (not by
+        # channel) leaves rows buffered after the SELECT untouched.
+        ids = [r[0] for r in rows]
+        # SQLite caps bound parameters per statement (999 on builds
+        # older than 3.32.0). A long overnight buffer can exceed that,
+        # so delete in chunks inside one transaction.
+        chunk_size = 500
+        conn = None
+        try:
+            conn = sqlite3.connect(str(DB_PATH), timeout=10)
+            conn.execute('PRAGMA journal_mode=WAL')
+            for start in range(0, len(ids), chunk_size):
+                chunk = ids[start:start + chunk_size]
+                placeholders = ','.join('?' * len(chunk))
+                conn.execute(
+                    f'DELETE FROM quiet_pending WHERE id IN ({placeholders})',
+                    chunk,
+                )
+            conn.commit()
+        except Exception as e:
+            print(f"[NotificationManager] quiet cleanup failed for "
+                  f"{ch_name}: {e}")
+        finally:
+            if conn is not None:
+                conn.close()
+
    def _passes_cooldown(self, event: NotificationEvent) -> bool:
        """Check if the event passes cooldown rules WITHOUT stamping.

@@ -2315,6 +2456,18 @@ class NotificationManager:
            ch_cfg: Dict[str, Any] = {
                'enabled': self._config.get(f'{ch_type}.enabled', 'false') == 'true',
                'rich_format': self._config.get(f'{ch_type}.rich_format', 'false') == 'true',
+                # Quiet Hours + Daily Digest live in the same per-channel
+                # namespace but weren't being projected back to the UI —
+                # the toggles round-tripped through POST but the GET only
+                # returned `enabled`/`rich_format` plus channel-specific
+                # config_keys, so after a reload the user saw the toggle
+                # off even though the DB had it on. Reported on .1.10
+                # along with the post-window delivery bug.
+                'quiet_enabled': self._config.get(f'{ch_type}.quiet_enabled', 'false') == 'true',
+                'quiet_start': self._config.get(f'{ch_type}.quiet_start', '22:00'),
+                'quiet_end': self._config.get(f'{ch_type}.quiet_end', '06:00'),
+                'digest_enabled': self._config.get(f'{ch_type}.digest_enabled', 'false') == 'true',
+                'digest_time': self._config.get(f'{ch_type}.digest_time', '09:00'),
            }
            for config_key in info['config_keys']:
                full_key = f'{ch_type}.{config_key}'
@@ -484,6 +484,23 @@ TEMPLATES = {
    },
    
    # ── VM / CT events ──
+    # Phase 1: apt-based update detection inside running Debian/Ubuntu
+    # LXCs. Grouped — one notification per cycle covers every CT with
+    # pending updates. Opt-in (default_enabled=False) because the check
+    # uses `pct exec` to inspect package state inside the user's CTs.
+    # Phase 2 (community-scripts metadata) will extend this without
+    # changing the event type.
+    'lxc_updates_available': {
+        'title': '{hostname}: {count} LXC(s) with package updates available',
+        'body': (
+            '📊 {count} LXC(s) with pending package updates '
+            '(📦 {total_packages} total, 🔒 {security_count} security):\n\n'
+            '{ct_list}'
+        ),
+        'label': 'LXC updates available (experimental)',
+        'group': 'vm_ct',
+        'default_enabled': False,
+    },
    'vm_start': {
        'title': '{hostname}: VM {vmname} ({vmid}) started',
        'body': 'Virtual machine {vmname} (ID: {vmid}) is now running.',
@@ -1109,8 +1126,8 @@ TEMPLATES = {
        'title': '{hostname}: {count} ProxMenux optimization update(s) available',
        'body': (
            '{count} optimization update(s) detected on this host.\n\n'
-            'Tools:\n{tool_list}\n\n'
-            'How to apply:\n'
+            '🛠️ Tools:\n{tool_list}\n\n'
+            '💡 How to apply:\n'
            '  • ProxMenux Monitor → Settings → ProxMenux Optimizations\n'
            '  • Or run the post-install menu (option 2) → "Apply available updates"'
        ),
@@ -1129,12 +1146,12 @@ TEMPLATES = {
    'secure_gateway_update_available': {
        'title': '{hostname}: {app_name} update available — v{latest_version}',
        'body': (
-            '{app_name} (managed by ProxMenux) has {package_count} package update(s) '
+            '{app_name} (managed by ProxMenux) has 📦 {package_count} package update(s) '
            'pending in its container.\n'
-            'Current Tailscale: v{current_version}  →  Latest: v{latest_version}\n\n'
-            'Open ProxMenux Monitor > Settings > Secure Gateway and click '
+            '🔹 Current Tailscale: v{current_version}  →  🟢 Latest: v{latest_version}\n\n'
+            '💡 Open ProxMenux Monitor > Settings > Secure Gateway and click '
            '"Update" to apply.\n\n'
-            'Packages:\n{package_list}'
+            '🗂️ Packages:\n{package_list}'
        ),
        'label': 'Secure Gateway update available',
        'group': 'updates',
@@ -1147,10 +1164,10 @@ TEMPLATES = {
        'title': '{hostname}: NVIDIA driver update available — v{latest_version}',
        'body': (
            'A newer NVIDIA driver compatible with kernel {kernel} is available.\n'
-            'Currently installed: v{current_version}\n'
-            'Latest available:    v{latest_version}\n\n'
+            '🔹 Currently installed: v{current_version}\n'
+            '🟢 Latest available:    v{latest_version}\n\n'
            '{upgrade_reason}\n\n'
-            'To reinstall:\n'
+            '💡 To reinstall:\n'
            '  • From the ProxMenux post-install menu: {menu_label}\n\n'
            'Reinstalling rebuilds the DKMS module against the running kernel and '
            'requires a reboot to load the new driver.'
@@ -1465,6 +1482,7 @@ CATEGORY_EMOJI = {
 # Event-specific title icons  (override category default when present)
 EVENT_EMOJI = {
    # VM / CT
+    'lxc_updates_available': '\U0001F4E6',     # package — pending CT updates
    'vm_start':             '\u25B6\uFE0F',    # play button
    'vm_start_warning':     '\u26A0\uFE0F',     # warning sign - started with warnings
    'vm_stop':              '\u23F9\uFE0F',     # stop button
@@ -1768,6 +1786,14 @@ Your job: translate alerts into {language} and enrich them with context when pro
 ═══ ABSOLUTE CONSTRAINTS (NO EXCEPTIONS) ═══
 - NO HALLUCINATIONS: Do not invent causes, solutions, or facts not present in the provided data
 - NO SPECULATION: If something is unclear, state what IS known, not what MIGHT be
+- NO FILLER LINES: Every output line must derive from the input message, the journal context,
+  or the known-error database. NEVER add generic statements like "Event detected during normal
+  operation", "No further issues", or padding lines just to fill space. If a field has no evidence,
+  OMIT it — a shorter output is always better than invented content.
+- 📝 Log lines: ONLY include when the journal context contains an actual relevant log line.
+  Convey its meaning faithfully, do not invent one. If no relevant log exists, OMIT the 📝 line.
+- ⏱️ Duration/timing lines: ONLY for backup/migration durations explicitly present in the input.
+  NEVER use ⏱️ for vague "event detected at X" filler.
 - NO CONVERSATIONAL TEXT: Never write "Here is...", "I've translated...", "Let me explain..."
 - ONLY use information from: the message, journal context, and known error database (if provided)

@@ -1884,7 +1910,12 @@ Your goal is to maintain the original structure of the message while using emoji
 ESPECIALLY when adding new context, formatting technical data, or writing tips.

 RULES:
-1. PRESERVE BASE STRUCTURE: Respect the original fields and layout provided in the input message.
+1. PRESERVE BASE STRUCTURE AND INPUT EMOJIS: Respect the original fields and layout provided in
+   the input message. **CRITICAL: every emoji already present in the input (📊, 🏷️, 📦, 🔒, 🛠️,
+   💡, ⚠️, ✨, 🌐, 🔥, 💧, 📝, ⏱️, etc.) MUST appear in the output, in the same position relative
+   to its label.** Translating the surrounding words is fine; deleting or relocating the emoji is
+   not. You may add additional context-appropriate emojis from BODY EMOJIS below, but never strip
+   the ones the template already provides.
 2. ENHANCE WITH ICONS: Place emojis at the START of a line to identify the data type.
 3. NEW CONTEXT: When adding journal info, SMART data, or known errors, use appropriate icons to make it readable.
 4. NO SPAM: Do not put emojis in the middle or end of sentences. Use 1-3 emojis at START of lines where they add clarity. Combine when meaningful (💾✅ backup ok).