Update Beta 1.2.1.2

2026-05-22 00:24:48 +00:00 · 2026-05-20 19:47:42 +02:00
parent 4112323961
commit 298cd2c6d4
15 changed files with 781 additions and 109 deletions
@@ -1 +1 @@
-150694a49a5b0a4546a2bf5fedcc0914d37666d0cdeac1d9fdc58793c131b4bd  ProxMenux-1.2.1.1-beta.AppImage
+0d74347d2feae2be4b8c6d62d6cd9b1b15b94ef431c088b5580560f6b4751594  ProxMenux-1.2.1.2-beta.AppImage
@@ -271,7 +271,7 @@ export function Login({ onLogin }: LoginProps) {
          </form>
        </div>

-        <p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.0</p>
+        <p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.1.2-beta</p>
      </div>
    </div>
  )
@@ -814,7 +814,7 @@ export function ProxmoxDashboard() {
        </Tabs>

        <footer className="mt-8 md:mt-12 pt-4 md:pt-6 border-t border-border text-center text-xs md:text-sm text-muted-foreground">
-          <p className="font-medium mb-2">ProxMenux Monitor v1.2.0</p>
+          <p className="font-medium mb-2">ProxMenux Monitor v1.2.1.2-beta</p>
          <p>
            <a
              href="https://ko-fi.com/macrimi"
@@ -6,7 +6,7 @@ import { Dialog, DialogContent, DialogTitle } from "./ui/dialog"
 import { X, Sparkles, Thermometer, Activity, HardDrive, Shield, Globe, Cpu, Zap, Sliders, Wrench, RefreshCw, Server } from "lucide-react"
 import { Checkbox } from "./ui/checkbox"

-const APP_VERSION = "1.2.1.1-beta" // Sync with AppImage/package.json
+const APP_VERSION = "1.2.1.2-beta" // Sync with AppImage/package.json

 interface ReleaseNote {
  date: string
@@ -18,6 +18,30 @@ interface ReleaseNote {
 }

 export const CHANGELOG: Record<string, ReleaseNote> = {
+  "1.2.1.2-beta": {
+    date: "May 20, 2026",
+    changes: {
+      added: [
+        "Coral TPU installer - Uninstall path mirroring the NVIDIA flow, and registry-driven update notifications for both the PCIe gasket-dkms driver (tracked against feranick/gasket-driver) and the USB libedgetpu1 runtime (tracked via apt)",
+        "Disk I/O severity tiers - Sliding 24h window classifies dmesg ATA/SCSI errors into silent (0-10), WARNING (11-100) and CRITICAL (100+ or any hard error like UNC / Buffer I/O / Sense Key Hardware Error), so quiet days stay quiet and a single Buffer I/O event still pages immediately",
+        "Quiet Hours buffering - Events suppressed during a channel's quiet window are now persisted to SQLite and released as a grouped summary when the window closes, instead of being silently dropped",
+      ],
+      changed: [
+        "Burst aggregation wording - Burst summaries now report only the additional events that arrived after the initial individual alert, so the operator no longer sees the first event counted twice (\"+N more X in window\" instead of the old \"N X in window\" overlap)",
+        "Known-error classifier - Word-boundary regex on ATA/UNC patterns so kernel messages like nvidia_uvm:FatalError are no longer misclassified as ATA cable issues",
+        "Health journal context - Excludes proxmenux-monitor.service systemd lines so internal watchdog SIGKILLs no longer leak into the body of unrelated kernel events",
+        "Resolved notifications severity - The \"previous severity\" now matches the severity the user actually saw in the notification, not whatever escalated value silently landed in the DB during the 24h same-key cooldown",
+        "log2ram apply path - The auto/update flow now restarts log2ram after writing the new size, so a configured 512M actually takes effect on the running tmpfs (previously left at 128M until a manual restart)",
+        "VM/CT control errors - Failed start/stop/restart now surfaces the real pvesh stderr (e.g. \"no space left on device\") in the UI toast and fires a vm_fail / ct_fail notification, instead of a bare 500 INTERNAL SERVER ERROR",
+        "Mobile design of Quiet Hours / Daily Digest - Time inputs are now full-height with inline labels instead of the cramped grid layout that overflowed on narrow screens",
+      ],
+      fixed: [
+        "ATA disk error not recorded - disk_observations is now written before the SMART gate, so transient errors that don't yet trip SMART still build the per-disk history",
+        "Quiet Hours toggle not persisting - get_settings now returns the per-channel quiet_*/digest_* fields so the toggle's state reloads correctly after a refresh",
+        "Frontend 401 cascade - Login screen no longer swallows the 401 forever after a brief stale-token state; the dedup flag is cleared on mount and on successful login",
+      ],
+    },
+  },
  "1.2.1.1-beta": {
    date: "May 9, 2026",
    changes: {
@@ -3584,7 +3584,7 @@ ${observationsHtml}
  <!-- Footer -->
 <div class="rpt-footer">
  <div>Report generated by ProxMenux Monitor</div>
-  <div>ProxMenux Monitor v1.2.0</div>
+  <div>ProxMenux Monitor v1.2.1.2-beta</div>
 </div>

 </body>
@@ -1,6 +1,6 @@
 {
  "name": "ProxMenux-Monitor",
-  "version": "1.2.1.1-beta",
+  "version": "1.2.1.2-beta",
  "description": "Proxmox System Monitoring Dashboard",
  "private": true,
  "scripts": {
@@ -1026,9 +1026,16 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str:
        # line like "[HealthPersistence] Database initialized with 13 tables"
        # leaks into the AI context because grep -iE 'ata' matches the
        # substring "ata" in "dATAbase". Self-logs are never system evidence.
+        #
+        # Also exclude systemd actions on the proxmenux-monitor unit itself
+        # (e.g. "proxmenux-monitor.service: Killed process 2010621 with
+        # signal SIGKILL"). When a kernel event fires within the same
+        # 10-min window as one of our own watchdog kills, the SIGKILL
+        # line would otherwise leak into the journal_context and the AI
+        # would paste it under the unrelated event as "📝 Log: …".
        cmd = (
            f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | "
-            f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]' | "
+            f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]|proxmenux-monitor\\.service' | "
            f"grep -iE '{pattern}' | tail -n 30"
        )
        
@@ -10344,7 +10351,7 @@ def api_health():
    return jsonify({
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
-        'version': '1.2.1.1-beta'
+        'version': '1.2.1.2-beta'
    })

 # ─── User-configurable health thresholds ─────────────────────────────────────
@@ -10737,7 +10744,7 @@ def api_info():
    """Root endpoint with API information"""
    return jsonify({
        'name': 'ProxMenux Monitor API',
-        'version': '1.2.1.1-beta',
+        'version': '1.2.1.2-beta',
        'endpoints': [
            '/api/system',
            '/api/system-info',
@@ -11387,7 +11394,7 @@ if __name__ == '__main__':
        try:
            import sqlite3
            from pathlib import Path
-            MONITOR_VERSION = '1.2.1.1-beta'
+            MONITOR_VERSION = '1.2.1.2-beta'
            db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
            if db_path.exists():
                conn = sqlite3.connect(str(db_path), timeout=10)
@@ -156,6 +156,90 @@ def _detect_nvidia_xfree86() -> Optional[dict]:
    }


+# ── Coral TPU host driver (PCIe gasket-dkms + USB libedgetpu1) ──
+#
+# Two install paths share the same registry type (`coral_host`) because
+# the user thinks of them as one "Coral driver" install. The detector
+# returns one entry per path that is actually present on the host, so a
+# system with both M.2 and USB Coral devices gets two entries with
+# independent update streams (gasket-dkms from feranick/gasket-driver on
+# GitHub, libedgetpu1-std/-max from Google's apt repo).
+
+
+def _detect_coral_host() -> list[dict]:
+    """Detect the Coral TPU host drivers installed on this machine.
+
+    Returns a list holding zero, one or two registry entries, all with
+    ``type`` set to ``coral_host``:
+
+    * ``coral-host-pcie`` when a gasket-dkms version is found, either
+      through ``dpkg-query`` or, failing that, in ``dkms status`` output.
+    * ``coral-host-usb`` when ``libedgetpu1-std`` or ``libedgetpu1-max``
+      is reported as installed by ``dpkg-query``.
+
+    A missing tool, an OS error or a timeout while probing is treated as
+    "not installed" and never raised to the caller.
+    """
+    out: list[dict] = []
+
+    # PCIe / M.2 — gasket-dkms package version, falling back to the
+    # registered DKMS version if the package was force-removed but the
+    # built modules still exist.
+    pcie_version: Optional[str] = None
+    try:
+        r = subprocess.run(
+            ["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
+            capture_output=True, text=True, timeout=3,
+        )
+        if r.returncode == 0 and "ok installed" in r.stdout:
+            pcie_version = r.stdout.split("|", 1)[1].strip()
+    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
+        pass
+    if not pcie_version:
+        try:
+            r = subprocess.run(
+                ["dkms", "status"], capture_output=True, text=True, timeout=3,
+            )
+            if r.returncode == 0:
+                for line in r.stdout.splitlines():
+                    if line.startswith("gasket"):
+                        # "gasket, 1.0, ..." or "gasket/1.0, ...". The
+                        # optional whitespace after the separator is what
+                        # lets the comma + space form match; without it
+                        # the version capture starts on the space and
+                        # fails.
+                        m = re.match(r"^gasket[, /]\s*([^,\s]+)", line)
+                        if m:
+                            pcie_version = m.group(1)
+                            break
+        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
+            pass
+    if pcie_version:
+        out.append({
+            "id": "coral-host-pcie",
+            "type": "coral_host",
+            "name": "Coral TPU Driver (gasket-dkms)",
+            "current_version": pcie_version,
+            "menu_label": "GPU & TPU → Coral TPU",
+            "menu_script": "scripts/gpu_tpu/install_coral.sh",
+            "_coral_variant": "pcie",
+        })
+
+    # USB — libedgetpu1-std (default) or libedgetpu1-max if the user
+    # opted into the overclocked runtime. Either one means the USB
+    # path is installed.
+    usb_version: Optional[str] = None
+    usb_pkg: Optional[str] = None
+    for pkg in ("libedgetpu1-std", "libedgetpu1-max"):
+        try:
+            r = subprocess.run(
+                ["dpkg-query", "-W", "-f=${Status}|${Version}", pkg],
+                capture_output=True, text=True, timeout=3,
+            )
+        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
+            continue
+        if r.returncode == 0 and "ok installed" in r.stdout:
+            usb_version = r.stdout.split("|", 1)[1].strip()
+            usb_pkg = pkg
+            break
+    if usb_version and usb_pkg:
+        out.append({
+            "id": "coral-host-usb",
+            "type": "coral_host",
+            "name": f"Coral TPU Runtime ({usb_pkg})",
+            "current_version": usb_version,
+            "menu_label": "GPU & TPU → Coral TPU",
+            "menu_script": "scripts/gpu_tpu/install_coral.sh",
+            "_coral_variant": "usb",
+            "_coral_pkg": usb_pkg,
+        })
+
+    return out
+
+
 def _detect_oci_apps() -> list[dict]:
    """Bridge to the OCI manager so every OCI-installed app shows up
    in the registry without a per-app detector here. The OCI manager
@@ -350,6 +434,7 @@ def _detect_lxc_containers() -> list[dict]:
 # framework normalises both shapes.
 _DETECTORS: list[Callable[[], Any]] = [
    _detect_nvidia_xfree86,
+    _detect_coral_host,
    _detect_oci_apps,
    _detect_lxc_containers,
 ]
@@ -834,9 +919,171 @@ def _check_lxc_updates(entry: dict) -> dict:
    }


+# ── Coral driver checker ──
+#
+# Two upstreams to track:
+#
+#   PCIe (gasket-dkms) → feranick/gasket-driver on GitHub. The fork is
+#       actively maintained; releases are tagged like "v1.0-22". We pull
+#       the latest tag from the GitHub API and compare against the
+#       installed gasket-dkms Debian version. Because the Debian version
+#       string ("1.0-18") doesn't perfectly match the upstream tag
+#       ("v1.0-22"), we normalise both sides to the trailing "-N" build
+#       number for the comparison. Strict semver isn't workable here.
+#
+#   USB (libedgetpu1-std/-max) → Google's apt repo. `apt-cache policy`
+#       reports installed + candidate versions in one shot, no internet
+#       round-trip required (apt's own cache is the canonical answer).
+#
+# Cache TTL for the GitHub call is 7 days — feranick's release cadence
+# is roughly monthly, matching NVIDIA's pattern. The cache lives in
+# memory so AppImage restarts refresh it for free.
+
+_CORAL_GASKET_REPO = "feranick/gasket-driver"
+_CORAL_CACHE_TTL = 7 * 86400
+_coral_gasket_cache: dict[str, Any] = {"latest_tag": None, "fetched_at": 0}
+
+
+def _coral_build_number(s: str) -> int:
+    """Return the numeric build suffix of a Coral version string.
+
+    The build number is the run of digits after the first ``-`` that is
+    immediately followed by a digit, so the upstream tag forms
+    ``v1.0-22`` / ``1.0-22`` give 22 and the Debian package form
+    ``1.0-18+pmx1`` gives 18. An empty value, or one with no ``-N``
+    segment, yields 0, which ranks it below every tagged release.
+    """
+    found = re.search(r"-(\d+)", s) if s else None
+    if found is None:
+        return 0
+    try:
+        build = int(found.group(1))
+    except (ValueError, TypeError):
+        build = 0
+    return build
+
+
+def _fetch_gasket_latest_tag(force: bool = False) -> Optional[str]:
+    """Return the newest gasket-driver tag name, using the in-memory cache.
+
+    A cached tag younger than ``_CORAL_CACHE_TTL`` seconds is returned
+    without any network access unless ``force`` is true. Otherwise up to
+    five tags are requested from the GitHub API and the one with the
+    highest build number (per ``_coral_build_number``) is cached and
+    returned.
+
+    Every failure — request error, unexpected payload, or a payload with
+    no usable tag name — falls back to the previously cached tag, which
+    is ``None`` when nothing has been fetched successfully yet.
+    """
+    now = time.time()
+    if not force and _coral_gasket_cache["latest_tag"] and \
+       now - _coral_gasket_cache["fetched_at"] < _CORAL_CACHE_TTL:
+        return _coral_gasket_cache["latest_tag"]
+    url = f"https://api.github.com/repos/{_CORAL_GASKET_REPO}/tags?per_page=5"
+    try:
+        req = urllib.request.Request(
+            url,
+            headers={
+                "User-Agent": "ProxMenux-Monitor/1.0",
+                "Accept": "application/vnd.github+json",
+            },
+        )
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            tags = json.loads(resp.read().decode("utf-8", errors="replace"))
+    except Exception as e:
+        print(f"[ProxMenux] gasket-driver tag fetch failed: {e}")
+        return _coral_gasket_cache.get("latest_tag")
+    if not isinstance(tags, list) or not tags:
+        return _coral_gasket_cache.get("latest_tag")
+    # Pick the tag with the highest trailing build number — feranick's
+    # tags are not strictly chronological, occasionally rebuilt.
+    best: Optional[str] = None
+    best_n = -1
+    for t in tags:
+        if not isinstance(t, dict):
+            continue
+        name = t.get("name")
+        if not isinstance(name, str) or not name:
+            # A missing, empty or non-string name can never be a usable
+            # tag, and must not shadow a later valid one.
+            continue
+        n = _coral_build_number(name)
+        if n > best_n:
+            best_n = n
+            best = name
+    if not best:
+        # Same fallback as the other failure paths: keep serving the
+        # last good tag rather than reporting "nothing found".
+        return _coral_gasket_cache.get("latest_tag")
+    _coral_gasket_cache["latest_tag"] = best
+    _coral_gasket_cache["fetched_at"] = now
+    return best
+
+
+def _apt_cache_candidate(pkg: str) -> Optional[str]:
+    """Look up the apt candidate version of ``pkg`` in the local cache.
+
+    Runs ``apt-cache policy`` and returns the value of its
+    ``Candidate:`` line. ``None`` is returned when the command is
+    unavailable, times out, exits non-zero, or reports no candidate
+    (no such line, an empty value, or ``(none)``).
+    """
+    try:
+        proc = subprocess.run(
+            ["apt-cache", "policy", pkg],
+            capture_output=True, text=True, timeout=5,
+        )
+    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
+        return None
+    if proc.returncode != 0:
+        return None
+    for raw in proc.stdout.splitlines():
+        stripped = raw.strip()
+        if not stripped.startswith("Candidate:"):
+            continue
+        value = stripped.split(":", 1)[1].strip()
+        if value and value != "(none)":
+            return value
+    return None
+
+
+def _check_coral_host(entry: dict) -> dict:
+    """Report whether a newer Coral driver or runtime exists for ``entry``.
+
+    ``entry`` is a detector record; its ``_coral_variant`` selects the
+    upstream that is consulted:
+
+    * ``pcie`` — the latest gasket-driver tag, compared with the
+      installed version by build number.
+    * ``usb`` — the apt candidate of ``_coral_pkg`` (default
+      ``libedgetpu1-std``), compared with ``dpkg --compare-versions``.
+
+    The result always carries ``available``, ``latest``, ``last_check``
+    and ``error``. Completed checks also echo the variant, plus the
+    package name for the USB path.
+    """
+    variant = entry.get("_coral_variant") or ""
+    current = entry.get("current_version") or ""
+
+    def _failure(message: str) -> dict:
+        # Common shape for every outcome where no comparison was made.
+        return {"available": False, "latest": None,
+                "last_check": _now_iso(),
+                "error": message}
+
+    if variant == "pcie":
+        latest_tag = _fetch_gasket_latest_tag()
+        if not latest_tag:
+            return _failure("could not fetch gasket-driver tags")
+        # Tag and Debian version strings differ in form, so only their
+        # build numbers are compared.
+        has_update = _coral_build_number(latest_tag) > _coral_build_number(current)
+        return {
+            "available": has_update,
+            "latest": latest_tag if has_update else None,
+            "last_check": _now_iso(),
+            "error": None,
+            "_coral_variant": "pcie",
+        }
+
+    if variant == "usb":
+        pkg = entry.get("_coral_pkg") or "libedgetpu1-std"
+        candidate = _apt_cache_candidate(pkg)
+        if not candidate:
+            return _failure(f"apt-cache policy returned no candidate for {pkg}")
+        # dpkg's own comparator decides whether the candidate is newer
+        # than the installed version. Only when dpkg cannot be run does
+        # the check degrade to "candidate differs from installed".
+        try:
+            outcome = subprocess.run(
+                ["dpkg", "--compare-versions", current, "lt", candidate],
+                capture_output=True, timeout=3,
+            )
+        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
+            has_update = candidate != current
+        else:
+            has_update = outcome.returncode == 0
+        return {
+            "available": has_update,
+            "latest": candidate if has_update else None,
+            "last_check": _now_iso(),
+            "error": None,
+            "_coral_variant": "usb",
+            "_coral_pkg": pkg,
+        }
+
+    return _failure(f"unknown coral variant: {variant}")
+
+
 _CHECKERS: dict[str, Callable[[dict], dict]] = {
    "oci_app": _check_oci_app,
    "nvidia_xfree86": _check_nvidia_xfree86,
+    "coral_host": _check_coral_host,
    "lxc": _check_lxc_updates,
 }

@@ -890,7 +1137,8 @@ def check_for_updates(force: bool = False) -> list[dict]:
            # the LXC checker's counts dropped on the floor and the
            # frontend badge couldn't render.
            for extra_key in ("_packages", "_upgrade_kind", "_kernel",
-                              "_kernel_note", "_count", "_security_count"):
+                              "_kernel_note", "_count", "_security_count",
+                              "_coral_variant", "_coral_pkg"):
                if extra_key in result:
                    it["update_check"][extra_key] = result[extra_key]

@@ -382,9 +382,40 @@ class JournalWatcher:
        self._recent_events: Dict[str, float] = {}
        self._dedup_window = 30  # seconds

-        # 24h anti-cascade for disk I/O + filesystem errors (keyed by device name)
+        # 24h anti-cascade for disk I/O + filesystem errors. The dict
+        # key includes a tier suffix (`sdh:warning`, `sdh:critical`)
+        # so a disk in WARNING cooldown can still escalate to CRITICAL
+        # within the same 24h if the rate accelerates.
        self._disk_io_notified: Dict[str, float] = {}
        self._DISK_IO_COOLDOWN = 86400  # 24 hours
+
+        # Sliding 24h window of ATA error timestamps per disk, used to
+        # decide notification severity tier. Don't blindly trust the
+        # SMART firmware self-report — the Google "Failure Trends"
+        # paper showed ~36% of failed drives gave no SMART warning.
+        # Rate-based escalation catches the dying drives that SMART
+        # would never flag until they were already bricked.
+        from collections import deque as _deque
+        self._disk_error_window: Dict[str, "_deque[float]"] = {}
+        self._DISK_ERROR_WINDOW_SECS = 86400  # 24h
+        # Tiers calibrated for homelab/SMB Proxmox usage:
+        #  * 0-10/24h  → transient noise (cable rattle, sleep/wake,
+        #    PHY retrain). Silent observation only.
+        #  * 11-100/24h → WARNING. Notify once per 24h.
+        #  * 101+/24h   → CRITICAL. Active failure.
+        # Hard errors (Buffer I/O, UNC, medium error, unrecovered read)
+        # are CRITICAL on the FIRST occurrence regardless of count —
+        # those are uncorrectable data losses, not transient noise.
+        self._DISK_TIER_WARNING = 10
+        self._DISK_TIER_CRITICAL = 100
+        # Hard-error pattern: matches any of the kernel-reported
+        # signals that mean data was lost or could not be recovered.
+        self._DISK_HARD_ERR_RE = re.compile(
+            r'(Buffer I/O error|UNC\b|Medium Error|medium error'
+            r'|Unrecovered read error|unrecovered read error'
+            r'|Sense Key.*Hardware Error)',
+            re.IGNORECASE,
+        )
        
        # Track when the last full backup job notification was sent
        # so we can suppress per-guest "Starting Backup of VM ..." noise
@@ -1046,73 +1077,107 @@ class JournalWatcher:
            else:
                resolved = re.sub(r'\d+$', '', raw_device) if raw_device.startswith('sd') else raw_device
            
-            # ── ALWAYS persist the observation, regardless of SMART ──
+            # ── ALWAYS persist the observation, regardless of severity ──
            # The disk_observation_contract is explicit (memory note
            # disk-observation-contract): every kernel-surfaced disk
-            # error must be recorded in disk_observations *even when
-            # SMART reports PASSED*. Silent errors on a "healthy" disk
-            # are exactly the early-warning signal the modal histogram
-            # exists to surface ("324 connection errors on this disk").
-            # Previously this line lived AFTER a `return` gate keyed on
-            # smart_health != 'FAILED', so the 3162 ata8 errors on
-            # .1.10 (PASSED SMART) all dropped on the floor instead of
-            # accumulating in the per-disk audit history.
+            # error must be recorded in disk_observations. The modal
+            # histogram is the per-disk audit trail; it must reflect
+            # everything the kernel saw, even noise.
            self._record_disk_io_observation(resolved, msg)

-            # ── Gate 1: only NOTIFY when SMART reports FAILED ──
-            # Observation is already saved above. We avoid spamming a
-            # CRITICAL notification for transient ATA/SCSI noise on
-            # otherwise-healthy disks — the modal histogram surfaces
-            # those without paging the user at 3 AM.
+            # ── Update sliding 24h rate window for this disk ──
+            now = time.time()
+            from collections import deque as _deque
+            window = self._disk_error_window.setdefault(resolved, _deque())
+            window.append(now)
+            cutoff = now - self._DISK_ERROR_WINDOW_SECS
+            while window and window[0] < cutoff:
+                window.popleft()
+            rate_24h = len(window)
+
+            # ── Decide severity tier ──
+            #   * hard error (UNC, Buffer I/O, medium, unrecovered read)
+            #       → CRITICAL on first occurrence, no count threshold.
+            #       These are uncorrectable: data is gone.
+            #   * SMART self-report FAILED → CRITICAL (firmware admits it).
+            #   * rate_24h > _DISK_TIER_CRITICAL → CRITICAL (active failure
+            #       even if SMART still says PASSED).
+            #   * rate_24h > _DISK_TIER_WARNING → WARNING (suspicious,
+            #       worth a heads-up).
+            #   * Otherwise → silent observation only (transient noise).
+            is_hard_error = bool(self._DISK_HARD_ERR_RE.search(msg))
            smart_health = self._quick_smart_health(resolved)
-            if smart_health != 'FAILED':
+            if is_hard_error or smart_health == 'FAILED' or rate_24h > self._DISK_TIER_CRITICAL:
+                tier = 'critical'
+            elif rate_24h > self._DISK_TIER_WARNING:
+                tier = 'warning'
+            else:
+                # Silent — observation already saved, that's enough.
                return

-            # ── Gate 2: 24-hour dedup per device ──
-            # Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
-            # If user dismissed the error, _clear_disk_io_cooldown() removed the DB
-            # entry, so we should refresh from DB to get the real state.
-            now = time.time()
-            
-            # First check in-memory cache
-            last_notified = self._disk_io_notified.get(resolved, 0)
-            
+            # ── 24h anti-cascade per (device, tier) ──
+            # Independent cooldown per tier so a disk that fires WARNING
+            # at noon can still escalate to CRITICAL the same day when
+            # the rate jumps past _DISK_TIER_CRITICAL — they're
+            # different keys.
+            cooldown_key = f'{resolved}:{tier}'
+            last_notified = self._disk_io_notified.get(cooldown_key, 0)
            if now - last_notified < self._DISK_IO_COOLDOWN:
-                # In-memory says we already notified. But user might have dismissed
-                # the error, which clears the DB. Re-check DB to be sure.
-                db_ts = self._get_disk_io_cooldown_from_db(resolved)
+                # In-memory says cooldown active. Re-verify in DB in
+                # case the user dismissed (which clears the DB entry).
+                db_ts = self._get_disk_io_cooldown_from_db(cooldown_key)
                if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN:
-                    return  # DB confirms cooldown is still active
-                # DB says cooldown was cleared (user dismissed) - proceed to notify
-                # Update in-memory cache
-                del self._disk_io_notified[resolved]
-            
-            self._disk_io_notified[resolved] = now
-            self._save_disk_io_notified(resolved, now)
-            
+                    return
+                # Dismissed → DB cleared → proceed to notify and refresh state.
+                del self._disk_io_notified[cooldown_key]
+
+            self._disk_io_notified[cooldown_key] = now
+            self._save_disk_io_notified(cooldown_key, now)
+
            # ── Build enriched notification ──
            device_info = self._identify_block_device(resolved)
-            
+
            parts = []
-            parts.append(f'Disk /dev/{resolved}: I/O errors detected')
-            parts.append('SMART status: FAILED -- disk is failing')
-            
+            if tier == 'critical':
+                if is_hard_error:
+                    parts.append(f'Disk /dev/{resolved}: UNRECOVERABLE error detected')
+                elif smart_health == 'FAILED':
+                    parts.append(f'Disk /dev/{resolved}: SMART reports FAILED')
+                else:
+                    parts.append(
+                        f'Disk /dev/{resolved}: high I/O error rate '
+                        f'({rate_24h} errors in last 24h)'
+                    )
+            else:  # warning
+                parts.append(
+                    f'Disk /dev/{resolved}: elevated I/O error rate '
+                    f'({rate_24h} errors in last 24h)'
+                )
+
+            parts.append(f'SMART status: {smart_health}')
+
            if device_info:
                parts.append(f'Device: {device_info}')
            else:
                parts.append(f'Device: /dev/{resolved}')
-            
+
            # Translate the raw kernel error code
            detail = self._translate_ata_error(msg)
            if detail:
                parts.append(f'Error detail: {detail}')
-            
-            parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.')
+
+            if tier == 'critical':
+                parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.')
+            else:
+                parts.append(
+                    f'Action: Monitor /dev/{resolved} closely. '
+                    f'Plan a backup verification and replacement if rate grows.'
+                )
            parts.append(f'  Check details: smartctl -a /dev/{resolved}')
-            
+
            enriched = '\n'.join(parts)
            dev_display = f'/dev/{resolved}'
-            
+
            # Capture journal context for AI enrichment.
            # `raw_device` is the original ATA-port literal extracted by the regex
            # (e.g. "ata8"). The previous code used a name `ata_port` that was
@@ -1123,12 +1188,15 @@ class JournalWatcher:
                keywords=[resolved, raw_device, 'I/O error', 'exception', 'SMART'],
                lines=30
            )
-            
-            self._emit('disk_io_error', 'CRITICAL', {
+
+            severity = 'CRITICAL' if tier == 'critical' else 'WARNING'
+            self._emit('disk_io_error', severity, {
                'device': dev_display,
                'reason': enriched,
                'hostname': self._hostname,
-                'smart_status': 'FAILED',
+                'smart_status': smart_health,
+                'rate_24h': rate_24h,
+                'tier': tier,
                '_journal_context': journal_ctx,
            }, entity='disk', entity_id=resolved)
            return
@@ -2229,6 +2297,17 @@ class PollingCollector:
        self._notified_proxmenux_beta_version: str | None = None
        # In-memory cache: error_key -> last notification timestamp
        self._last_notified: Dict[str, float] = {}
+        # In-memory cache: error_key -> severity actually sent in the last
+        # notification. Decoupled from `_known_errors[k].severity` (which
+        # always reflects the most-recent DB row) so a recovery message
+        # quotes the same severity the user saw. Without this, an error
+        # that fired WARNING, silently escalated to CRITICAL during its
+        # 24h same-key cooldown, then resolved, would be reported as
+        # "previous severity: CRITICAL" — confusing the operator who only
+        # ever saw the WARNING. Not persisted across restarts: the
+        # post-restart first-poll guard (`_first_poll_done`) already
+        # suppresses spurious recoveries.
+        self._notified_severity: Dict[str, str] = {}
        # Track known error keys + metadata so we can detect new ones AND emit recovery
        # Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
        self._known_errors: Dict[str, dict] = {}
@@ -2572,6 +2651,11 @@ class PollingCollector:
            # Track that we notified
            self._last_notified[error_key] = now
            self._persist_last_notified(error_key, now)
+            # Snapshot the severity we actually delivered, so a future
+            # recovery message quotes the same value the user saw — not
+            # whatever silently-escalated severity ended up in the DB
+            # during the same-key 24h cooldown window.
+            self._notified_severity[error_key] = emit_severity
        
        # ── Emit recovery notifications for errors that resolved ──
        resolved_keys = set(self._known_errors.keys()) - set(current_keys.keys())
@@ -2674,24 +2758,32 @@ class PollingCollector:
            else:
                clean_reason = 'Condition resolved'
            
+            # `original_severity` must match what the user actually saw
+            # in the most-recent notification for this error, not the
+            # latest DB severity. See the `_notified_severity` comment in
+            # __init__ for the failure mode this avoids.
+            original_severity = self._notified_severity.get(
+                key, old_meta.get('severity', 'WARNING'),
+            )
            data = {
                'hostname': self._hostname,
                'category': category,
                'reason': clean_reason,
                'error_key': key,
                'severity': 'OK',
-                'original_severity': old_meta.get('severity', 'WARNING'),
+                'original_severity': original_severity,
                'first_seen': first_seen,
                'duration': duration,
                'is_recovery': True,
            }
-            
+
            self._queue.put(NotificationEvent(
                'error_resolved', 'OK', data, source='health',
                entity=entity, entity_id=eid or key,
            ))
-            
+
            self._last_notified.pop(key, None)
+            self._notified_severity.pop(key, None)
        
        self._known_errors = current_keys
        self._first_poll_done = True
@@ -3356,6 +3448,41 @@ class PollingCollector:
            }
            return 'nvidia_driver_update_available', data

+        if item_type == 'coral_host':
+            variant = update.get('_coral_variant') or item.get('_coral_variant') or ''
+            if variant == 'pcie':
+                variant_label = 'gasket-dkms (PCIe / M.2) driver'
+                upgrade_reason = (
+                    'feranick/gasket-driver has published a newer release. '
+                    'The installer rebuilds the gasket + apex kernel modules '
+                    'via DKMS against the running kernel.'
+                )
+                reboot_note = (
+                    'Reinstalling rebuilds the DKMS module and requires a '
+                    'reboot to load the new driver.'
+                )
+            elif variant == 'usb':
+                pkg = update.get('_coral_pkg') or item.get('_coral_pkg') or 'libedgetpu1'
+                variant_label = f'{pkg} runtime (USB Accelerator)'
+                upgrade_reason = (
+                    'A newer Edge TPU runtime is available from the Google '
+                    'Coral apt repository.'
+                )
+                reboot_note = (
+                    'The USB runtime upgrade does not require a reboot.'
+                )
+            else:
+                variant_label = 'Coral TPU driver'
+                upgrade_reason = 'A newer Coral driver is available.'
+                reboot_note = ''
+            data = {
+                **common,
+                'variant_label': variant_label,
+                'upgrade_reason': upgrade_reason,
+                'reboot_note': reboot_note,
+            }
+            return 'coral_driver_update_available', data
+
        # Unknown type — don't notify (keeps the queue clean if a
        # future detector lands without a corresponding event mapping).
        return '', {}
@@ -627,9 +627,19 @@ class BurstAggregator:
            else:
                details = '\n'.join(detail_lines)
        
+        # The first event in the bucket was already sent individually on
+        # ingest (the "fast alert" path earlier in this file). The burst summary
+        # must therefore describe the *additional* events that arrived
+        # after that initial alert, otherwise the user receives both a
+        # "1 system problem" individual notification AND a "2 system
+        # problems" burst summary that double-counts the first event.
+        # `count` reports the additional count; `total_count` is exposed
+        # for templates that want to show "N more (X total in window)".
+        additional_count = max(len(events) - 1, 1)
        data = {
            'hostname': first.data.get('hostname') or _resolve_display_hostname(self._config),
-            'count': str(len(events)),
+            'count': str(additional_count),
+            'total_count': str(len(events)),
            'window': window_str,
            'entity_list': entity_list,
            'event_type': first.event_type,
@@ -1176,60 +1176,91 @@ TEMPLATES = {
        'group': 'updates',
        'default_enabled': True,
    },
+
+    # Sprint 14.7 follow-up: host-side Coral TPU driver. Mirrors the
+    # NVIDIA flow — there's no in-dashboard "Apply update" button; the
+    # operator reruns the installer from the post-install menu. The
+    # PCIe (gasket-dkms) and USB (libedgetpu1-*) variants share one
+    # template and use {variant_label} to surface which is moving so
+    # the body stays readable in either case.
+    'coral_driver_update_available': {
+        'title': '{hostname}: Coral TPU driver update available — {latest_version}',
+        'body': (
+            'A newer {variant_label} is available.\n'
+            '🔹 Currently installed: {current_version}\n'
+            '🟢 Latest available:    {latest_version}\n\n'
+            '{upgrade_reason}\n\n'
+            '💡 To reinstall:\n'
+            '  • From the ProxMenux post-install menu: {menu_label}\n\n'
+            '{reboot_note}'
+        ),
+        'label': 'Coral TPU driver update available',
+        'group': 'updates',
+        'default_enabled': True,
+    },
    
    # ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ──
    # These inherit enabled state from their parent event type at dispatch time.
+    #
+    # IMPORTANT — `{count}` here is the count of *additional* events that
+    # arrived AFTER the first one was already sent individually on the
+    # fast-alert path (see notification_manager.py:_create_summary). It is
+    # NOT the total event count in the window; that lives in `{total_count}`.
+    # The wording must reflect "more / additional" so the user does not
+    # mistake a 2-event burst for a duplicate of the initial individual
+    # notification. The first event has already been delivered when this
+    # summary fires.
    'burst_auth_fail': {
-        'title': '{hostname}: {count} auth failures in {window}',
-        'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}',
+        'title': '{hostname}: +{count} more auth failures in {window}',
+        'body': '+{count} additional authentication failures detected in {window} ({total_count} total).\nSources: {entity_list}',
        'label': 'Auth failures burst',
        'group': 'security',
        'default_enabled': True,
        'hidden': True,
    },
    'burst_ip_block': {
-        'title': '{hostname}: Fail2Ban banned {count} IPs in {window}',
-        'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}',
+        'title': '{hostname}: Fail2Ban banned +{count} more IPs in {window}',
+        'body': '+{count} additional IPs banned by Fail2Ban in {window} ({total_count} total).\nIPs: {entity_list}',
        'label': 'IP block burst',
        'group': 'security',
        'default_enabled': True,
        'hidden': True,
    },
    'burst_disk_io': {
-        'title': '{hostname}: {count} disk I/O errors on {entity_list}',
-        'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}',
+        'title': '{hostname}: +{count} more disk I/O errors on {entity_list}',
+        'body': '+{count} additional I/O errors detected in {window} ({total_count} total).\nDevices: {entity_list}',
        'label': 'Disk I/O burst',
        'group': 'storage',
        'default_enabled': True,
        'hidden': True,
    },
    'burst_cluster': {
-        'title': '{hostname}: Cluster flapping detected ({count} changes)',
-        'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}',
+        'title': '{hostname}: Cluster flapping detected (+{count} more changes)',
+        'body': 'Cluster state changed +{count} more times in {window} ({total_count} total).\nNodes: {entity_list}',
        'label': 'Cluster flapping burst',
        'group': 'cluster',
        'default_enabled': True,
        'hidden': True,
    },
    'burst_service_fail': {
-        'title': '{hostname}: {count} services failed in {window}',
-        'body': '{count} service failures detected in {window}.\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
+        'title': '{hostname}: +{count} more services failed in {window}',
+        'body': '+{count} additional service failures detected in {window} ({total_count} total).\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
        'label': 'Service fail burst',
        'group': 'services',
        'default_enabled': True,
        'hidden': True,
    },
    'burst_system': {
-        'title': '{hostname}: {count} system problems in {window}',
-        'body': '{count} system problems detected in {window}.\n\nAdditional issues:\n{details}',
+        'title': '{hostname}: +{count} more system problems in {window}',
+        'body': '+{count} additional system problems detected in {window} ({total_count} total).\n\nAdditional issues:\n{details}',
        'label': 'System problems burst',
        'group': 'services',
        'default_enabled': True,
        'hidden': True,
    },
    'burst_generic': {
-        'title': '{hostname}: {count} {event_type} events in {window}',
-        'body': '{count} events of type {event_type} in {window}.\n\nAdditional events:\n{details}',
+        'title': '{hostname}: +{count} more {event_type} events in {window}',
+        'body': '+{count} additional events of type {event_type} in {window} ({total_count} total).\n\nAdditional events:\n{details}',
        'label': 'Generic burst',
        'group': 'other',
        'default_enabled': True,
@@ -1559,6 +1590,7 @@ EVENT_EMOJI = {
    'post_install_update':  '✨',              # sparkles
    'secure_gateway_update_available': '\U0001F510',  # 🔐 closed lock with key
    'nvidia_driver_update_available':  '\U0001F3AE',  # 🎮 video game (GPU)
+    'coral_driver_update_available':   '\U0001F9E0',  # 🧠 brain (TPU/inference)
    # AI
    'ai_model_migrated':    '\U0001F504',         # arrows counterclockwise (refresh/update)
    # GPU / PCIe
@@ -83,7 +83,7 @@ PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
        "category": "disks"
    },
    {
-        "pattern": r"ata.*error|ATA.*bus.*error|Emask.*0x|DRDY.*ERR|UNC.*error",
+        "pattern": r"\bata\d.*\berror\b|\bATA\b.*bus.*error|Emask.*0x|DRDY.*ERR|\bUNC\b.*error",
        "cause": "ATA communication error with disk",
        "cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
        "severity": "warning",