Update Beta 1.2.1.2

This commit is contained in:
MacRimi
2026-05-20 19:47:42 +02:00
parent 4112323961
commit 298cd2c6d4
15 changed files with 781 additions and 109 deletions
+1 -1
View File
@@ -1 +1 @@
150694a49a5b0a4546a2bf5fedcc0914d37666d0cdeac1d9fdc58793c131b4bd ProxMenux-1.2.1.1-beta.AppImage
0d74347d2feae2be4b8c6d62d6cd9b1b15b94ef431c088b5580560f6b4751594 ProxMenux-1.2.1.2-beta.AppImage
+1 -1
View File
@@ -271,7 +271,7 @@ export function Login({ onLogin }: LoginProps) {
</form>
</div>
<p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.0</p>
<p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.1.2-beta</p>
</div>
</div>
)
+1 -1
View File
@@ -814,7 +814,7 @@ export function ProxmoxDashboard() {
</Tabs>
<footer className="mt-8 md:mt-12 pt-4 md:pt-6 border-t border-border text-center text-xs md:text-sm text-muted-foreground">
<p className="font-medium mb-2">ProxMenux Monitor v1.2.0</p>
<p className="font-medium mb-2">ProxMenux Monitor v1.2.1.2-beta</p>
<p>
<a
href="https://ko-fi.com/macrimi"
+25 -1
View File
@@ -6,7 +6,7 @@ import { Dialog, DialogContent, DialogTitle } from "./ui/dialog"
import { X, Sparkles, Thermometer, Activity, HardDrive, Shield, Globe, Cpu, Zap, Sliders, Wrench, RefreshCw, Server } from "lucide-react"
import { Checkbox } from "./ui/checkbox"
const APP_VERSION = "1.2.1.1-beta" // Sync with AppImage/package.json
const APP_VERSION = "1.2.1.2-beta" // Sync with AppImage/package.json
interface ReleaseNote {
date: string
@@ -18,6 +18,30 @@ interface ReleaseNote {
}
export const CHANGELOG: Record<string, ReleaseNote> = {
"1.2.1.2-beta": {
date: "May 20, 2026",
changes: {
added: [
"Coral TPU installer - Uninstall path mirroring the NVIDIA flow, and registry-driven update notifications for both the PCIe gasket-dkms driver (tracked against feranick/gasket-driver) and the USB libedgetpu1 runtime (tracked via apt)",
"Disk I/O severity tiers - Sliding 24h window classifies dmesg ATA/SCSI errors into silent (0-10), WARNING (11-100) and CRITICAL (100+ or any hard error like UNC / Buffer I/O / Sense Key Hardware Error), so quiet days stay quiet and a single Buffer I/O event still pages immediately",
"Quiet Hours buffering - Events suppressed during a channel's quiet window are now persisted to SQLite and released as a grouped summary when the window closes, instead of being silently dropped",
],
changed: [
"Burst aggregation wording - Burst summaries now report only the additional events that arrived after the initial individual alert, so the operator no longer sees the first event counted twice (\"+N more X in window\" instead of the old \"N X in window\" overlap)",
"Known-error classifier - Word-boundary regex on ATA/UNC patterns so kernel messages like nvidia_uvm:FatalError are no longer misclassified as ATA cable issues",
"Health journal context - Excludes proxmenux-monitor.service systemd lines so internal watchdog SIGKILLs no longer leak into the body of unrelated kernel events",
"Resolved notifications severity - The \"previous severity\" now matches the severity the user actually saw in the notification, not whatever escalated value silently landed in the DB during the 24h same-key cooldown",
"log2ram apply path - The auto/update flow now restarts log2ram after writing the new size, so a configured 512M actually takes effect on the running tmpfs (previously left at 128M until a manual restart)",
"VM/CT control errors - Failed start/stop/restart now surfaces the real pvesh stderr (e.g. \"no space left on device\") in the UI toast and fires a vm_fail / ct_fail notification, instead of a bare 500 INTERNAL SERVER ERROR",
"Mobile design of Quiet Hours / Daily Digest - Time inputs are now full-height with inline labels instead of the cramped grid layout that overflowed on narrow screens",
],
fixed: [
"ATA disk error not recorded - disk_observations is now written before the SMART gate, so transient errors that don't yet trip SMART still build the per-disk history",
"Quiet Hours toggle not persisting - get_settings now returns the per-channel quiet_*/digest_* fields so the toggle's state reloads correctly after a refresh",
"Frontend 401 cascade - Login screen no longer swallows the 401 forever after a brief stale-token state; the dedup flag is cleared on mount and on successful login",
],
},
},
"1.2.1.1-beta": {
date: "May 9, 2026",
changes: {
+1 -1
View File
@@ -3584,7 +3584,7 @@ ${observationsHtml}
<!-- Footer -->
<div class="rpt-footer">
<div>Report generated by ProxMenux Monitor</div>
<div>ProxMenux Monitor v1.2.0</div>
<div>ProxMenux Monitor v1.2.1.2-beta</div>
</div>
</body>
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "ProxMenux-Monitor",
"version": "1.2.1.1-beta",
"version": "1.2.1.2-beta",
"description": "Proxmox System Monitoring Dashboard",
"private": true,
"scripts": {
+11 -4
View File
@@ -1026,9 +1026,16 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str:
# line like "[HealthPersistence] Database initialized with 13 tables"
# leaks into the AI context because grep -iE 'ata' matches the
# substring "ata" in "dATAbase". Self-logs are never system evidence.
#
# Also exclude systemd actions on the proxmenux-monitor unit itself
# (e.g. "proxmenux-monitor.service: Killed process 2010621 with
# signal SIGKILL"). When a kernel event fires within the same
# 10-min window as one of our own watchdog kills, the SIGKILL
# line would otherwise leak into the journal_context and the AI
# would paste it under the unrelated event as "📝 Log: …".
cmd = (
f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | "
f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]' | "
f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]|proxmenux-monitor\\.service' | "
f"grep -iE '{pattern}' | tail -n 30"
)
@@ -10344,7 +10351,7 @@ def api_health():
return jsonify({
'status': 'healthy',
'timestamp': datetime.now().isoformat(),
'version': '1.2.1.1-beta'
'version': '1.2.1.2-beta'
})
# ─── User-configurable health thresholds ─────────────────────────────────────
@@ -10737,7 +10744,7 @@ def api_info():
"""Root endpoint with API information"""
return jsonify({
'name': 'ProxMenux Monitor API',
'version': '1.2.1.1-beta',
'version': '1.2.1.2-beta',
'endpoints': [
'/api/system',
'/api/system-info',
@@ -11387,7 +11394,7 @@ if __name__ == '__main__':
try:
import sqlite3
from pathlib import Path
MONITOR_VERSION = '1.2.1.1-beta'
MONITOR_VERSION = '1.2.1.2-beta'
db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
if db_path.exists():
conn = sqlite3.connect(str(db_path), timeout=10)
+249 -1
View File
@@ -156,6 +156,90 @@ def _detect_nvidia_xfree86() -> Optional[dict]:
}
# ── Coral TPU host driver (PCIe gasket-dkms + USB libedgetpu1) ──
#
# Two install paths share the same registry entry because the user
# thinks of them as one "Coral driver" install. The detector returns
# one entry per path that is actually present on the host, so a system
# with both M.2 and USB Coral devices gets two entries — independent
# update streams (gasket-dkms from feranick/gasket-driver on GitHub,
# libedgetpu1-std from Google's apt repo).
def _detect_coral_host() -> list[dict]:
out: list[dict] = []
# PCIe / M.2 — gasket-dkms package version, falling back to the
# registered DKMS version if the package was force-removed but the
# built modules still exist.
pcie_version: Optional[str] = None
try:
r = subprocess.run(
["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
capture_output=True, text=True, timeout=3,
)
if r.returncode == 0 and "ok installed" in r.stdout:
pcie_version = r.stdout.split("|", 1)[1].strip()
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
pass
if not pcie_version:
try:
r = subprocess.run(
["dkms", "status"], capture_output=True, text=True, timeout=3,
)
if r.returncode == 0:
for line in r.stdout.splitlines():
if line.startswith("gasket"):
# "gasket, 1.0, ..." or "gasket/1.0, ..."
m = re.match(r"^gasket[, /]([^,\s]+)", line)
if m:
pcie_version = m.group(1)
break
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
pass
if pcie_version:
out.append({
"id": "coral-host-pcie",
"type": "coral_host",
"name": "Coral TPU Driver (gasket-dkms)",
"current_version": pcie_version,
"menu_label": "GPU & TPU → Coral TPU",
"menu_script": "scripts/gpu_tpu/install_coral.sh",
"_coral_variant": "pcie",
})
# USB — libedgetpu1-std (default) or libedgetpu1-max if the user
# opted into the overclocked runtime. Either one means the USB
# path is installed.
usb_version: Optional[str] = None
usb_pkg: Optional[str] = None
for pkg in ("libedgetpu1-std", "libedgetpu1-max"):
try:
r = subprocess.run(
["dpkg-query", "-W", "-f=${Status}|${Version}", pkg],
capture_output=True, text=True, timeout=3,
)
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
continue
if r.returncode == 0 and "ok installed" in r.stdout:
usb_version = r.stdout.split("|", 1)[1].strip()
usb_pkg = pkg
break
if usb_version and usb_pkg:
out.append({
"id": "coral-host-usb",
"type": "coral_host",
"name": f"Coral TPU Runtime ({usb_pkg})",
"current_version": usb_version,
"menu_label": "GPU & TPU → Coral TPU",
"menu_script": "scripts/gpu_tpu/install_coral.sh",
"_coral_variant": "usb",
"_coral_pkg": usb_pkg,
})
return out
def _detect_oci_apps() -> list[dict]:
"""Bridge to the OCI manager so every OCI-installed app shows up
in the registry without a per-app detector here. The OCI manager
@@ -350,6 +434,7 @@ def _detect_lxc_containers() -> list[dict]:
# framework normalises both shapes.
# Zero-argument detector callables. Return shapes differ per detector:
# `_detect_nvidia_xfree86` is annotated Optional[dict], the others list[dict].
_DETECTORS: list[Callable[[], Any]] = [
_detect_nvidia_xfree86,
_detect_coral_host,  # up to two entries: "coral-host-pcie" and/or "coral-host-usb"
_detect_oci_apps,
_detect_lxc_containers,
]
@@ -834,9 +919,171 @@ def _check_lxc_updates(entry: dict) -> dict:
}
# ── Coral driver checker ──
#
# Two upstreams to track:
#
# PCIe (gasket-dkms) → feranick/gasket-driver on GitHub. The fork is
# actively maintained; releases are tagged like "v1.0-22". We pull
# the latest tag from the GitHub API and compare against the
# installed gasket-dkms Debian version. Because the Debian version
# string ("1.0-18") doesn't perfectly match the upstream tag
# ("v1.0-22"), we normalise both sides to the trailing "-N" build
# number for the comparison. Strict semver isn't workable here.
#
# USB (libedgetpu1-std/-max) → Google's apt repo. `apt-cache policy`
# reports installed + candidate versions in one shot, no internet
# round-trip required (apt's own cache is the canonical answer).
#
# Cache TTL for the GitHub call is 7 days — feranick's release cadence
# is roughly monthly, matching NVIDIA's pattern. The cache lives in
# memory so AppImage restarts refresh it for free.
# GitHub "owner/repo" slug queried for gasket-driver tags.
_CORAL_GASKET_REPO = "feranick/gasket-driver"
# Cache lifetime in seconds (7 days).
_CORAL_CACHE_TTL = 7 * 86400
# In-process cache for the tag lookup: "latest_tag" is the last tag name
# picked (None until a fetch succeeds), "fetched_at" is the time.time()
# value recorded when that tag was stored.
_coral_gasket_cache: dict[str, Any] = {"latest_tag": None, "fetched_at": 0}
def _coral_build_number(s: str) -> int:
"""Extract the trailing build number from a Coral version string.
Handles both upstream tag form (``v1.0-22``, ``1.0-22``) and the
Debian package form (``1.0-22``, ``1.0-18+pmx1``). Returns 0 if no
trailing ``-N`` segment exists — that pushes "no build number"
versions to the lowest rank so any tagged release shows as newer.
"""
if not s:
return 0
m = re.search(r"-(\d+)", s)
if not m:
return 0
try:
return int(m.group(1))
except (ValueError, TypeError):
return 0
def _fetch_gasket_latest_tag(force: bool = False) -> Optional[str]:
    """Return the gasket-driver tag with the highest build number.

    Args:
        force: When True, skip the in-memory cache and query GitHub.

    Returns:
        The chosen tag name. On a fetch error, or when the response is not a
        non-empty list, the previously cached tag is returned (None if there
        is none). The cache is only refreshed when a non-empty tag name was
        selected.
    """
    cache = _coral_gasket_cache
    now = time.time()

    cached_tag = cache["latest_tag"]
    still_fresh = now - cache["fetched_at"] < _CORAL_CACHE_TTL
    if cached_tag and still_fresh and not force:
        return cached_tag

    url = f"https://api.github.com/repos/{_CORAL_GASKET_REPO}/tags?per_page=5"
    try:
        request = urllib.request.Request(
            url,
            headers={
                "User-Agent": "ProxMenux-Monitor/1.0",
                "Accept": "application/vnd.github+json",
            },
        )
        with urllib.request.urlopen(request, timeout=15) as resp:
            raw = resp.read()
        payload = json.loads(raw.decode("utf-8", errors="replace"))
    except Exception as e:
        print(f"[ProxMenux] gasket-driver tag fetch failed: {e}")
        return cache.get("latest_tag")

    if not (isinstance(payload, list) and payload):
        return cache.get("latest_tag")

    # Rank by build number rather than list position. max() keeps the first
    # of several equal candidates, same as a strict ">" scan would.
    names = [(item.get("name") or "") for item in payload if isinstance(item, dict)]
    winner = max(names, key=_coral_build_number, default=None)

    if winner:
        cache["latest_tag"] = winner
        cache["fetched_at"] = now
    return winner
def _apt_cache_candidate(pkg: str) -> Optional[str]:
"""Return the candidate (newest available) version for ``pkg`` from
the local apt cache. Caller is responsible for the package existing —
a missing package returns None silently.
"""
try:
r = subprocess.run(
["apt-cache", "policy", pkg],
capture_output=True, text=True, timeout=5,
)
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
return None
if r.returncode != 0:
return None
for line in r.stdout.splitlines():
line = line.strip()
if line.startswith("Candidate:"):
cand = line.split(":", 1)[1].strip()
if cand and cand != "(none)":
return cand
return None
def _check_coral_host(entry: dict) -> dict:
    """Work out whether a Coral host entry has an update available.

    Dispatches on ``entry["_coral_variant"]``:

    * ``"pcie"`` compares the build number of the installed version with the
      build number of the latest gasket-driver tag.
    * ``"usb"`` compares the installed version with the apt candidate using
      ``dpkg --compare-versions``; if dpkg cannot be run, any difference
      between the two strings counts as an update.

    Returns:
        A dict with ``available``, ``latest``, ``last_check`` and ``error``.
        Successful checks also echo ``_coral_variant`` (and ``_coral_pkg``
        for the USB path).
    """
    variant = entry.get("_coral_variant") or ""
    installed = entry.get("current_version") or ""

    if variant == "pcie":
        tag = _fetch_gasket_latest_tag()
        if not tag:
            return {"available": False, "latest": None,
                    "last_check": _now_iso(),
                    "error": "could not fetch gasket-driver tags"}
        newer = _coral_build_number(tag) > _coral_build_number(installed)
        return {
            "available": newer,
            "latest": tag if newer else None,
            "last_check": _now_iso(),
            "error": None,
            "_coral_variant": "pcie",
        }

    if variant == "usb":
        pkg = entry.get("_coral_pkg") or "libedgetpu1-std"
        candidate = _apt_cache_candidate(pkg)
        if not candidate:
            return {"available": False, "latest": None,
                    "last_check": _now_iso(),
                    "error": f"apt-cache policy returned no candidate for {pkg}"}
        # dpkg does the version ordering; exit status 0 means
        # "installed is strictly older than candidate".
        try:
            verdict = subprocess.run(
                ["dpkg", "--compare-versions", installed, "lt", candidate],
                capture_output=True, timeout=3,
            )
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            # dpkg unavailable: fall back to plain string inequality.
            newer = candidate != installed
        else:
            newer = verdict.returncode == 0
        return {
            "available": newer,
            "latest": candidate if newer else None,
            "last_check": _now_iso(),
            "error": None,
            "_coral_variant": "usb",
            "_coral_pkg": pkg,
        }

    return {"available": False, "latest": None,
            "last_check": _now_iso(),
            "error": f"unknown coral variant: {variant}"}
# Update-checker dispatch table: each value takes a registry entry dict and
# returns an update-check result dict. The "coral_host" key equals the "type"
# field written by _detect_coral_host; presumably the other keys match their
# detectors' "type" values the same way (confirm against the caller).
_CHECKERS: dict[str, Callable[[dict], dict]] = {
"oci_app": _check_oci_app,
"nvidia_xfree86": _check_nvidia_xfree86,
"coral_host": _check_coral_host,
"lxc": _check_lxc_updates,
}
@@ -890,7 +1137,8 @@ def check_for_updates(force: bool = False) -> list[dict]:
# the LXC checker's counts dropped on the floor and the
# frontend badge couldn't render.
for extra_key in ("_packages", "_upgrade_kind", "_kernel",
"_kernel_note", "_count", "_security_count"):
"_kernel_note", "_count", "_security_count",
"_coral_variant", "_coral_pkg"):
if extra_key in result:
it["update_check"][extra_key] = result[extra_key]
+178 -51
View File
@@ -382,9 +382,40 @@ class JournalWatcher:
self._recent_events: Dict[str, float] = {}
self._dedup_window = 30 # seconds
# 24h anti-cascade for disk I/O + filesystem errors (keyed by device name)
# 24h anti-cascade for disk I/O + filesystem errors. The dict
# key includes a tier suffix (`sdh:warning`, `sdh:critical`)
# so a disk in WARNING cooldown can still escalate to CRITICAL
# within the same 24h if the rate accelerates.
self._disk_io_notified: Dict[str, float] = {}
self._DISK_IO_COOLDOWN = 86400 # 24 hours
# Sliding 24h window of ATA error timestamps per disk, used to
# decide notification severity tier. Don't blindly trust the
# SMART firmware self-report — the Google "Failure Trends"
# paper showed ~36% of failed drives gave no SMART warning.
# Rate-based escalation catches the dying drives that SMART
# would never flag until they were already bricked.
from collections import deque as _deque
self._disk_error_window: Dict[str, "_deque[float]"] = {}
self._DISK_ERROR_WINDOW_SECS = 86400 # 24h
# Tiers calibrated for homelab/SMB Proxmox usage:
# * 0-10/24h → transient noise (cable rattle, sleep/wake,
# PHY retrain). Silent observation only.
# * 11-100/24h → WARNING. Notify once per 24h.
# * >100/24h → CRITICAL. Active failure.
# Hard errors (Buffer I/O, UNC, medium error, unrecovered read)
# are CRITICAL on the FIRST occurrence regardless of count —
# those are uncorrectable data losses, not transient noise.
self._DISK_TIER_WARNING = 10
self._DISK_TIER_CRITICAL = 100
# Hard-error pattern: matches any of the kernel-reported
# signals that mean data was lost or could not be recovered.
self._DISK_HARD_ERR_RE = re.compile(
r'(Buffer I/O error|UNC\b|Medium Error|medium error'
r'|Unrecovered read error|unrecovered read error'
r'|Sense Key.*Hardware Error)',
re.IGNORECASE,
)
# Track when the last full backup job notification was sent
# so we can suppress per-guest "Starting Backup of VM ..." noise
@@ -1046,73 +1077,107 @@ class JournalWatcher:
else:
resolved = re.sub(r'\d+$', '', raw_device) if raw_device.startswith('sd') else raw_device
# ── ALWAYS persist the observation, regardless of SMART ──
# ── ALWAYS persist the observation, regardless of severity ──
# The disk_observation_contract is explicit (memory note
# disk-observation-contract): every kernel-surfaced disk
# error must be recorded in disk_observations *even when
# SMART reports PASSED*. Silent errors on a "healthy" disk
# are exactly the early-warning signal the modal histogram
# exists to surface ("324 connection errors on this disk").
# Previously this line lived AFTER a `return` gate keyed on
# smart_health != 'FAILED', so the 3162 ata8 errors on
# .1.10 (PASSED SMART) all dropped on the floor instead of
# accumulating in the per-disk audit history.
# error must be recorded in disk_observations. The modal
# histogram is the per-disk audit trail; it must reflect
# everything the kernel saw, even noise.
self._record_disk_io_observation(resolved, msg)
# ── Gate 1: only NOTIFY when SMART reports FAILED ──
# Observation is already saved above. We avoid spamming a
# CRITICAL notification for transient ATA/SCSI noise on
# otherwise-healthy disks — the modal histogram surfaces
# those without paging the user at 3 AM.
# ── Update sliding 24h rate window for this disk ──
now = time.time()
from collections import deque as _deque
window = self._disk_error_window.setdefault(resolved, _deque())
window.append(now)
cutoff = now - self._DISK_ERROR_WINDOW_SECS
while window and window[0] < cutoff:
window.popleft()
rate_24h = len(window)
# ── Decide severity tier ──
# * hard error (UNC, Buffer I/O, medium, unrecovered read)
# → CRITICAL on first occurrence, no count threshold.
# These are uncorrectable: data is gone.
# * SMART self-report FAILED → CRITICAL (firmware admits it).
# * rate_24h > _DISK_TIER_CRITICAL → CRITICAL (active failure
# even if SMART still says PASSED).
# * rate_24h > _DISK_TIER_WARNING → WARNING (suspicious,
# worth a heads-up).
# * Otherwise → silent observation only (transient noise).
is_hard_error = bool(self._DISK_HARD_ERR_RE.search(msg))
smart_health = self._quick_smart_health(resolved)
if smart_health != 'FAILED':
if is_hard_error or smart_health == 'FAILED' or rate_24h > self._DISK_TIER_CRITICAL:
tier = 'critical'
elif rate_24h > self._DISK_TIER_WARNING:
tier = 'warning'
else:
# Silent — observation already saved, that's enough.
return
# ── Gate 2: 24-hour dedup per device ──
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB
# entry, so we should refresh from DB to get the real state.
now = time.time()
# First check in-memory cache
last_notified = self._disk_io_notified.get(resolved, 0)
# ── 24h anti-cascade per (device, tier) ──
# Independent cooldown per tier so a disk that fires WARNING
# at noon can still escalate to CRITICAL the same day when
# the rate jumps past _DISK_TIER_CRITICAL — they're
# different keys.
cooldown_key = f'{resolved}:{tier}'
last_notified = self._disk_io_notified.get(cooldown_key, 0)
if now - last_notified < self._DISK_IO_COOLDOWN:
# In-memory says we already notified. But user might have dismissed
# the error, which clears the DB. Re-check DB to be sure.
db_ts = self._get_disk_io_cooldown_from_db(resolved)
# In-memory says cooldown active. Re-verify in DB in
# case the user dismissed (which clears the DB entry).
db_ts = self._get_disk_io_cooldown_from_db(cooldown_key)
if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN:
return # DB confirms cooldown is still active
# DB says cooldown was cleared (user dismissed) - proceed to notify
# Update in-memory cache
del self._disk_io_notified[resolved]
self._disk_io_notified[resolved] = now
self._save_disk_io_notified(resolved, now)
return
# Dismissed → DB cleared → proceed to notify and refresh state.
del self._disk_io_notified[cooldown_key]
self._disk_io_notified[cooldown_key] = now
self._save_disk_io_notified(cooldown_key, now)
# ── Build enriched notification ──
device_info = self._identify_block_device(resolved)
parts = []
parts.append(f'Disk /dev/{resolved}: I/O errors detected')
parts.append('SMART status: FAILED -- disk is failing')
if tier == 'critical':
if is_hard_error:
parts.append(f'Disk /dev/{resolved}: UNRECOVERABLE error detected')
elif smart_health == 'FAILED':
parts.append(f'Disk /dev/{resolved}: SMART reports FAILED')
else:
parts.append(
f'Disk /dev/{resolved}: high I/O error rate '
f'({rate_24h} errors in last 24h)'
)
else: # warning
parts.append(
f'Disk /dev/{resolved}: elevated I/O error rate '
f'({rate_24h} errors in last 24h)'
)
parts.append(f'SMART status: {smart_health}')
if device_info:
parts.append(f'Device: {device_info}')
else:
parts.append(f'Device: /dev/{resolved}')
# Translate the raw kernel error code
detail = self._translate_ata_error(msg)
if detail:
parts.append(f'Error detail: {detail}')
parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.')
if tier == 'critical':
parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.')
else:
parts.append(
f'Action: Monitor /dev/{resolved} closely. '
f'Plan a backup verification and replacement if rate grows.'
)
parts.append(f' Check details: smartctl -a /dev/{resolved}')
enriched = '\n'.join(parts)
dev_display = f'/dev/{resolved}'
# Capture journal context for AI enrichment.
# `raw_device` is the original ATA-port literal extracted by the regex
# (e.g. "ata8"). The previous code used a name `ata_port` that was
@@ -1123,12 +1188,15 @@ class JournalWatcher:
keywords=[resolved, raw_device, 'I/O error', 'exception', 'SMART'],
lines=30
)
self._emit('disk_io_error', 'CRITICAL', {
severity = 'CRITICAL' if tier == 'critical' else 'WARNING'
self._emit('disk_io_error', severity, {
'device': dev_display,
'reason': enriched,
'hostname': self._hostname,
'smart_status': 'FAILED',
'smart_status': smart_health,
'rate_24h': rate_24h,
'tier': tier,
'_journal_context': journal_ctx,
}, entity='disk', entity_id=resolved)
return
@@ -2229,6 +2297,17 @@ class PollingCollector:
self._notified_proxmenux_beta_version: str | None = None
# In-memory cache: error_key -> last notification timestamp
self._last_notified: Dict[str, float] = {}
# In-memory cache: error_key -> severity actually sent in the last
# notification. Decoupled from `_known_errors[k].severity` (which
# always reflects the most-recent DB row) so a recovery message
# quotes the same severity the user saw. Without this, an error
# that fired WARNING, silently escalated to CRITICAL during its
# 24h same-key cooldown, then resolved, would be reported as
# "previous severity: CRITICAL" — confusing the operator who only
# ever saw the WARNING. Not persisted across restarts: the
# post-restart first-poll guard (`_first_poll_done`) already
# suppresses spurious recoveries.
self._notified_severity: Dict[str, str] = {}
# Track known error keys + metadata so we can detect new ones AND emit recovery
# Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
self._known_errors: Dict[str, dict] = {}
@@ -2572,6 +2651,11 @@ class PollingCollector:
# Track that we notified
self._last_notified[error_key] = now
self._persist_last_notified(error_key, now)
# Snapshot the severity we actually delivered, so a future
# recovery message quotes the same value the user saw — not
# whatever silently-escalated severity ended up in the DB
# during the same-key 24h cooldown window.
self._notified_severity[error_key] = emit_severity
# ── Emit recovery notifications for errors that resolved ──
resolved_keys = set(self._known_errors.keys()) - set(current_keys.keys())
@@ -2674,24 +2758,32 @@ class PollingCollector:
else:
clean_reason = 'Condition resolved'
# `original_severity` must match what the user actually saw
# in the most-recent notification for this error, not the
# latest DB severity. See `_notified_severity` docstring at
# __init__ for the failure mode this avoids.
original_severity = self._notified_severity.get(
key, old_meta.get('severity', 'WARNING'),
)
data = {
'hostname': self._hostname,
'category': category,
'reason': clean_reason,
'error_key': key,
'severity': 'OK',
'original_severity': old_meta.get('severity', 'WARNING'),
'original_severity': original_severity,
'first_seen': first_seen,
'duration': duration,
'is_recovery': True,
}
self._queue.put(NotificationEvent(
'error_resolved', 'OK', data, source='health',
entity=entity, entity_id=eid or key,
))
self._last_notified.pop(key, None)
self._notified_severity.pop(key, None)
self._known_errors = current_keys
self._first_poll_done = True
@@ -3356,6 +3448,41 @@ class PollingCollector:
}
return 'nvidia_driver_update_available', data
if item_type == 'coral_host':
variant = update.get('_coral_variant') or item.get('_coral_variant') or ''
if variant == 'pcie':
variant_label = 'gasket-dkms (PCIe / M.2) driver'
upgrade_reason = (
'feranick/gasket-driver has published a newer release. '
'The installer rebuilds the gasket + apex kernel modules '
'via DKMS against the running kernel.'
)
reboot_note = (
'Reinstalling rebuilds the DKMS module and requires a '
'reboot to load the new driver.'
)
elif variant == 'usb':
pkg = update.get('_coral_pkg') or item.get('_coral_pkg') or 'libedgetpu1'
variant_label = f'{pkg} runtime (USB Accelerator)'
upgrade_reason = (
'A newer Edge TPU runtime is available from the Google '
'Coral apt repository.'
)
reboot_note = (
'The USB runtime upgrade does not require a reboot.'
)
else:
variant_label = 'Coral TPU driver'
upgrade_reason = 'A newer Coral driver is available.'
reboot_note = ''
data = {
**common,
'variant_label': variant_label,
'upgrade_reason': upgrade_reason,
'reboot_note': reboot_note,
}
return 'coral_driver_update_available', data
# Unknown type — don't notify (keeps the queue clean if a
# future detector lands without a corresponding event mapping).
return '', {}
+11 -1
View File
@@ -627,9 +627,19 @@ class BurstAggregator:
else:
details = '\n'.join(detail_lines)
# The first event in the bucket was already sent individually on
# ingest (see line 547 — "fast alert" path). The burst summary
# must therefore describe the *additional* events that arrived
# after that initial alert, otherwise the user receives both a
# "1 system problem" individual notification AND a "2 system
# problems" burst summary that double-counts the first event.
# `count` reports the additional count; `total_count` is exposed
# for templates that want to show "N more (X total in window)".
additional_count = max(len(events) - 1, 1)
data = {
'hostname': first.data.get('hostname') or _resolve_display_hostname(self._config),
'count': str(len(events)),
'count': str(additional_count),
'total_count': str(len(events)),
'window': window_str,
'entity_list': entity_list,
'event_type': first.event_type,
+46 -14
View File
@@ -1176,60 +1176,91 @@ TEMPLATES = {
'group': 'updates',
'default_enabled': True,
},
# Sprint 14.7 follow-up: host-side Coral TPU driver. Mirrors the
# NVIDIA flow — there's no in-dashboard "Apply update" button; the
# operator reruns the installer from the post-install menu. The
# PCIe (gasket-dkms) and USB (libedgetpu1-*) variants share one
# template and use {variant_label} to surface which is moving so
# the body stays readable in either case.
'coral_driver_update_available': {
'title': '{hostname}: Coral TPU driver update available — {latest_version}',
'body': (
'A newer {variant_label} is available.\n'
'🔹 Currently installed: {current_version}\n'
'🟢 Latest available: {latest_version}\n\n'
'{upgrade_reason}\n\n'
'💡 To reinstall:\n'
' • From the ProxMenux post-install menu: {menu_label}\n\n'
'{reboot_note}'
),
'label': 'Coral TPU driver update available',
'group': 'updates',
'default_enabled': True,
},
# ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ──
# These inherit enabled state from their parent event type at dispatch time.
#
# IMPORTANT — `{count}` here is the count of *additional* events that
# arrived AFTER the first one was already sent individually on the
# fast-alert path (see notification_manager.py:_create_summary). It is
# NOT the total event count in the window; that lives in `{total_count}`.
# The wording must reflect "more / additional" so the user does not
# mistake a 2-event burst for a duplicate of the initial individual
# notification. The first event has already been delivered when this
# summary fires.
'burst_auth_fail': {
'title': '{hostname}: {count} auth failures in {window}',
'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}',
'title': '{hostname}: +{count} more auth failures in {window}',
'body': '+{count} additional authentication failures detected in {window} ({total_count} total).\nSources: {entity_list}',
'label': 'Auth failures burst',
'group': 'security',
'default_enabled': True,
'hidden': True,
},
'burst_ip_block': {
'title': '{hostname}: Fail2Ban banned {count} IPs in {window}',
'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}',
'title': '{hostname}: Fail2Ban banned +{count} more IPs in {window}',
'body': '+{count} additional IPs banned by Fail2Ban in {window} ({total_count} total).\nIPs: {entity_list}',
'label': 'IP block burst',
'group': 'security',
'default_enabled': True,
'hidden': True,
},
'burst_disk_io': {
'title': '{hostname}: {count} disk I/O errors on {entity_list}',
'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}',
'title': '{hostname}: +{count} more disk I/O errors on {entity_list}',
'body': '+{count} additional I/O errors detected in {window} ({total_count} total).\nDevices: {entity_list}',
'label': 'Disk I/O burst',
'group': 'storage',
'default_enabled': True,
'hidden': True,
},
'burst_cluster': {
'title': '{hostname}: Cluster flapping detected ({count} changes)',
'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}',
'title': '{hostname}: Cluster flapping detected (+{count} more changes)',
'body': 'Cluster state changed +{count} more times in {window} ({total_count} total).\nNodes: {entity_list}',
'label': 'Cluster flapping burst',
'group': 'cluster',
'default_enabled': True,
'hidden': True,
},
'burst_service_fail': {
'title': '{hostname}: {count} services failed in {window}',
'body': '{count} service failures detected in {window}.\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
'title': '{hostname}: +{count} more services failed in {window}',
'body': '+{count} additional service failures detected in {window} ({total_count} total).\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
'label': 'Service fail burst',
'group': 'services',
'default_enabled': True,
'hidden': True,
},
'burst_system': {
'title': '{hostname}: {count} system problems in {window}',
'body': '{count} system problems detected in {window}.\n\nAdditional issues:\n{details}',
'title': '{hostname}: +{count} more system problems in {window}',
'body': '+{count} additional system problems detected in {window} ({total_count} total).\n\nAdditional issues:\n{details}',
'label': 'System problems burst',
'group': 'services',
'default_enabled': True,
'hidden': True,
},
'burst_generic': {
'title': '{hostname}: {count} {event_type} events in {window}',
'body': '{count} events of type {event_type} in {window}.\n\nAdditional events:\n{details}',
'title': '{hostname}: +{count} more {event_type} events in {window}',
'body': '+{count} additional events of type {event_type} in {window} ({total_count} total).\n\nAdditional events:\n{details}',
'label': 'Generic burst',
'group': 'other',
'default_enabled': True,
@@ -1559,6 +1590,7 @@ EVENT_EMOJI = {
'post_install_update': '', # sparkles
'secure_gateway_update_available': '\U0001F510', # 🔐 closed lock with key
'nvidia_driver_update_available': '\U0001F3AE', # 🎮 video game (GPU)
'coral_driver_update_available': '\U0001F9E0', # 🧠 brain (TPU/inference)
# AI
'ai_model_migrated': '\U0001F504', # arrows counterclockwise (refresh/update)
# GPU / PCIe
+1 -1
View File
@@ -83,7 +83,7 @@ PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
"category": "disks"
},
{
"pattern": r"ata.*error|ATA.*bus.*error|Emask.*0x|DRDY.*ERR|UNC.*error",
"pattern": r"\bata\d.*\berror\b|\bATA\b.*bus.*error|Emask.*0x|DRDY.*ERR|\bUNC\b.*error",
"cause": "ATA communication error with disk",
"cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
"severity": "warning",