Update Beta 1.2.1.2

This commit is contained in:
MacRimi
2026-05-20 19:47:42 +02:00
parent 4112323961
commit 298cd2c6d4
15 changed files with 781 additions and 109 deletions
+1 -1
View File
@@ -1 +1 @@
150694a49a5b0a4546a2bf5fedcc0914d37666d0cdeac1d9fdc58793c131b4bd ProxMenux-1.2.1.1-beta.AppImage 0d74347d2feae2be4b8c6d62d6cd9b1b15b94ef431c088b5580560f6b4751594 ProxMenux-1.2.1.2-beta.AppImage
+1 -1
View File
@@ -271,7 +271,7 @@ export function Login({ onLogin }: LoginProps) {
</form> </form>
</div> </div>
<p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.0</p> <p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.1.2-beta</p>
</div> </div>
</div> </div>
) )
+1 -1
View File
@@ -814,7 +814,7 @@ export function ProxmoxDashboard() {
</Tabs> </Tabs>
<footer className="mt-8 md:mt-12 pt-4 md:pt-6 border-t border-border text-center text-xs md:text-sm text-muted-foreground"> <footer className="mt-8 md:mt-12 pt-4 md:pt-6 border-t border-border text-center text-xs md:text-sm text-muted-foreground">
<p className="font-medium mb-2">ProxMenux Monitor v1.2.0</p> <p className="font-medium mb-2">ProxMenux Monitor v1.2.1.2-beta</p>
<p> <p>
<a <a
href="https://ko-fi.com/macrimi" href="https://ko-fi.com/macrimi"
+25 -1
View File
@@ -6,7 +6,7 @@ import { Dialog, DialogContent, DialogTitle } from "./ui/dialog"
import { X, Sparkles, Thermometer, Activity, HardDrive, Shield, Globe, Cpu, Zap, Sliders, Wrench, RefreshCw, Server } from "lucide-react" import { X, Sparkles, Thermometer, Activity, HardDrive, Shield, Globe, Cpu, Zap, Sliders, Wrench, RefreshCw, Server } from "lucide-react"
import { Checkbox } from "./ui/checkbox" import { Checkbox } from "./ui/checkbox"
const APP_VERSION = "1.2.1.1-beta" // Sync with AppImage/package.json const APP_VERSION = "1.2.1.2-beta" // Sync with AppImage/package.json
interface ReleaseNote { interface ReleaseNote {
date: string date: string
@@ -18,6 +18,30 @@ interface ReleaseNote {
} }
export const CHANGELOG: Record<string, ReleaseNote> = { export const CHANGELOG: Record<string, ReleaseNote> = {
"1.2.1.2-beta": {
date: "May 20, 2026",
changes: {
added: [
"Coral TPU installer - Uninstall path mirroring the NVIDIA flow, and registry-driven update notifications for both the PCIe gasket-dkms driver (tracked against feranick/gasket-driver) and the USB libedgetpu1 runtime (tracked via apt)",
"Disk I/O severity tiers - Sliding 24h window classifies dmesg ATA/SCSI errors into silent (0-10), WARNING (11-100) and CRITICAL (100+ or any hard error like UNC / Buffer I/O / Sense Key Hardware Error), so quiet days stay quiet and a single Buffer I/O event still pages immediately",
"Quiet Hours buffering - Events suppressed during a channel's quiet window are now persisted to SQLite and released as a grouped summary when the window closes, instead of being silently dropped",
],
changed: [
"Burst aggregation wording - Burst summaries now report only the additional events that arrived after the initial individual alert, so the operator no longer sees the first event counted twice (\"+N more X in window\" instead of the old \"N X in window\" overlap)",
"Known-error classifier - Word-boundary regex on ATA/UNC patterns so kernel messages like nvidia_uvm:FatalError are no longer misclassified as ATA cable issues",
"Health journal context - Excludes proxmenux-monitor.service systemd lines so internal watchdog SIGKILLs no longer leak into the body of unrelated kernel events",
"Resolved notifications severity - The \"previous severity\" now matches the severity the user actually saw in the notification, not whatever escalated value silently landed in the DB during the 24h same-key cooldown",
"log2ram apply path - The auto/update flow now restarts log2ram after writing the new size, so a configured 512M actually takes effect on the running tmpfs (previously left at 128M until a manual restart)",
"VM/CT control errors - Failed start/stop/restart now surfaces the real pvesh stderr (e.g. \"no space left on device\") in the UI toast and fires a vm_fail / ct_fail notification, instead of a bare 500 INTERNAL SERVER ERROR",
"Mobile design of Quiet Hours / Daily Digest - Time inputs are now full-height with inline labels instead of the cramped grid layout that overflowed on narrow screens",
],
fixed: [
"ATA disk error not recorded - disk_observations is now written before the SMART gate, so transient errors that don't yet trip SMART still build the per-disk history",
"Quiet Hours toggle not persisting - get_settings now returns the per-channel quiet_*/digest_* fields so the toggle's state reloads correctly after a refresh",
"Frontend 401 cascade - Login screen no longer swallows the 401 forever after a brief stale-token state; the dedup flag is cleared on mount and on successful login",
],
},
},
"1.2.1.1-beta": { "1.2.1.1-beta": {
date: "May 9, 2026", date: "May 9, 2026",
changes: { changes: {
+1 -1
View File
@@ -3584,7 +3584,7 @@ ${observationsHtml}
<!-- Footer --> <!-- Footer -->
<div class="rpt-footer"> <div class="rpt-footer">
<div>Report generated by ProxMenux Monitor</div> <div>Report generated by ProxMenux Monitor</div>
<div>ProxMenux Monitor v1.2.0</div> <div>ProxMenux Monitor v1.2.1.2-beta</div>
</div> </div>
</body> </body>
+1 -1
View File
@@ -1,6 +1,6 @@
{ {
"name": "ProxMenux-Monitor", "name": "ProxMenux-Monitor",
"version": "1.2.1.1-beta", "version": "1.2.1.2-beta",
"description": "Proxmox System Monitoring Dashboard", "description": "Proxmox System Monitoring Dashboard",
"private": true, "private": true,
"scripts": { "scripts": {
+11 -4
View File
@@ -1026,9 +1026,16 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str:
# line like "[HealthPersistence] Database initialized with 13 tables" # line like "[HealthPersistence] Database initialized with 13 tables"
# leaks into the AI context because grep -iE 'ata' matches the # leaks into the AI context because grep -iE 'ata' matches the
# substring "ata" in "dATAbase". Self-logs are never system evidence. # substring "ata" in "dATAbase". Self-logs are never system evidence.
#
# Also exclude systemd actions on the proxmenux-monitor unit itself
# (e.g. "proxmenux-monitor.service: Killed process 2010621 with
# signal SIGKILL"). When a kernel event fires within the same
# 10-min window as one of our own watchdog kills, the SIGKILL
# line would otherwise leak into the journal_context and the AI
# would paste it under the unrelated event as "📝 Log: …".
cmd = ( cmd = (
f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | " f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | "
f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]' | " f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]|proxmenux-monitor\\.service' | "
f"grep -iE '{pattern}' | tail -n 30" f"grep -iE '{pattern}' | tail -n 30"
) )
@@ -10344,7 +10351,7 @@ def api_health():
return jsonify({ return jsonify({
'status': 'healthy', 'status': 'healthy',
'timestamp': datetime.now().isoformat(), 'timestamp': datetime.now().isoformat(),
'version': '1.2.1.1-beta' 'version': '1.2.1.2-beta'
}) })
# ─── User-configurable health thresholds ───────────────────────────────────── # ─── User-configurable health thresholds ─────────────────────────────────────
@@ -10737,7 +10744,7 @@ def api_info():
"""Root endpoint with API information""" """Root endpoint with API information"""
return jsonify({ return jsonify({
'name': 'ProxMenux Monitor API', 'name': 'ProxMenux Monitor API',
'version': '1.2.1.1-beta', 'version': '1.2.1.2-beta',
'endpoints': [ 'endpoints': [
'/api/system', '/api/system',
'/api/system-info', '/api/system-info',
@@ -11387,7 +11394,7 @@ if __name__ == '__main__':
try: try:
import sqlite3 import sqlite3
from pathlib import Path from pathlib import Path
MONITOR_VERSION = '1.2.1.1-beta' MONITOR_VERSION = '1.2.1.2-beta'
db_path = Path('/usr/local/share/proxmenux/health_monitor.db') db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
if db_path.exists(): if db_path.exists():
conn = sqlite3.connect(str(db_path), timeout=10) conn = sqlite3.connect(str(db_path), timeout=10)
+249 -1
View File
@@ -156,6 +156,90 @@ def _detect_nvidia_xfree86() -> Optional[dict]:
} }
# ── Coral TPU host driver (PCIe gasket-dkms + USB libedgetpu1) ──
#
# Two install paths share the same registry type ("coral_host") because
# the user thinks of them as one "Coral driver" install. The detector
# returns one entry per path that is actually present on the host, so a
# system with both M.2 and USB Coral devices gets two entries with
# independent update streams (gasket-dkms from feranick/gasket-driver on
# GitHub, libedgetpu1-std or libedgetpu1-max from Google's apt repo).
def _detect_coral_host() -> list[dict]:
    """Detect host-side Coral TPU driver installs.

    Returns a list with zero, one or two registry entries of type
    ``coral_host``:

    * ``coral-host-pcie`` when the ``gasket-dkms`` package is installed,
      or, failing that, when ``dkms status`` still lists a ``gasket``
      module.
    * ``coral-host-usb`` when ``libedgetpu1-std`` or ``libedgetpu1-max``
      is installed (the first one found wins).

    Missing tools (``dpkg-query``, ``dkms``) and timeouts are treated as
    "not installed"; the function never raises for them.
    """

    def _installed_version(pkg: str) -> Optional[str]:
        # Version string reported by dpkg-query when ``pkg`` is in the
        # "ok installed" state; None when it is absent or the query fails.
        try:
            r = subprocess.run(
                ["dpkg-query", "-W", "-f=${Status}|${Version}", pkg],
                capture_output=True, text=True, timeout=3,
            )
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            return None
        if r.returncode == 0 and "ok installed" in r.stdout:
            # Output is "<status>|<version>"; partition() cannot raise
            # even if the separator were ever missing.
            return r.stdout.partition("|")[2].strip()
        return None

    out: list[dict] = []

    # PCIe / M.2 — gasket-dkms package version, falling back to the
    # registered DKMS version if the package was force-removed but the
    # built modules still exist.
    pcie_version: Optional[str] = _installed_version("gasket-dkms")
    if not pcie_version:
        try:
            r = subprocess.run(
                ["dkms", "status"], capture_output=True, text=True, timeout=3,
            )
            if r.returncode == 0:
                for line in r.stdout.splitlines():
                    # Accept both DKMS output styles:
                    #   "gasket, 1.0, <kernel>, <arch>: installed"
                    #   "gasket/1.0, <kernel>, <arch>: installed"
                    # The separator may be followed by whitespace (the
                    # comma form always is), and the version stops at
                    # the next comma, whitespace or colon so that
                    # "gasket/1.0: added" yields "1.0".
                    m = re.match(r"^gasket(?:[,/]\s*|\s+)([^,\s:]+)", line)
                    if m:
                        pcie_version = m.group(1)
                        break
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            pass

    if pcie_version:
        out.append({
            "id": "coral-host-pcie",
            "type": "coral_host",
            "name": "Coral TPU Driver (gasket-dkms)",
            "current_version": pcie_version,
            "menu_label": "GPU & TPU → Coral TPU",
            "menu_script": "scripts/gpu_tpu/install_coral.sh",
            "_coral_variant": "pcie",
        })

    # USB — libedgetpu1-std (default) or libedgetpu1-max if the user
    # opted into the overclocked runtime. Either one means the USB
    # path is installed.
    usb_version: Optional[str] = None
    usb_pkg: Optional[str] = None
    for pkg in ("libedgetpu1-std", "libedgetpu1-max"):
        found = _installed_version(pkg)
        if found is not None:
            usb_version = found
            usb_pkg = pkg
            break

    if usb_version and usb_pkg:
        out.append({
            "id": "coral-host-usb",
            "type": "coral_host",
            "name": f"Coral TPU Runtime ({usb_pkg})",
            "current_version": usb_version,
            "menu_label": "GPU & TPU → Coral TPU",
            "menu_script": "scripts/gpu_tpu/install_coral.sh",
            "_coral_variant": "usb",
            "_coral_pkg": usb_pkg,
        })

    return out
def _detect_oci_apps() -> list[dict]: def _detect_oci_apps() -> list[dict]:
"""Bridge to the OCI manager so every OCI-installed app shows up """Bridge to the OCI manager so every OCI-installed app shows up
in the registry without a per-app detector here. The OCI manager in the registry without a per-app detector here. The OCI manager
@@ -350,6 +434,7 @@ def _detect_lxc_containers() -> list[dict]:
# framework normalises both shapes. # framework normalises both shapes.
_DETECTORS: list[Callable[[], Any]] = [ _DETECTORS: list[Callable[[], Any]] = [
_detect_nvidia_xfree86, _detect_nvidia_xfree86,
_detect_coral_host,
_detect_oci_apps, _detect_oci_apps,
_detect_lxc_containers, _detect_lxc_containers,
] ]
@@ -834,9 +919,171 @@ def _check_lxc_updates(entry: dict) -> dict:
} }
# ── Coral driver checker ──
#
# Two upstreams to track:
#
# PCIe (gasket-dkms) → feranick/gasket-driver on GitHub. The fork is
# actively maintained; releases are tagged like "v1.0-22". We pull
# the latest tag from the GitHub API and compare against the
# installed gasket-dkms Debian version. Because the Debian version
# string ("1.0-18") doesn't perfectly match the upstream tag
# ("v1.0-22"), we normalise both sides to the trailing "-N" build
# number for the comparison. Strict semver isn't workable here.
#
# USB (libedgetpu1-std/-max) → Google's apt repo. `apt-cache policy`
# reports installed + candidate versions in one shot, no internet
# round-trip required (apt's own cache is the canonical answer).
#
# Cache TTL for the GitHub call is 7 days — feranick's release cadence
# is roughly monthly, matching NVIDIA's pattern. The cache lives in
# memory so AppImage restarts refresh it for free.
# GitHub "owner/repo" slug whose tags are polled for gasket-dkms releases.
_CORAL_GASKET_REPO = "feranick/gasket-driver"
# Lifetime of a cached tag lookup, in seconds (7 days).
_CORAL_CACHE_TTL = 7 * 86400
# Process-local cache: ``latest_tag`` holds the last tag name fetched and
# ``fetched_at`` the time.time() value when it was stored (0 = never).
_coral_gasket_cache: dict[str, Any] = {"latest_tag": None, "fetched_at": 0}
def _coral_build_number(s: str) -> int:
    """Return the build number embedded in a Coral version string.

    The build number is the first run of digits that directly follows
    a ``-``. This covers upstream tags (``v1.0-22``, ``1.0-22``) as
    well as Debian package versions (``1.0-22``, ``1.0-18+pmx1``).

    An empty value, or one without any ``-<digits>`` segment, yields 0,
    which ranks it below every tagged release.
    """
    found = re.search(r"-(\d+)", s) if s else None
    if found is None:
        return 0
    digits = found.group(1)
    try:
        number = int(digits)
    except (ValueError, TypeError):
        number = 0
    return number
def _fetch_gasket_latest_tag(force: bool = False) -> Optional[str]:
    """Return the gasket-driver tag with the highest build number.

    A cached tag younger than ``_CORAL_CACHE_TTL`` is returned without
    touching the network unless ``force`` is true. Otherwise the first
    five tags are requested from the GitHub API and the one whose
    ``_coral_build_number`` is largest is chosen (first one wins on a
    tie), since tag order is not relied upon.

    On a request/parse failure, or when the payload is not a non-empty
    list, the previously cached tag (possibly None) is returned and the
    cache is left untouched.
    """
    now = time.time()
    if not force:
        cached = _coral_gasket_cache["latest_tag"]
        if cached and now - _coral_gasket_cache["fetched_at"] < _CORAL_CACHE_TTL:
            return _coral_gasket_cache["latest_tag"]

    url = f"https://api.github.com/repos/{_CORAL_GASKET_REPO}/tags?per_page=5"
    request_headers = {
        "User-Agent": "ProxMenux-Monitor/1.0",
        "Accept": "application/vnd.github+json",
    }
    try:
        req = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(req, timeout=15) as resp:
            payload = resp.read().decode("utf-8", errors="replace")
            tags = json.loads(payload)
    except Exception as e:
        print(f"[ProxMenux] gasket-driver tag fetch failed: {e}")
        return _coral_gasket_cache.get("latest_tag")

    if not (isinstance(tags, list) and tags):
        return _coral_gasket_cache.get("latest_tag")

    # Only dict items carry a "name"; anything else in the list is ignored.
    names = [(item.get("name") or "") for item in tags if isinstance(item, dict)]
    # max() keeps the first of equally-ranked names.
    best: Optional[str] = max(names, key=_coral_build_number, default=None)

    if best:
        _coral_gasket_cache["latest_tag"] = best
        _coral_gasket_cache["fetched_at"] = now
    return best
def _apt_cache_candidate(pkg: str) -> Optional[str]:
    """Look up the apt candidate version of ``pkg`` in the local cache.

    Runs ``apt-cache policy <pkg>`` and returns the value of its
    ``Candidate:`` line. Returns None when the command is unavailable,
    times out, exits non-zero, or reports no candidate (``(none)`` or
    no ``Candidate:`` line at all).
    """
    try:
        proc = subprocess.run(
            ["apt-cache", "policy", pkg],
            capture_output=True, text=True, timeout=5,
        )
    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
        return None
    if proc.returncode != 0:
        return None

    stripped_lines = (raw.strip() for raw in proc.stdout.splitlines())
    # Split on the first colon only so versions with an epoch
    # ("1:16.0") keep their own colon.
    values = (
        text.split(":", 1)[1].strip()
        for text in stripped_lines
        if text.startswith("Candidate:")
    )
    return next((v for v in values if v and v != "(none)"), None)
def _check_coral_host(entry: dict) -> dict:
    """Update checker for ``coral_host`` registry entries.

    Dispatches on ``entry["_coral_variant"]``:

    * ``"pcie"`` — compares the build number of the installed version
      against the build number of the latest gasket-driver tag.
    * ``"usb"`` — compares the installed version against the apt
      candidate for ``entry["_coral_pkg"]`` (default
      ``libedgetpu1-std``).

    Always returns a dict with ``available``, ``latest``, ``last_check``
    and ``error``; successful checks also echo ``_coral_variant`` (and
    ``_coral_pkg`` for USB). ``latest`` is only set when an update is
    available.
    """

    def _outcome(available, latest, error, **extra):
        # Single place that fixes the result shape and timestamp.
        return {
            "available": available,
            "latest": latest,
            "last_check": _now_iso(),
            "error": error,
            **extra,
        }

    variant = entry.get("_coral_variant") or ""
    installed = entry.get("current_version") or ""

    if variant == "pcie":
        tag = _fetch_gasket_latest_tag()
        if not tag:
            return _outcome(False, None, "could not fetch gasket-driver tags")
        installed_build = _coral_build_number(installed)
        upstream_build = _coral_build_number(tag)
        newer = upstream_build > installed_build
        return _outcome(newer, tag if newer else None, None,
                        _coral_variant="pcie")

    if variant == "usb":
        pkg = entry.get("_coral_pkg") or "libedgetpu1-std"
        candidate = _apt_cache_candidate(pkg)
        if not candidate:
            return _outcome(
                False, None,
                f"apt-cache policy returned no candidate for {pkg}",
            )
        # Let dpkg decide whether the installed version sorts before
        # the candidate. Only if dpkg cannot be run at all do we fall
        # back to "any difference counts as an update".
        try:
            verdict = subprocess.run(
                ["dpkg", "--compare-versions", installed, "lt", candidate],
                capture_output=True, timeout=3,
            )
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            newer = candidate != installed
        else:
            newer = verdict.returncode == 0
        return _outcome(newer, candidate if newer else None, None,
                        _coral_variant="usb", _coral_pkg=pkg)

    return _outcome(False, None, f"unknown coral variant: {variant}")
_CHECKERS: dict[str, Callable[[dict], dict]] = { _CHECKERS: dict[str, Callable[[dict], dict]] = {
"oci_app": _check_oci_app, "oci_app": _check_oci_app,
"nvidia_xfree86": _check_nvidia_xfree86, "nvidia_xfree86": _check_nvidia_xfree86,
"coral_host": _check_coral_host,
"lxc": _check_lxc_updates, "lxc": _check_lxc_updates,
} }
@@ -890,7 +1137,8 @@ def check_for_updates(force: bool = False) -> list[dict]:
# the LXC checker's counts dropped on the floor and the # the LXC checker's counts dropped on the floor and the
# frontend badge couldn't render. # frontend badge couldn't render.
for extra_key in ("_packages", "_upgrade_kind", "_kernel", for extra_key in ("_packages", "_upgrade_kind", "_kernel",
"_kernel_note", "_count", "_security_count"): "_kernel_note", "_count", "_security_count",
"_coral_variant", "_coral_pkg"):
if extra_key in result: if extra_key in result:
it["update_check"][extra_key] = result[extra_key] it["update_check"][extra_key] = result[extra_key]
+178 -51
View File
@@ -382,9 +382,40 @@ class JournalWatcher:
self._recent_events: Dict[str, float] = {} self._recent_events: Dict[str, float] = {}
self._dedup_window = 30 # seconds self._dedup_window = 30 # seconds
# 24h anti-cascade for disk I/O + filesystem errors (keyed by device name) # 24h anti-cascade for disk I/O + filesystem errors. The dict
# key includes a tier suffix (`sdh:warning`, `sdh:critical`)
# so a disk in WARNING cooldown can still escalate to CRITICAL
# within the same 24h if the rate accelerates.
self._disk_io_notified: Dict[str, float] = {} self._disk_io_notified: Dict[str, float] = {}
self._DISK_IO_COOLDOWN = 86400 # 24 hours self._DISK_IO_COOLDOWN = 86400 # 24 hours
# Sliding 24h window of ATA error timestamps per disk, used to
# decide notification severity tier. Don't blindly trust the
# SMART firmware self-report — the Google "Failure Trends"
# paper showed ~36% of failed drives gave no SMART warning.
# Rate-based escalation catches the dying drives that SMART
# would never flag until they were already bricked.
from collections import deque as _deque
self._disk_error_window: Dict[str, "_deque[float]"] = {}
self._DISK_ERROR_WINDOW_SECS = 86400 # 24h
# Tiers calibrated for homelab/SMB Proxmox usage:
# * 0-10/24h → transient noise (cable rattle, sleep/wake,
# PHY retrain). Silent observation only.
# * 11-100/24h → WARNING. Notify once per 24h.
# * 100+/24h → CRITICAL. Active failure.
# Hard errors (Buffer I/O, UNC, medium error, unrecovered read)
# are CRITICAL on the FIRST occurrence regardless of count —
# those are uncorrectable data losses, not transient noise.
self._DISK_TIER_WARNING = 10
self._DISK_TIER_CRITICAL = 100
# Hard-error pattern: matches any of the kernel-reported
# signals that mean data was lost or could not be recovered.
self._DISK_HARD_ERR_RE = re.compile(
r'(Buffer I/O error|UNC\b|Medium Error|medium error'
r'|Unrecovered read error|unrecovered read error'
r'|Sense Key.*Hardware Error)',
re.IGNORECASE,
)
# Track when the last full backup job notification was sent # Track when the last full backup job notification was sent
# so we can suppress per-guest "Starting Backup of VM ..." noise # so we can suppress per-guest "Starting Backup of VM ..." noise
@@ -1046,73 +1077,107 @@ class JournalWatcher:
else: else:
resolved = re.sub(r'\d+$', '', raw_device) if raw_device.startswith('sd') else raw_device resolved = re.sub(r'\d+$', '', raw_device) if raw_device.startswith('sd') else raw_device
# ── ALWAYS persist the observation, regardless of SMART ── # ── ALWAYS persist the observation, regardless of severity ──
# The disk_observation_contract is explicit (memory note # The disk_observation_contract is explicit (memory note
# disk-observation-contract): every kernel-surfaced disk # disk-observation-contract): every kernel-surfaced disk
# error must be recorded in disk_observations *even when # error must be recorded in disk_observations. The modal
# SMART reports PASSED*. Silent errors on a "healthy" disk # histogram is the per-disk audit trail; it must reflect
# are exactly the early-warning signal the modal histogram # everything the kernel saw, even noise.
# exists to surface ("324 connection errors on this disk").
# Previously this line lived AFTER a `return` gate keyed on
# smart_health != 'FAILED', so the 3162 ata8 errors on
# .1.10 (PASSED SMART) all dropped on the floor instead of
# accumulating in the per-disk audit history.
self._record_disk_io_observation(resolved, msg) self._record_disk_io_observation(resolved, msg)
# ── Gate 1: only NOTIFY when SMART reports FAILED ── # ── Update sliding 24h rate window for this disk ──
# Observation is already saved above. We avoid spamming a now = time.time()
# CRITICAL notification for transient ATA/SCSI noise on from collections import deque as _deque
# otherwise-healthy disks — the modal histogram surfaces window = self._disk_error_window.setdefault(resolved, _deque())
# those without paging the user at 3 AM. window.append(now)
cutoff = now - self._DISK_ERROR_WINDOW_SECS
while window and window[0] < cutoff:
window.popleft()
rate_24h = len(window)
# ── Decide severity tier ──
# * hard error (UNC, Buffer I/O, medium, unrecovered read)
# → CRITICAL on first occurrence, no count threshold.
# These are uncorrectable: data is gone.
# * SMART self-report FAILED → CRITICAL (firmware admits it).
# * rate_24h > _DISK_TIER_CRITICAL → CRITICAL (active failure
# even if SMART still says PASSED).
# * rate_24h > _DISK_TIER_WARNING → WARNING (suspicious,
# worth a heads-up).
# * Otherwise → silent observation only (transient noise).
is_hard_error = bool(self._DISK_HARD_ERR_RE.search(msg))
smart_health = self._quick_smart_health(resolved) smart_health = self._quick_smart_health(resolved)
if smart_health != 'FAILED': if is_hard_error or smart_health == 'FAILED' or rate_24h > self._DISK_TIER_CRITICAL:
tier = 'critical'
elif rate_24h > self._DISK_TIER_WARNING:
tier = 'warning'
else:
# Silent — observation already saved, that's enough.
return return
# ── Gate 2: 24-hour dedup per device ── # ── 24h anti-cascade per (device, tier) ──
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns). # Independent cooldown per tier so a disk that fires WARNING
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB # at noon can still escalate to CRITICAL the same day when
# entry, so we should refresh from DB to get the real state. # the rate jumps past _DISK_TIER_CRITICAL — they're
now = time.time() # different keys.
cooldown_key = f'{resolved}:{tier}'
# First check in-memory cache last_notified = self._disk_io_notified.get(cooldown_key, 0)
last_notified = self._disk_io_notified.get(resolved, 0)
if now - last_notified < self._DISK_IO_COOLDOWN: if now - last_notified < self._DISK_IO_COOLDOWN:
# In-memory says we already notified. But user might have dismissed # In-memory says cooldown active. Re-verify in DB in
# the error, which clears the DB. Re-check DB to be sure. # case the user dismissed (which clears the DB entry).
db_ts = self._get_disk_io_cooldown_from_db(resolved) db_ts = self._get_disk_io_cooldown_from_db(cooldown_key)
if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN: if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN:
return # DB confirms cooldown is still active return
# DB says cooldown was cleared (user dismissed) - proceed to notify # Dismissed → DB cleared → proceed to notify and refresh state.
# Update in-memory cache del self._disk_io_notified[cooldown_key]
del self._disk_io_notified[resolved]
self._disk_io_notified[cooldown_key] = now
self._disk_io_notified[resolved] = now self._save_disk_io_notified(cooldown_key, now)
self._save_disk_io_notified(resolved, now)
# ── Build enriched notification ── # ── Build enriched notification ──
device_info = self._identify_block_device(resolved) device_info = self._identify_block_device(resolved)
parts = [] parts = []
parts.append(f'Disk /dev/{resolved}: I/O errors detected') if tier == 'critical':
parts.append('SMART status: FAILED -- disk is failing') if is_hard_error:
parts.append(f'Disk /dev/{resolved}: UNRECOVERABLE error detected')
elif smart_health == 'FAILED':
parts.append(f'Disk /dev/{resolved}: SMART reports FAILED')
else:
parts.append(
f'Disk /dev/{resolved}: high I/O error rate '
f'({rate_24h} errors in last 24h)'
)
else: # warning
parts.append(
f'Disk /dev/{resolved}: elevated I/O error rate '
f'({rate_24h} errors in last 24h)'
)
parts.append(f'SMART status: {smart_health}')
if device_info: if device_info:
parts.append(f'Device: {device_info}') parts.append(f'Device: {device_info}')
else: else:
parts.append(f'Device: /dev/{resolved}') parts.append(f'Device: /dev/{resolved}')
# Translate the raw kernel error code # Translate the raw kernel error code
detail = self._translate_ata_error(msg) detail = self._translate_ata_error(msg)
if detail: if detail:
parts.append(f'Error detail: {detail}') parts.append(f'Error detail: {detail}')
parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.') if tier == 'critical':
parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.')
else:
parts.append(
f'Action: Monitor /dev/{resolved} closely. '
f'Plan a backup verification and replacement if rate grows.'
)
parts.append(f' Check details: smartctl -a /dev/{resolved}') parts.append(f' Check details: smartctl -a /dev/{resolved}')
enriched = '\n'.join(parts) enriched = '\n'.join(parts)
dev_display = f'/dev/{resolved}' dev_display = f'/dev/{resolved}'
# Capture journal context for AI enrichment. # Capture journal context for AI enrichment.
# `raw_device` is the original ATA-port literal extracted by the regex # `raw_device` is the original ATA-port literal extracted by the regex
# (e.g. "ata8"). The previous code used a name `ata_port` that was # (e.g. "ata8"). The previous code used a name `ata_port` that was
@@ -1123,12 +1188,15 @@ class JournalWatcher:
keywords=[resolved, raw_device, 'I/O error', 'exception', 'SMART'], keywords=[resolved, raw_device, 'I/O error', 'exception', 'SMART'],
lines=30 lines=30
) )
self._emit('disk_io_error', 'CRITICAL', { severity = 'CRITICAL' if tier == 'critical' else 'WARNING'
self._emit('disk_io_error', severity, {
'device': dev_display, 'device': dev_display,
'reason': enriched, 'reason': enriched,
'hostname': self._hostname, 'hostname': self._hostname,
'smart_status': 'FAILED', 'smart_status': smart_health,
'rate_24h': rate_24h,
'tier': tier,
'_journal_context': journal_ctx, '_journal_context': journal_ctx,
}, entity='disk', entity_id=resolved) }, entity='disk', entity_id=resolved)
return return
@@ -2229,6 +2297,17 @@ class PollingCollector:
self._notified_proxmenux_beta_version: str | None = None self._notified_proxmenux_beta_version: str | None = None
# In-memory cache: error_key -> last notification timestamp # In-memory cache: error_key -> last notification timestamp
self._last_notified: Dict[str, float] = {} self._last_notified: Dict[str, float] = {}
# In-memory cache: error_key -> severity actually sent in the last
# notification. Decoupled from `_known_errors[k].severity` (which
# always reflects the most-recent DB row) so a recovery message
# quotes the same severity the user saw. Without this, an error
# that fired WARNING, silently escalated to CRITICAL during its
# 24h same-key cooldown, then resolved, would be reported as
# "previous severity: CRITICAL" — confusing the operator who only
# ever saw the WARNING. Not persisted across restarts: the
# post-restart first-poll guard (`_first_poll_done`) already
# suppresses spurious recoveries.
self._notified_severity: Dict[str, str] = {}
# Track known error keys + metadata so we can detect new ones AND emit recovery # Track known error keys + metadata so we can detect new ones AND emit recovery
# Dict[error_key, dict(category, severity, reason, first_seen, error_key)] # Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
self._known_errors: Dict[str, dict] = {} self._known_errors: Dict[str, dict] = {}
@@ -2572,6 +2651,11 @@ class PollingCollector:
# Track that we notified # Track that we notified
self._last_notified[error_key] = now self._last_notified[error_key] = now
self._persist_last_notified(error_key, now) self._persist_last_notified(error_key, now)
# Snapshot the severity we actually delivered, so a future
# recovery message quotes the same value the user saw — not
# whatever silently-escalated severity ended up in the DB
# during the same-key 24h cooldown window.
self._notified_severity[error_key] = emit_severity
# ── Emit recovery notifications for errors that resolved ── # ── Emit recovery notifications for errors that resolved ──
resolved_keys = set(self._known_errors.keys()) - set(current_keys.keys()) resolved_keys = set(self._known_errors.keys()) - set(current_keys.keys())
@@ -2674,24 +2758,32 @@ class PollingCollector:
else: else:
clean_reason = 'Condition resolved' clean_reason = 'Condition resolved'
# `original_severity` must match what the user actually saw
# in the most-recent notification for this error, not the
# latest DB severity. See `_notified_severity` docstring at
# __init__ for the failure mode this avoids.
original_severity = self._notified_severity.get(
key, old_meta.get('severity', 'WARNING'),
)
data = { data = {
'hostname': self._hostname, 'hostname': self._hostname,
'category': category, 'category': category,
'reason': clean_reason, 'reason': clean_reason,
'error_key': key, 'error_key': key,
'severity': 'OK', 'severity': 'OK',
'original_severity': old_meta.get('severity', 'WARNING'), 'original_severity': original_severity,
'first_seen': first_seen, 'first_seen': first_seen,
'duration': duration, 'duration': duration,
'is_recovery': True, 'is_recovery': True,
} }
self._queue.put(NotificationEvent( self._queue.put(NotificationEvent(
'error_resolved', 'OK', data, source='health', 'error_resolved', 'OK', data, source='health',
entity=entity, entity_id=eid or key, entity=entity, entity_id=eid or key,
)) ))
self._last_notified.pop(key, None) self._last_notified.pop(key, None)
self._notified_severity.pop(key, None)
self._known_errors = current_keys self._known_errors = current_keys
self._first_poll_done = True self._first_poll_done = True
@@ -3356,6 +3448,41 @@ class PollingCollector:
} }
return 'nvidia_driver_update_available', data return 'nvidia_driver_update_available', data
if item_type == 'coral_host':
variant = update.get('_coral_variant') or item.get('_coral_variant') or ''
if variant == 'pcie':
variant_label = 'gasket-dkms (PCIe / M.2) driver'
upgrade_reason = (
'feranick/gasket-driver has published a newer release. '
'The installer rebuilds the gasket + apex kernel modules '
'via DKMS against the running kernel.'
)
reboot_note = (
'Reinstalling rebuilds the DKMS module and requires a '
'reboot to load the new driver.'
)
elif variant == 'usb':
pkg = update.get('_coral_pkg') or item.get('_coral_pkg') or 'libedgetpu1'
variant_label = f'{pkg} runtime (USB Accelerator)'
upgrade_reason = (
'A newer Edge TPU runtime is available from the Google '
'Coral apt repository.'
)
reboot_note = (
'The USB runtime upgrade does not require a reboot.'
)
else:
variant_label = 'Coral TPU driver'
upgrade_reason = 'A newer Coral driver is available.'
reboot_note = ''
data = {
**common,
'variant_label': variant_label,
'upgrade_reason': upgrade_reason,
'reboot_note': reboot_note,
}
return 'coral_driver_update_available', data
# Unknown type — don't notify (keeps the queue clean if a # Unknown type — don't notify (keeps the queue clean if a
# future detector lands without a corresponding event mapping). # future detector lands without a corresponding event mapping).
return '', {} return '', {}
+11 -1
View File
@@ -627,9 +627,19 @@ class BurstAggregator:
else: else:
details = '\n'.join(detail_lines) details = '\n'.join(detail_lines)
# The first event in the bucket was already sent individually on
# ingest (see line 547 — "fast alert" path). The burst summary
# must therefore describe the *additional* events that arrived
# after that initial alert, otherwise the user receives both a
# "1 system problem" individual notification AND a "2 system
# problems" burst summary that double-counts the first event.
# `count` reports the additional count; `total_count` is exposed
# for templates that want to show "N more (X total in window)".
additional_count = max(len(events) - 1, 1)
data = { data = {
'hostname': first.data.get('hostname') or _resolve_display_hostname(self._config), 'hostname': first.data.get('hostname') or _resolve_display_hostname(self._config),
'count': str(len(events)), 'count': str(additional_count),
'total_count': str(len(events)),
'window': window_str, 'window': window_str,
'entity_list': entity_list, 'entity_list': entity_list,
'event_type': first.event_type, 'event_type': first.event_type,
+46 -14
View File
@@ -1176,60 +1176,91 @@ TEMPLATES = {
'group': 'updates', 'group': 'updates',
'default_enabled': True, 'default_enabled': True,
}, },
# Sprint 14.7 follow-up: host-side Coral TPU driver. Mirrors the
# NVIDIA flow — there's no in-dashboard "Apply update" button; the
# operator reruns the installer from the post-install menu. The
# PCIe (gasket-dkms) and USB (libedgetpu1-*) variants share one
# template and use {variant_label} to surface which is moving so
# the body stays readable in either case.
'coral_driver_update_available': {
'title': '{hostname}: Coral TPU driver update available — {latest_version}',
'body': (
'A newer {variant_label} is available.\n'
'🔹 Currently installed: {current_version}\n'
'🟢 Latest available: {latest_version}\n\n'
'{upgrade_reason}\n\n'
'💡 To reinstall:\n'
' • From the ProxMenux post-install menu: {menu_label}\n\n'
'{reboot_note}'
),
'label': 'Coral TPU driver update available',
'group': 'updates',
'default_enabled': True,
},
# ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ── # ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ──
# These inherit enabled state from their parent event type at dispatch time. # These inherit enabled state from their parent event type at dispatch time.
#
# IMPORTANT — `{count}` here is the count of *additional* events that
# arrived AFTER the first one was already sent individually on the
# fast-alert path (see notification_manager.py:_create_summary). It is
# NOT the total event count in the window; that lives in `{total_count}`.
# The wording must reflect "more / additional" so the user does not
# mistake a 2-event burst for a duplicate of the initial individual
# notification. The first event has already been delivered when this
# summary fires.
'burst_auth_fail': { 'burst_auth_fail': {
'title': '{hostname}: {count} auth failures in {window}', 'title': '{hostname}: +{count} more auth failures in {window}',
'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}', 'body': '+{count} additional authentication failures detected in {window} ({total_count} total).\nSources: {entity_list}',
'label': 'Auth failures burst', 'label': 'Auth failures burst',
'group': 'security', 'group': 'security',
'default_enabled': True, 'default_enabled': True,
'hidden': True, 'hidden': True,
}, },
'burst_ip_block': { 'burst_ip_block': {
'title': '{hostname}: Fail2Ban banned {count} IPs in {window}', 'title': '{hostname}: Fail2Ban banned +{count} more IPs in {window}',
'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}', 'body': '+{count} additional IPs banned by Fail2Ban in {window} ({total_count} total).\nIPs: {entity_list}',
'label': 'IP block burst', 'label': 'IP block burst',
'group': 'security', 'group': 'security',
'default_enabled': True, 'default_enabled': True,
'hidden': True, 'hidden': True,
}, },
'burst_disk_io': { 'burst_disk_io': {
'title': '{hostname}: {count} disk I/O errors on {entity_list}', 'title': '{hostname}: +{count} more disk I/O errors on {entity_list}',
'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}', 'body': '+{count} additional I/O errors detected in {window} ({total_count} total).\nDevices: {entity_list}',
'label': 'Disk I/O burst', 'label': 'Disk I/O burst',
'group': 'storage', 'group': 'storage',
'default_enabled': True, 'default_enabled': True,
'hidden': True, 'hidden': True,
}, },
'burst_cluster': { 'burst_cluster': {
'title': '{hostname}: Cluster flapping detected ({count} changes)', 'title': '{hostname}: Cluster flapping detected (+{count} more changes)',
'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}', 'body': 'Cluster state changed +{count} more times in {window} ({total_count} total).\nNodes: {entity_list}',
'label': 'Cluster flapping burst', 'label': 'Cluster flapping burst',
'group': 'cluster', 'group': 'cluster',
'default_enabled': True, 'default_enabled': True,
'hidden': True, 'hidden': True,
}, },
'burst_service_fail': { 'burst_service_fail': {
'title': '{hostname}: {count} services failed in {window}', 'title': '{hostname}: +{count} more services failed in {window}',
'body': '{count} service failures detected in {window}.\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}', 'body': '+{count} additional service failures detected in {window} ({total_count} total).\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
'label': 'Service fail burst', 'label': 'Service fail burst',
'group': 'services', 'group': 'services',
'default_enabled': True, 'default_enabled': True,
'hidden': True, 'hidden': True,
}, },
'burst_system': { 'burst_system': {
'title': '{hostname}: {count} system problems in {window}', 'title': '{hostname}: +{count} more system problems in {window}',
'body': '{count} system problems detected in {window}.\n\nAdditional issues:\n{details}', 'body': '+{count} additional system problems detected in {window} ({total_count} total).\n\nAdditional issues:\n{details}',
'label': 'System problems burst', 'label': 'System problems burst',
'group': 'services', 'group': 'services',
'default_enabled': True, 'default_enabled': True,
'hidden': True, 'hidden': True,
}, },
'burst_generic': { 'burst_generic': {
'title': '{hostname}: {count} {event_type} events in {window}', 'title': '{hostname}: +{count} more {event_type} events in {window}',
'body': '{count} events of type {event_type} in {window}.\n\nAdditional events:\n{details}', 'body': '+{count} additional events of type {event_type} in {window} ({total_count} total).\n\nAdditional events:\n{details}',
'label': 'Generic burst', 'label': 'Generic burst',
'group': 'other', 'group': 'other',
'default_enabled': True, 'default_enabled': True,
@@ -1559,6 +1590,7 @@ EVENT_EMOJI = {
'post_install_update': '', # sparkles 'post_install_update': '', # sparkles
'secure_gateway_update_available': '\U0001F510', # 🔐 closed lock with key 'secure_gateway_update_available': '\U0001F510', # 🔐 closed lock with key
'nvidia_driver_update_available': '\U0001F3AE', # 🎮 video game (GPU) 'nvidia_driver_update_available': '\U0001F3AE', # 🎮 video game (GPU)
'coral_driver_update_available': '\U0001F9E0', # 🧠 brain (TPU/inference)
# AI # AI
'ai_model_migrated': '\U0001F504', # arrows counterclockwise (refresh/update) 'ai_model_migrated': '\U0001F504', # arrows counterclockwise (refresh/update)
# GPU / PCIe # GPU / PCIe
+1 -1
View File
@@ -83,7 +83,7 @@ PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
"category": "disks" "category": "disks"
}, },
{ {
"pattern": r"ata.*error|ATA.*bus.*error|Emask.*0x|DRDY.*ERR|UNC.*error", "pattern": r"\bata\d.*\berror\b|\bATA\b.*bus.*error|Emask.*0x|DRDY.*ERR|\bUNC\b.*error",
"cause": "ATA communication error with disk", "cause": "ATA communication error with disk",
"cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.", "cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
"severity": "warning", "severity": "warning",
+1 -1
View File
@@ -1 +1 @@
1.2.1.1 1.2.1.2
+254 -30
View File
@@ -430,6 +430,181 @@ EOF
# ============================================================ # ============================================================
# Final prompt # Final prompt
# ============================================================ # ============================================================
# ============================================================
# Install-state detection (Coral PCIe gasket DKMS / USB libedgetpu)
# ============================================================
# Sets the following globals so main() can branch into install vs
# uninstall like nvidia_installer.sh does. We treat "installed" as
# loosely as possible — even a half-installed DKMS or a stale
# libedgetpu1-std package counts, because the uninstall path needs
# to clean those up too.
# "true" when detect_coral_install_state finds any trace of the PCIe / M.2
# gasket driver (DKMS entry, gasket-dkms package, or /dev/apex_* nodes).
CORAL_PCIE_INSTALLED=false
# "true" when libedgetpu1-std or libedgetpu1-max is reported as installed.
CORAL_USB_INSTALLED=false
# gasket version parsed from `dkms status`; stays empty when the PCIe driver
# was only detected through dpkg or the device nodes.
CORAL_PCIE_DKMS_VERSION=""
# Package version of the installed libedgetpu1 runtime; empty when none.
CORAL_USB_RUNTIME_VERSION=""
detect_coral_install_state() {
    # Probe the host for an existing Coral driver install and publish the
    # result through four globals:
    #   CORAL_PCIE_INSTALLED       true/false
    #   CORAL_USB_INSTALLED        true/false
    #   CORAL_PCIE_DKMS_VERSION    gasket version from `dkms status` (may be empty)
    #   CORAL_USB_RUNTIME_VERSION  libedgetpu1-* package version (may be empty)
    # Takes no arguments and always resets the globals first, so it can be
    # called repeatedly.
    CORAL_PCIE_INSTALLED=false
    CORAL_USB_INSTALLED=false
    CORAL_PCIE_DKMS_VERSION=""
    CORAL_USB_RUNTIME_VERSION=""

    # PCIe / M.2 path: any of these means gasket is installed.
    #   * `dkms status` lists a gasket entry
    #   * dpkg reports gasket-dkms as installed
    #   * /dev/apex_* nodes exist (modules loaded right now)
    if command -v dkms >/dev/null 2>&1; then
        local dkms_line
        dkms_line=$(dkms status 2>/dev/null | grep -E '^gasket' | head -n1)
        if [[ -n "$dkms_line" ]]; then
            CORAL_PCIE_INSTALLED=true
            # `dkms status` formats vary across releases and module states:
            #   "gasket, 1.0, 6.8.12-1-pve, x86_64: installed"
            #   "gasket/1.0, 6.8.12-1-pve, x86_64: installed"
            #   "gasket/1.0: added"      (registered but never built)
            # The capture stops at ',', ':' or a space so the "added" form
            # yields "1.0" instead of "1.0:added".
            CORAL_PCIE_DKMS_VERSION=$(echo "$dkms_line" \
                | sed -E 's|^gasket[, /]+([^,: ]+).*|\1|' | tr -d ' ')
        fi
    fi

    if ! $CORAL_PCIE_INSTALLED \
        && dpkg-query -W -f='${Status}' gasket-dkms 2>/dev/null \
            | grep -q 'ok installed'; then
        CORAL_PCIE_INSTALLED=true
    fi

    if ! $CORAL_PCIE_INSTALLED && ls /dev/apex_* >/dev/null 2>&1; then
        CORAL_PCIE_INSTALLED=true
    fi

    # USB path: the Edge TPU runtime ships as libedgetpu1-std or the -max
    # variant. The first one found installed wins (std is checked first).
    local runtime_pkg
    for runtime_pkg in libedgetpu1-std libedgetpu1-max; do
        if dpkg-query -W -f='${Status}' "$runtime_pkg" 2>/dev/null \
            | grep -q 'ok installed'; then
            CORAL_USB_INSTALLED=true
            CORAL_USB_RUNTIME_VERSION=$(dpkg-query -W -f='${Version}' \
                "$runtime_pkg" 2>/dev/null)
            break
        fi
    done
}
# ============================================================
# Action menu (install vs uninstall) — only shown when something
# is already installed. Mirrors nvidia_installer.sh::
# show_action_menu_if_installed so the UX is consistent across
# host driver scripts.
# ============================================================
show_coral_action_menu_if_installed() {
    # Sets the global ACTION to "install", "remove" or "cancel".
    # Relies on CORAL_*_INSTALLED / CORAL_*_VERSION having been filled in
    # beforehand. On a host with nothing installed no menu is shown and
    # ACTION becomes "install" straight away.
    if ! { $CORAL_PCIE_INSTALLED || $CORAL_USB_INSTALLED; }; then
        ACTION="install"
        return 0
    fi

    # One "\n"-terminated line per detected component; the version is
    # appended in parentheses only when it is known.
    local installed_summary=""
    if $CORAL_PCIE_INSTALLED; then
        installed_summary="${installed_summary}$(translate 'PCIe/M.2 gasket-dkms')${CORAL_PCIE_DKMS_VERSION:+ ($CORAL_PCIE_DKMS_VERSION)}\n"
    fi
    if $CORAL_USB_INSTALLED; then
        installed_summary="${installed_summary}$(translate 'USB libedgetpu1')${CORAL_USB_RUNTIME_VERSION:+ ($CORAL_USB_RUNTIME_VERSION)}\n"
    fi

    # tag / label pairs shared by both menu front-ends.
    local -a entries=(
        "install" "$(translate 'Reinstall / update Coral drivers')"
        "remove" "$(translate 'Uninstall Coral drivers and configuration')"
    )

    # Prefer the project's hybrid_menu helper when it is available.
    if command -v hybrid_menu >/dev/null 2>&1; then
        ACTION=$(hybrid_menu "ProxMenux" \
            "$(translate 'Coral TPU is already installed on this host:')\n\n${installed_summary}\n$(translate 'Choose an action:')" \
            18 80 8 "${entries[@]}") || ACTION="cancel"
        return
    fi

    # Fallback: plain dialog. The fd shuffle routes the selected tag to
    # stdout so the command substitution can capture it.
    ACTION=$(dialog --backtitle "ProxMenux" \
        --title "$(translate 'Coral Actions')" \
        --menu "\n$(translate 'Coral TPU is already installed:')\n${installed_summary}\n$(translate 'Choose an action:')" \
        18 80 8 \
        "${entries[@]}" \
        3>&1 1>&2 2>&3) || ACTION="cancel"
}
# ============================================================
# complete_coral_uninstall — full removal of everything the
# installer puts on the host. Mirrors complete_nvidia_uninstall.
# Idempotent: missing pieces are no-ops, never errors.
# ============================================================
complete_coral_uninstall() {
    # Remove everything the Coral installer puts on the host: kernel
    # modules, DKMS registrations, packages, udev rules, the apex group
    # and the Coral APT source/keyring files.
    # Takes no arguments. Command output is appended to "$LOG_FILE", which
    # the caller must have set. Every step is best-effort: a missing piece
    # is skipped rather than treated as an error.
    msg_info "$(translate 'Stopping Coral kernel modules...')"
    modprobe -r apex 2>>"$LOG_FILE" || true
    modprobe -r gasket 2>>"$LOG_FILE" || true
    msg_ok "$(translate 'Coral kernel modules unloaded.')"

    # DKMS removal for every registered gasket version.
    if command -v dkms >/dev/null 2>&1; then
        local versions
        # ':' is part of the field separator so that an entry which is only
        # registered ("gasket/1.0: added" or "gasket, 1.0: added") yields
        # "1.0" rather than "1.0:"; with the colon attached `dkms remove`
        # would not find the version and the entry would be left behind.
        versions=$(dkms status 2>/dev/null \
            | awk -F'[,/: ]+' '/^gasket/ {print $2}' | sort -u)
        if [[ -n "$versions" ]]; then
            msg_info "$(translate 'Removing gasket DKMS modules...')"
            local v
            while IFS= read -r v; do
                [[ -z "$v" ]] && continue
                dkms remove -m gasket -v "$v" --all >>"$LOG_FILE" 2>&1 || true
            done <<<"$versions"
            msg_ok "$(translate 'gasket DKMS entries removed.')"
        fi
    fi

    msg_info "$(translate 'Removing Coral packages...')"
    apt-get -y purge gasket-dkms libedgetpu1-std libedgetpu1-max \
        >>"$LOG_FILE" 2>&1 || true
    apt-get -y autoremove --purge >>"$LOG_FILE" 2>&1 || true
    msg_ok "$(translate 'Coral packages purged.')"

    # udev rules created by our installer.
    rm -f /etc/udev/rules.d/99-coral-apex.rules

    # Restore the upstream udev rule group (set it back to its default
    # GROUP="plugdev") in case gasket-dkms is reinstalled later, when the
    # apex group may no longer exist.
    if [[ -f /usr/lib/udev/rules.d/60-gasket-dkms.rules ]]; then
        sed -i 's/GROUP="apex"/GROUP="plugdev"/g' \
            /usr/lib/udev/rules.d/60-gasket-dkms.rules || true
    fi

    # Guarded like every other step so a udev hiccup cannot abort the
    # uninstall half-way through.
    udevadm control --reload-rules >>"$LOG_FILE" 2>&1 || true
    udevadm trigger --subsystem-match=apex >/dev/null 2>&1 || true

    # Apex system group: only remove if no one else is using it.
    if getent group apex >/dev/null 2>&1; then
        local apex_members
        # Field 4 of the group entry is the comma-separated member list.
        apex_members=$(getent group apex | cut -d: -f4)
        if [[ -z "$apex_members" ]]; then
            groupdel apex >>"$LOG_FILE" 2>&1 || true
            msg_ok "$(translate 'apex group removed.')"
        else
            msg_warn "$(translate 'apex group still has members; left in place:') $apex_members"
        fi
    fi

    # Google Coral APT repo + keyring (only added during USB install).
    rm -f /etc/apt/sources.list.d/coral-edgetpu.list \
        /etc/apt/sources.list.d/coral-cloud.list \
        /usr/share/keyrings/coral-edgetpu-archive-keyring.gpg \
        /etc/apt/trusted.gpg.d/coral-edgetpu-archive-keyring.gpg \
        2>/dev/null || true

    # Update component status if utils.sh exposes the helper (older
    # ProxMenux releases didn't have it; uninstall must still work).
    if declare -f update_component_status >/dev/null 2>&1; then
        update_component_status "coral_driver" "removed" "" "gpu" '{}'
    fi

    msg_ok "$(translate 'Coral uninstallation completed.')"
}
restart_prompt() { restart_prompt() {
if whiptail --title "$(translate 'Coral TPU Installation')" --yesno \ if whiptail --title "$(translate 'Coral TPU Installation')" --yesno \
"$(translate 'The installation requires a server restart to apply changes. Do you want to restart now?')" 10 70; then "$(translate 'The installation requires a server restart to apply changes. Do you want to restart now?')" 10 70; then
@@ -449,46 +624,95 @@ main() {
: >"$LOG_FILE" : >"$LOG_FILE"
detect_coral_hardware detect_coral_hardware
detect_coral_install_state
# Nothing plugged in — nothing to do. # No hardware AND no leftover install → nothing to do.
if [[ "$CORAL_PCIE_COUNT" -eq 0 && "$CORAL_USB_COUNT" -eq 0 ]]; then if [[ "$CORAL_PCIE_COUNT" -eq 0 && "$CORAL_USB_COUNT" -eq 0 ]] \
&& ! $CORAL_PCIE_INSTALLED && ! $CORAL_USB_INSTALLED; then
no_hardware_dialog no_hardware_dialog
exit 0 exit 0
fi fi
pre_install_prompt # If something is already installed, offer reinstall/uninstall choice.
# Same UX as nvidia_installer.sh. When nothing is installed yet,
# ACTION="install" automatically.
show_coral_action_menu_if_installed
show_proxmenux_logo case "$ACTION" in
msg_title "$(translate 'Coral TPU Installation')" install)
# No hardware but user picked install → bail out, can't install
# for nothing. (The earlier "no hardware AND no install" exit
# already handles the fully-empty case.)
if [[ "$CORAL_PCIE_COUNT" -eq 0 && "$CORAL_USB_COUNT" -eq 0 ]]; then
no_hardware_dialog
exit 0
fi
# Force non-interactive apt/dpkg for the whole run so cleanup_broken_gasket_dkms pre_install_prompt
# and the two install paths never get blocked by package-maintainer prompts.
export DEBIAN_FRONTEND=noninteractive
# Branch 1 — PCIe / M.2 (kernel modules). Runs first so the reboot reminder show_proxmenux_logo
# at the end only appears when we actually touched kernel modules. msg_title "$(translate 'Coral TPU Installation')"
if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then
msg_info2 "$(translate 'Coral M.2 / PCIe detected — installing gasket and apex kernel modules...')"
install_gasket_apex_dkms
fi
# Branch 2 — USB (user-space runtime). # Force non-interactive apt/dpkg for the whole run so cleanup_broken_gasket_dkms
if [[ "$CORAL_USB_COUNT" -gt 0 ]]; then # and the two install paths never get blocked by package-maintainer prompts.
msg_info2 "$(translate 'Coral USB Accelerator detected — installing Edge TPU runtime...')" export DEBIAN_FRONTEND=noninteractive
install_libedgetpu_runtime
fi
echo # Branch 1 — PCIe / M.2 (kernel modules). Runs first so the reboot reminder
if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then # at the end only appears when we actually touched kernel modules.
msg_success "$(translate 'Coral TPU drivers installed and loaded successfully.')" if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then
restart_prompt msg_info2 "$(translate 'Coral M.2 / PCIe detected — installing gasket and apex kernel modules...')"
else install_gasket_apex_dkms
# USB-only install. No reboot required; the udev rules and runtime are fi
# already active. Ready to passthrough the device to an LXC/VM.
msg_success "$(translate 'Coral USB runtime installed. No reboot required.')" # Branch 2 — USB (user-space runtime).
msg_success "$(translate 'Completed. Press Enter to return to menu...')" if [[ "$CORAL_USB_COUNT" -gt 0 ]]; then
read -r msg_info2 "$(translate 'Coral USB Accelerator detected — installing Edge TPU runtime...')"
fi install_libedgetpu_runtime
fi
echo
if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then
msg_success "$(translate 'Coral TPU drivers installed and loaded successfully.')"
restart_prompt
else
# USB-only install. No reboot required; the udev rules and runtime are
# already active. Ready to passthrough the device to an LXC/VM.
msg_success "$(translate 'Coral USB runtime installed. No reboot required.')"
msg_success "$(translate 'Completed. Press Enter to return to menu...')"
read -r
fi
;;
remove)
# Confirm before purging — gasket-dkms uninstall is destructive
# to LXC containers that have apex passthrough; warn the user.
if ! dialog --backtitle "ProxMenux" \
--title "$(translate 'Coral TPU Uninstall')" \
--yesno "\n$(translate 'This will remove the Coral TPU drivers (gasket DKMS + libedgetpu) and related configuration. Any LXC container with apex passthrough will lose access to /dev/apex_* after reboot. Continue?')" \
14 78; then
exit 0
fi
show_proxmenux_logo
msg_title "$(translate 'Coral TPU Uninstall')"
export DEBIAN_FRONTEND=noninteractive
complete_coral_uninstall
# PCIe path created kernel modules → a reboot is the cleanest
# way to flush them. USB-only uninstall doesn't need one.
if $CORAL_PCIE_INSTALLED; then
restart_prompt
else
msg_success "$(translate 'Completed. Press Enter to return to menu...')"
read -r
fi
;;
cancel|*)
exit 0
;;
esac
} }
main main