mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-05-22 00:24:48 +00:00
Update Beta 1.2.1.2
This commit is contained in:
Binary file not shown.
@@ -1 +1 @@
|
||||
150694a49a5b0a4546a2bf5fedcc0914d37666d0cdeac1d9fdc58793c131b4bd ProxMenux-1.2.1.1-beta.AppImage
|
||||
0d74347d2feae2be4b8c6d62d6cd9b1b15b94ef431c088b5580560f6b4751594 ProxMenux-1.2.1.2-beta.AppImage
|
||||
|
||||
@@ -271,7 +271,7 @@ export function Login({ onLogin }: LoginProps) {
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.0</p>
|
||||
<p className="text-center text-sm text-muted-foreground">ProxMenux Monitor v1.2.1.2-beta</p>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
|
||||
@@ -814,7 +814,7 @@ export function ProxmoxDashboard() {
|
||||
</Tabs>
|
||||
|
||||
<footer className="mt-8 md:mt-12 pt-4 md:pt-6 border-t border-border text-center text-xs md:text-sm text-muted-foreground">
|
||||
<p className="font-medium mb-2">ProxMenux Monitor v1.2.0</p>
|
||||
<p className="font-medium mb-2">ProxMenux Monitor v1.2.1.2-beta</p>
|
||||
<p>
|
||||
<a
|
||||
href="https://ko-fi.com/macrimi"
|
||||
|
||||
@@ -6,7 +6,7 @@ import { Dialog, DialogContent, DialogTitle } from "./ui/dialog"
|
||||
import { X, Sparkles, Thermometer, Activity, HardDrive, Shield, Globe, Cpu, Zap, Sliders, Wrench, RefreshCw, Server } from "lucide-react"
|
||||
import { Checkbox } from "./ui/checkbox"
|
||||
|
||||
const APP_VERSION = "1.2.1.1-beta" // Sync with AppImage/package.json
|
||||
const APP_VERSION = "1.2.1.2-beta" // Sync with AppImage/package.json
|
||||
|
||||
interface ReleaseNote {
|
||||
date: string
|
||||
@@ -18,6 +18,30 @@ interface ReleaseNote {
|
||||
}
|
||||
|
||||
export const CHANGELOG: Record<string, ReleaseNote> = {
|
||||
"1.2.1.2-beta": {
|
||||
date: "May 20, 2026",
|
||||
changes: {
|
||||
added: [
|
||||
"Coral TPU installer - Uninstall path mirroring the NVIDIA flow, and registry-driven update notifications for both the PCIe gasket-dkms driver (tracked against feranick/gasket-driver) and the USB libedgetpu1 runtime (tracked via apt)",
|
||||
"Disk I/O severity tiers - Sliding 24h window classifies dmesg ATA/SCSI errors into silent (0-10), WARNING (11-100) and CRITICAL (100+ or any hard error like UNC / Buffer I/O / Sense Key Hardware Error), so quiet days stay quiet and a single Buffer I/O event still pages immediately",
|
||||
"Quiet Hours buffering - Events suppressed during a channel's quiet window are now persisted to SQLite and released as a grouped summary when the window closes, instead of being silently dropped",
|
||||
],
|
||||
changed: [
|
||||
"Burst aggregation wording - Burst summaries now report only the additional events that arrived after the initial individual alert, so the operator no longer sees the first event counted twice (\"+N more X in window\" instead of the old \"N X in window\" overlap)",
|
||||
"Known-error classifier - Word-boundary regex on ATA/UNC patterns so kernel messages like nvidia_uvm:FatalError are no longer misclassified as ATA cable issues",
|
||||
"Health journal context - Excludes proxmenux-monitor.service systemd lines so internal watchdog SIGKILLs no longer leak into the body of unrelated kernel events",
|
||||
"Resolved notifications severity - The \"previous severity\" now matches the severity the user actually saw in the notification, not whatever escalated value silently landed in the DB during the 24h same-key cooldown",
|
||||
"log2ram apply path - The auto/update flow now restarts log2ram after writing the new size, so a configured 512M actually takes effect on the running tmpfs (previously left at 128M until a manual restart)",
|
||||
"VM/CT control errors - Failed start/stop/restart now surfaces the real pvesh stderr (e.g. \"no space left on device\") in the UI toast and fires a vm_fail / ct_fail notification, instead of a bare 500 INTERNAL SERVER ERROR",
|
||||
"Mobile design of Quiet Hours / Daily Digest - Time inputs are now full-height with inline labels instead of the cramped grid layout that overflowed on narrow screens",
|
||||
],
|
||||
fixed: [
|
||||
"ATA disk error not recorded - disk_observations is now written before the SMART gate, so transient errors that don't yet trip SMART still build the per-disk history",
|
||||
"Quiet Hours toggle not persisting - get_settings now returns the per-channel quiet_*/digest_* fields so the toggle's state reloads correctly after a refresh",
|
||||
"Frontend 401 cascade - Login screen no longer swallows the 401 forever after a brief stale-token state; the dedup flag is cleared on mount and on successful login",
|
||||
],
|
||||
},
|
||||
},
|
||||
"1.2.1.1-beta": {
|
||||
date: "May 9, 2026",
|
||||
changes: {
|
||||
|
||||
@@ -3584,7 +3584,7 @@ ${observationsHtml}
|
||||
<!-- Footer -->
|
||||
<div class="rpt-footer">
|
||||
<div>Report generated by ProxMenux Monitor</div>
|
||||
<div>ProxMenux Monitor v1.2.0</div>
|
||||
<div>ProxMenux Monitor v1.2.1.2-beta</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "ProxMenux-Monitor",
|
||||
"version": "1.2.1.1-beta",
|
||||
"version": "1.2.1.2-beta",
|
||||
"description": "Proxmox System Monitoring Dashboard",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
|
||||
@@ -1026,9 +1026,16 @@ def _capture_health_journal_context(categories: list, reason: str = '') -> str:
|
||||
# line like "[HealthPersistence] Database initialized with 13 tables"
|
||||
# leaks into the AI context because grep -iE 'ata' matches the
|
||||
# substring "ata" in "dATAbase". Self-logs are never system evidence.
|
||||
#
|
||||
# Also exclude systemd actions on the proxmenux-monitor unit itself
|
||||
# (e.g. "proxmenux-monitor.service: Killed process 2010621 with
|
||||
# signal SIGKILL"). When a kernel event fires within the same
|
||||
# 10-min window as one of our own watchdog kills, the SIGKILL
|
||||
# line would otherwise leak into the journal_context and the AI
|
||||
# would paste it under the unrelated event as "📝 Log: …".
|
||||
cmd = (
|
||||
f"journalctl -b 0 --since='10 minutes ago' --no-pager -n 500 2>/dev/null | "
|
||||
f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]' | "
|
||||
f"grep -vE 'AppRun\\[|proxmenux-auth|\\[HealthPersistence\\]|\\[ProxMenux\\]|\\[NotificationManager\\]|\\[AIEnhancer\\]|proxmenux-monitor\\.service' | "
|
||||
f"grep -iE '{pattern}' | tail -n 30"
|
||||
)
|
||||
|
||||
@@ -10344,7 +10351,7 @@ def api_health():
|
||||
return jsonify({
|
||||
'status': 'healthy',
|
||||
'timestamp': datetime.now().isoformat(),
|
||||
'version': '1.2.1.1-beta'
|
||||
'version': '1.2.1.2-beta'
|
||||
})
|
||||
|
||||
# ─── User-configurable health thresholds ─────────────────────────────────────
|
||||
@@ -10737,7 +10744,7 @@ def api_info():
|
||||
"""Root endpoint with API information"""
|
||||
return jsonify({
|
||||
'name': 'ProxMenux Monitor API',
|
||||
'version': '1.2.1.1-beta',
|
||||
'version': '1.2.1.2-beta',
|
||||
'endpoints': [
|
||||
'/api/system',
|
||||
'/api/system-info',
|
||||
@@ -11387,7 +11394,7 @@ if __name__ == '__main__':
|
||||
try:
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
MONITOR_VERSION = '1.2.1.1-beta'
|
||||
MONITOR_VERSION = '1.2.1.2-beta'
|
||||
db_path = Path('/usr/local/share/proxmenux/health_monitor.db')
|
||||
if db_path.exists():
|
||||
conn = sqlite3.connect(str(db_path), timeout=10)
|
||||
|
||||
@@ -156,6 +156,90 @@ def _detect_nvidia_xfree86() -> Optional[dict]:
|
||||
}
|
||||
|
||||
|
||||
# ── Coral TPU host driver (PCIe gasket-dkms + USB libedgetpu1) ──
|
||||
#
|
||||
# Two install paths share the same registry type because the user
|
||||
# thinks of them as one "Coral driver" install. The detector returns
|
||||
# one entry per path that is actually present on the host, so a system
|
||||
# with both M.2 and USB Coral devices gets two entries — independent
|
||||
# update streams (gasket-dkms from feranick/gasket-driver on GitHub,
|
||||
# libedgetpu1-std from Google's apt repo).
|
||||
|
||||
|
||||
def _detect_coral_host() -> list[dict]:
|
||||
out: list[dict] = []
|
||||
|
||||
# PCIe / M.2 — gasket-dkms package version, falling back to the
|
||||
# registered DKMS version if the package was force-removed but the
|
||||
# built modules still exist.
|
||||
pcie_version: Optional[str] = None
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["dpkg-query", "-W", "-f=${Status}|${Version}", "gasket-dkms"],
|
||||
capture_output=True, text=True, timeout=3,
|
||||
)
|
||||
if r.returncode == 0 and "ok installed" in r.stdout:
|
||||
pcie_version = r.stdout.split("|", 1)[1].strip()
|
||||
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
if not pcie_version:
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["dkms", "status"], capture_output=True, text=True, timeout=3,
|
||||
)
|
||||
if r.returncode == 0:
|
||||
for line in r.stdout.splitlines():
|
||||
if line.startswith("gasket"):
|
||||
# "gasket, 1.0, ..." or "gasket/1.0, ..."
|
||||
m = re.match(r"^gasket[, /]([^,\s]+)", line)
|
||||
if m:
|
||||
pcie_version = m.group(1)
|
||||
break
|
||||
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
|
||||
pass
|
||||
if pcie_version:
|
||||
out.append({
|
||||
"id": "coral-host-pcie",
|
||||
"type": "coral_host",
|
||||
"name": "Coral TPU Driver (gasket-dkms)",
|
||||
"current_version": pcie_version,
|
||||
"menu_label": "GPU & TPU → Coral TPU",
|
||||
"menu_script": "scripts/gpu_tpu/install_coral.sh",
|
||||
"_coral_variant": "pcie",
|
||||
})
|
||||
|
||||
# USB — libedgetpu1-std (default) or libedgetpu1-max if the user
|
||||
# opted into the overclocked runtime. Either one means the USB
|
||||
# path is installed.
|
||||
usb_version: Optional[str] = None
|
||||
usb_pkg: Optional[str] = None
|
||||
for pkg in ("libedgetpu1-std", "libedgetpu1-max"):
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["dpkg-query", "-W", "-f=${Status}|${Version}", pkg],
|
||||
capture_output=True, text=True, timeout=3,
|
||||
)
|
||||
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
|
||||
continue
|
||||
if r.returncode == 0 and "ok installed" in r.stdout:
|
||||
usb_version = r.stdout.split("|", 1)[1].strip()
|
||||
usb_pkg = pkg
|
||||
break
|
||||
if usb_version and usb_pkg:
|
||||
out.append({
|
||||
"id": "coral-host-usb",
|
||||
"type": "coral_host",
|
||||
"name": f"Coral TPU Runtime ({usb_pkg})",
|
||||
"current_version": usb_version,
|
||||
"menu_label": "GPU & TPU → Coral TPU",
|
||||
"menu_script": "scripts/gpu_tpu/install_coral.sh",
|
||||
"_coral_variant": "usb",
|
||||
"_coral_pkg": usb_pkg,
|
||||
})
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _detect_oci_apps() -> list[dict]:
|
||||
"""Bridge to the OCI manager so every OCI-installed app shows up
|
||||
in the registry without a per-app detector here. The OCI manager
|
||||
@@ -350,6 +434,7 @@ def _detect_lxc_containers() -> list[dict]:
|
||||
# framework normalises both shapes.
|
||||
_DETECTORS: list[Callable[[], Any]] = [
    _detect_nvidia_xfree86,   # Optional[dict]: a single entry or None
    _detect_coral_host,       # list[dict]: one entry per Coral path found (PCIe, USB)
    _detect_oci_apps,         # list[dict]
    _detect_lxc_containers,   # list[dict]
]
|
||||
@@ -834,9 +919,171 @@ def _check_lxc_updates(entry: dict) -> dict:
|
||||
}
|
||||
|
||||
|
||||
# ── Coral driver checker ──
|
||||
#
|
||||
# Two upstreams to track:
|
||||
#
|
||||
# PCIe (gasket-dkms) → feranick/gasket-driver on GitHub. The fork is
|
||||
# actively maintained; releases are tagged like "v1.0-22". We pull
|
||||
# the latest tag from the GitHub API and compare against the
|
||||
# installed gasket-dkms Debian version. Because the Debian version
|
||||
# string ("1.0-18") doesn't perfectly match the upstream tag
|
||||
# ("v1.0-22"), we normalise both sides to the trailing "-N" build
|
||||
# number for the comparison. Strict semver isn't workable here.
|
||||
#
|
||||
# USB (libedgetpu1-std/-max) → Google's apt repo. `apt-cache policy`
|
||||
# reports installed + candidate versions in one shot, no internet
|
||||
# round-trip required (apt's own cache is the canonical answer).
|
||||
#
|
||||
# Cache TTL for the GitHub call is 7 days — feranick's release cadence
|
||||
# is roughly monthly, matching NVIDIA's pattern. The cache lives in
|
||||
# memory so AppImage restarts refresh it for free.
|
||||
|
||||
# GitHub "owner/repo" slug interpolated into the tags API URL.
_CORAL_GASKET_REPO = "feranick/gasket-driver"
# Maximum age, in seconds (7 days), of a cached tag before GitHub is queried again.
_CORAL_CACHE_TTL = 7 * 86400
# Process-local cache: "latest_tag" is the last tag selected, "fetched_at" the
# time.time() value of that fetch (0 means nothing has been fetched yet).
_coral_gasket_cache: dict[str, Any] = {"latest_tag": None, "fetched_at": 0}
|
||||
|
||||
|
||||
def _coral_build_number(s: str) -> int:
|
||||
"""Extract the trailing build number from a Coral version string.
|
||||
|
||||
Handles both upstream tag form (``v1.0-22``, ``1.0-22``) and the
|
||||
Debian package form (``1.0-22``, ``1.0-18+pmx1``). Returns 0 if no
|
||||
trailing ``-N`` segment exists — that pushes "no build number"
|
||||
versions to the lowest rank so any tagged release shows as newer.
|
||||
"""
|
||||
if not s:
|
||||
return 0
|
||||
m = re.search(r"-(\d+)", s)
|
||||
if not m:
|
||||
return 0
|
||||
try:
|
||||
return int(m.group(1))
|
||||
except (ValueError, TypeError):
|
||||
return 0
|
||||
|
||||
|
||||
def _fetch_gasket_latest_tag(force: bool = False) -> Optional[str]:
    """Return the gasket-driver tag with the highest build number.

    A previously fetched tag is reused while it is younger than
    ``_CORAL_CACHE_TTL`` unless ``force`` is true. Otherwise the first
    five tags are requested from the GitHub API and the one whose
    ``_coral_build_number`` is largest is selected (the first such tag
    wins on ties) and written to the cache.

    If the request fails or the payload is not a non-empty list, the
    previously cached tag (possibly None) is returned and the cache is
    left untouched.
    """
    now = time.time()
    cached_tag = _coral_gasket_cache["latest_tag"]
    still_fresh = now - _coral_gasket_cache["fetched_at"] < _CORAL_CACHE_TTL
    if not force and cached_tag and still_fresh:
        return cached_tag

    url = f"https://api.github.com/repos/{_CORAL_GASKET_REPO}/tags?per_page=5"
    request_headers = {
        "User-Agent": "ProxMenux-Monitor/1.0",
        "Accept": "application/vnd.github+json",
    }
    try:
        req = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(req, timeout=15) as resp:
            payload = resp.read().decode("utf-8", errors="replace")
            tags = json.loads(payload)
    except Exception as e:
        print(f"[ProxMenux] gasket-driver tag fetch failed: {e}")
        return _coral_gasket_cache.get("latest_tag")

    if not isinstance(tags, list) or not tags:
        return _coral_gasket_cache.get("latest_tag")

    # Tags are not guaranteed to be listed chronologically, so rank by build
    # number rather than trusting list order. max() keeps the first of equal
    # candidates, the same tie-break as a strict ">" scan.
    names = [(t.get("name") or "") for t in tags if isinstance(t, dict)]
    best: Optional[str] = max(names, key=_coral_build_number) if names else None

    if best:
        _coral_gasket_cache["latest_tag"] = best
        _coral_gasket_cache["fetched_at"] = now
    return best
|
||||
|
||||
|
||||
def _apt_cache_candidate(pkg: str) -> Optional[str]:
|
||||
"""Return the candidate (newest available) version for ``pkg`` from
|
||||
the local apt cache. Caller is responsible for the package existing —
|
||||
a missing package returns None silently.
|
||||
"""
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["apt-cache", "policy", pkg],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
|
||||
return None
|
||||
if r.returncode != 0:
|
||||
return None
|
||||
for line in r.stdout.splitlines():
|
||||
line = line.strip()
|
||||
if line.startswith("Candidate:"):
|
||||
cand = line.split(":", 1)[1].strip()
|
||||
if cand and cand != "(none)":
|
||||
return cand
|
||||
return None
|
||||
|
||||
|
||||
def _check_coral_host(entry: dict) -> dict:
    """Check one Coral registry entry for an available update.

    The entry's ``_coral_variant`` selects the upstream:

    * ``"pcie"`` — compares the build number of the installed version with
      that of the latest gasket-driver tag.
    * ``"usb"`` — compares the installed version with apt's candidate for
      ``_coral_pkg`` (default ``libedgetpu1-std``) via
      ``dpkg --compare-versions``; if dpkg cannot be run, any difference
      between the two strings counts as an update.

    Returns a dict with ``available``, ``latest`` (None unless an update is
    available), ``last_check`` and ``error``; successful checks also echo
    ``_coral_variant`` (and ``_coral_pkg`` for USB).
    """
    variant = entry.get("_coral_variant") or ""
    installed = entry.get("current_version") or ""

    def _failure(reason: str) -> dict:
        # Uniform shape for every "could not determine" outcome.
        return {"available": False, "latest": None,
                "last_check": _now_iso(),
                "error": reason}

    if variant == "pcie":
        latest_tag = _fetch_gasket_latest_tag()
        if not latest_tag:
            return _failure("could not fetch gasket-driver tags")
        has_update = _coral_build_number(latest_tag) > _coral_build_number(installed)
        return {
            "available": has_update,
            "latest": latest_tag if has_update else None,
            "last_check": _now_iso(),
            "error": None,
            "_coral_variant": "pcie",
        }

    if variant == "usb":
        pkg = entry.get("_coral_pkg") or "libedgetpu1-std"
        candidate = _apt_cache_candidate(pkg)
        if not candidate:
            return _failure(f"apt-cache policy returned no candidate for {pkg}")
        # Let dpkg decide ordering (exit status 0 means "installed < candidate").
        # Only when dpkg itself cannot be executed do we fall back to treating
        # any string difference as an update.
        try:
            verdict = subprocess.run(
                ["dpkg", "--compare-versions", installed, "lt", candidate],
                capture_output=True, timeout=3,
            )
        except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
            has_update = candidate != installed
        else:
            has_update = verdict.returncode == 0
        return {
            "available": has_update,
            "latest": candidate if has_update else None,
            "last_check": _now_iso(),
            "error": None,
            "_coral_variant": "usb",
            "_coral_pkg": pkg,
        }

    return _failure(f"unknown coral variant: {variant}")
|
||||
|
||||
|
||||
# Update-checker dispatch table. Each value takes a registry entry dict and
# returns an update-check result dict.
# NOTE(review): keys look like the entry's "type" field (the Coral detector
# emits "type": "coral_host") — confirm against the lookup in check_for_updates.
_CHECKERS: dict[str, Callable[[dict], dict]] = {
    "oci_app": _check_oci_app,
    "nvidia_xfree86": _check_nvidia_xfree86,
    "coral_host": _check_coral_host,
    "lxc": _check_lxc_updates,
}
|
||||
|
||||
@@ -890,7 +1137,8 @@ def check_for_updates(force: bool = False) -> list[dict]:
|
||||
# the LXC checker's counts dropped on the floor and the
|
||||
# frontend badge couldn't render.
|
||||
for extra_key in ("_packages", "_upgrade_kind", "_kernel",
|
||||
"_kernel_note", "_count", "_security_count"):
|
||||
"_kernel_note", "_count", "_security_count",
|
||||
"_coral_variant", "_coral_pkg"):
|
||||
if extra_key in result:
|
||||
it["update_check"][extra_key] = result[extra_key]
|
||||
|
||||
|
||||
@@ -382,9 +382,40 @@ class JournalWatcher:
|
||||
self._recent_events: Dict[str, float] = {}
|
||||
self._dedup_window = 30 # seconds
|
||||
|
||||
# 24h anti-cascade for disk I/O + filesystem errors (keyed by device name)
|
||||
# 24h anti-cascade for disk I/O + filesystem errors. The dict
|
||||
# key includes a tier suffix (`sdh:warning`, `sdh:critical`)
|
||||
# so a disk in WARNING cooldown can still escalate to CRITICAL
|
||||
# within the same 24h if the rate accelerates.
|
||||
self._disk_io_notified: Dict[str, float] = {}
|
||||
self._DISK_IO_COOLDOWN = 86400 # 24 hours
|
||||
|
||||
# Sliding 24h window of ATA error timestamps per disk, used to
|
||||
# decide notification severity tier. Don't blindly trust the
|
||||
# SMART firmware self-report — the Google "Failure Trends"
|
||||
# paper showed ~36% of failed drives gave no SMART warning.
|
||||
# Rate-based escalation catches the dying drives that SMART
|
||||
# would never flag until they were already bricked.
|
||||
from collections import deque as _deque
|
||||
self._disk_error_window: Dict[str, "_deque[float]"] = {}
|
||||
self._DISK_ERROR_WINDOW_SECS = 86400 # 24h
|
||||
# Tiers calibrated for homelab/SMB Proxmox usage:
|
||||
# * 0-10/24h → transient noise (cable rattle, sleep/wake,
|
||||
# PHY retrain). Silent observation only.
|
||||
# * 11-100/24h → WARNING. Notify once per 24h.
|
||||
# * >100/24h → CRITICAL. Active failure.
|
||||
# Hard errors (Buffer I/O, UNC, medium error, unrecovered read)
|
||||
# are CRITICAL on the FIRST occurrence regardless of count —
|
||||
# those are uncorrectable data losses, not transient noise.
|
||||
self._DISK_TIER_WARNING = 10
|
||||
self._DISK_TIER_CRITICAL = 100
|
||||
# Hard-error pattern: matches any of the kernel-reported
|
||||
# signals that mean data was lost or could not be recovered.
|
||||
self._DISK_HARD_ERR_RE = re.compile(
|
||||
r'(Buffer I/O error|UNC\b|Medium Error|medium error'
|
||||
r'|Unrecovered read error|unrecovered read error'
|
||||
r'|Sense Key.*Hardware Error)',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# Track when the last full backup job notification was sent
|
||||
# so we can suppress per-guest "Starting Backup of VM ..." noise
|
||||
@@ -1046,73 +1077,107 @@ class JournalWatcher:
|
||||
else:
|
||||
resolved = re.sub(r'\d+$', '', raw_device) if raw_device.startswith('sd') else raw_device
|
||||
|
||||
# ── ALWAYS persist the observation, regardless of SMART ──
|
||||
# ── ALWAYS persist the observation, regardless of severity ──
|
||||
# The disk_observation_contract is explicit (memory note
|
||||
# disk-observation-contract): every kernel-surfaced disk
|
||||
# error must be recorded in disk_observations *even when
|
||||
# SMART reports PASSED*. Silent errors on a "healthy" disk
|
||||
# are exactly the early-warning signal the modal histogram
|
||||
# exists to surface ("324 connection errors on this disk").
|
||||
# Previously this line lived AFTER a `return` gate keyed on
|
||||
# smart_health != 'FAILED', so the 3162 ata8 errors on
|
||||
# .1.10 (PASSED SMART) all dropped on the floor instead of
|
||||
# accumulating in the per-disk audit history.
|
||||
# error must be recorded in disk_observations. The modal
|
||||
# histogram is the per-disk audit trail; it must reflect
|
||||
# everything the kernel saw, even noise.
|
||||
self._record_disk_io_observation(resolved, msg)
|
||||
|
||||
# ── Gate 1: only NOTIFY when SMART reports FAILED ──
|
||||
# Observation is already saved above. We avoid spamming a
|
||||
# CRITICAL notification for transient ATA/SCSI noise on
|
||||
# otherwise-healthy disks — the modal histogram surfaces
|
||||
# those without paging the user at 3 AM.
|
||||
# ── Update sliding 24h rate window for this disk ──
|
||||
now = time.time()
|
||||
from collections import deque as _deque
|
||||
window = self._disk_error_window.setdefault(resolved, _deque())
|
||||
window.append(now)
|
||||
cutoff = now - self._DISK_ERROR_WINDOW_SECS
|
||||
while window and window[0] < cutoff:
|
||||
window.popleft()
|
||||
rate_24h = len(window)
|
||||
|
||||
# ── Decide severity tier ──
|
||||
# * hard error (UNC, Buffer I/O, medium, unrecovered read)
|
||||
# → CRITICAL on first occurrence, no count threshold.
|
||||
# These are uncorrectable: data is gone.
|
||||
# * SMART self-report FAILED → CRITICAL (firmware admits it).
|
||||
# * rate_24h > _DISK_TIER_CRITICAL → CRITICAL (active failure
|
||||
# even if SMART still says PASSED).
|
||||
# * rate_24h > _DISK_TIER_WARNING → WARNING (suspicious,
|
||||
# worth a heads-up).
|
||||
# * Otherwise → silent observation only (transient noise).
|
||||
is_hard_error = bool(self._DISK_HARD_ERR_RE.search(msg))
|
||||
smart_health = self._quick_smart_health(resolved)
|
||||
if smart_health != 'FAILED':
|
||||
if is_hard_error or smart_health == 'FAILED' or rate_24h > self._DISK_TIER_CRITICAL:
|
||||
tier = 'critical'
|
||||
elif rate_24h > self._DISK_TIER_WARNING:
|
||||
tier = 'warning'
|
||||
else:
|
||||
# Silent — observation already saved, that's enough.
|
||||
return
|
||||
|
||||
# ── Gate 2: 24-hour dedup per device ──
|
||||
# Check both in-memory cache AND the DB (user dismiss clears DB cooldowns).
|
||||
# If user dismissed the error, _clear_disk_io_cooldown() removed the DB
|
||||
# entry, so we should refresh from DB to get the real state.
|
||||
now = time.time()
|
||||
|
||||
# First check in-memory cache
|
||||
last_notified = self._disk_io_notified.get(resolved, 0)
|
||||
|
||||
# ── 24h anti-cascade per (device, tier) ──
|
||||
# Independent cooldown per tier so a disk that fires WARNING
|
||||
# at noon can still escalate to CRITICAL the same day when
|
||||
# the rate jumps past _DISK_TIER_CRITICAL — they're
|
||||
# different keys.
|
||||
cooldown_key = f'{resolved}:{tier}'
|
||||
last_notified = self._disk_io_notified.get(cooldown_key, 0)
|
||||
if now - last_notified < self._DISK_IO_COOLDOWN:
|
||||
# In-memory says we already notified. But user might have dismissed
|
||||
# the error, which clears the DB. Re-check DB to be sure.
|
||||
db_ts = self._get_disk_io_cooldown_from_db(resolved)
|
||||
# In-memory says cooldown active. Re-verify in DB in
|
||||
# case the user dismissed (which clears the DB entry).
|
||||
db_ts = self._get_disk_io_cooldown_from_db(cooldown_key)
|
||||
if db_ts is not None and now - db_ts < self._DISK_IO_COOLDOWN:
|
||||
return # DB confirms cooldown is still active
|
||||
# DB says cooldown was cleared (user dismissed) - proceed to notify
|
||||
# Update in-memory cache
|
||||
del self._disk_io_notified[resolved]
|
||||
|
||||
self._disk_io_notified[resolved] = now
|
||||
self._save_disk_io_notified(resolved, now)
|
||||
|
||||
return
|
||||
# Dismissed → DB cleared → proceed to notify and refresh state.
|
||||
del self._disk_io_notified[cooldown_key]
|
||||
|
||||
self._disk_io_notified[cooldown_key] = now
|
||||
self._save_disk_io_notified(cooldown_key, now)
|
||||
|
||||
# ── Build enriched notification ──
|
||||
device_info = self._identify_block_device(resolved)
|
||||
|
||||
|
||||
parts = []
|
||||
parts.append(f'Disk /dev/{resolved}: I/O errors detected')
|
||||
parts.append('SMART status: FAILED -- disk is failing')
|
||||
|
||||
if tier == 'critical':
|
||||
if is_hard_error:
|
||||
parts.append(f'Disk /dev/{resolved}: UNRECOVERABLE error detected')
|
||||
elif smart_health == 'FAILED':
|
||||
parts.append(f'Disk /dev/{resolved}: SMART reports FAILED')
|
||||
else:
|
||||
parts.append(
|
||||
f'Disk /dev/{resolved}: high I/O error rate '
|
||||
f'({rate_24h} errors in last 24h)'
|
||||
)
|
||||
else: # warning
|
||||
parts.append(
|
||||
f'Disk /dev/{resolved}: elevated I/O error rate '
|
||||
f'({rate_24h} errors in last 24h)'
|
||||
)
|
||||
|
||||
parts.append(f'SMART status: {smart_health}')
|
||||
|
||||
if device_info:
|
||||
parts.append(f'Device: {device_info}')
|
||||
else:
|
||||
parts.append(f'Device: /dev/{resolved}')
|
||||
|
||||
|
||||
# Translate the raw kernel error code
|
||||
detail = self._translate_ata_error(msg)
|
||||
if detail:
|
||||
parts.append(f'Error detail: {detail}')
|
||||
|
||||
parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.')
|
||||
|
||||
if tier == 'critical':
|
||||
parts.append(f'Action: Replace disk /dev/{resolved} as soon as possible.')
|
||||
else:
|
||||
parts.append(
|
||||
f'Action: Monitor /dev/{resolved} closely. '
|
||||
f'Plan a backup verification and replacement if rate grows.'
|
||||
)
|
||||
parts.append(f' Check details: smartctl -a /dev/{resolved}')
|
||||
|
||||
|
||||
enriched = '\n'.join(parts)
|
||||
dev_display = f'/dev/{resolved}'
|
||||
|
||||
|
||||
# Capture journal context for AI enrichment.
|
||||
# `raw_device` is the original ATA-port literal extracted by the regex
|
||||
# (e.g. "ata8"). The previous code used a name `ata_port` that was
|
||||
@@ -1123,12 +1188,15 @@ class JournalWatcher:
|
||||
keywords=[resolved, raw_device, 'I/O error', 'exception', 'SMART'],
|
||||
lines=30
|
||||
)
|
||||
|
||||
self._emit('disk_io_error', 'CRITICAL', {
|
||||
|
||||
severity = 'CRITICAL' if tier == 'critical' else 'WARNING'
|
||||
self._emit('disk_io_error', severity, {
|
||||
'device': dev_display,
|
||||
'reason': enriched,
|
||||
'hostname': self._hostname,
|
||||
'smart_status': 'FAILED',
|
||||
'smart_status': smart_health,
|
||||
'rate_24h': rate_24h,
|
||||
'tier': tier,
|
||||
'_journal_context': journal_ctx,
|
||||
}, entity='disk', entity_id=resolved)
|
||||
return
|
||||
@@ -2229,6 +2297,17 @@ class PollingCollector:
|
||||
self._notified_proxmenux_beta_version: str | None = None
|
||||
# In-memory cache: error_key -> last notification timestamp
|
||||
self._last_notified: Dict[str, float] = {}
|
||||
# In-memory cache: error_key -> severity actually sent in the last
|
||||
# notification. Decoupled from `_known_errors[k].severity` (which
|
||||
# always reflects the most-recent DB row) so a recovery message
|
||||
# quotes the same severity the user saw. Without this, an error
|
||||
# that fired WARNING, silently escalated to CRITICAL during its
|
||||
# 24h same-key cooldown, then resolved, would be reported as
|
||||
# "previous severity: CRITICAL" — confusing the operator who only
|
||||
# ever saw the WARNING. Not persisted across restarts: the
|
||||
# post-restart first-poll guard (`_first_poll_done`) already
|
||||
# suppresses spurious recoveries.
|
||||
self._notified_severity: Dict[str, str] = {}
|
||||
# Track known error keys + metadata so we can detect new ones AND emit recovery
|
||||
# Dict[error_key, dict(category, severity, reason, first_seen, error_key)]
|
||||
self._known_errors: Dict[str, dict] = {}
|
||||
@@ -2572,6 +2651,11 @@ class PollingCollector:
|
||||
# Track that we notified
|
||||
self._last_notified[error_key] = now
|
||||
self._persist_last_notified(error_key, now)
|
||||
# Snapshot the severity we actually delivered, so a future
|
||||
# recovery message quotes the same value the user saw — not
|
||||
# whatever silently-escalated severity ended up in the DB
|
||||
# during the same-key 24h cooldown window.
|
||||
self._notified_severity[error_key] = emit_severity
|
||||
|
||||
# ── Emit recovery notifications for errors that resolved ──
|
||||
resolved_keys = set(self._known_errors.keys()) - set(current_keys.keys())
|
||||
@@ -2674,24 +2758,32 @@ class PollingCollector:
|
||||
else:
|
||||
clean_reason = 'Condition resolved'
|
||||
|
||||
# `original_severity` must match what the user actually saw
|
||||
# in the most-recent notification for this error, not the
|
||||
# latest DB severity. See `_notified_severity` docstring at
|
||||
# __init__ for the failure mode this avoids.
|
||||
original_severity = self._notified_severity.get(
|
||||
key, old_meta.get('severity', 'WARNING'),
|
||||
)
|
||||
data = {
|
||||
'hostname': self._hostname,
|
||||
'category': category,
|
||||
'reason': clean_reason,
|
||||
'error_key': key,
|
||||
'severity': 'OK',
|
||||
'original_severity': old_meta.get('severity', 'WARNING'),
|
||||
'original_severity': original_severity,
|
||||
'first_seen': first_seen,
|
||||
'duration': duration,
|
||||
'is_recovery': True,
|
||||
}
|
||||
|
||||
|
||||
self._queue.put(NotificationEvent(
|
||||
'error_resolved', 'OK', data, source='health',
|
||||
entity=entity, entity_id=eid or key,
|
||||
))
|
||||
|
||||
|
||||
self._last_notified.pop(key, None)
|
||||
self._notified_severity.pop(key, None)
|
||||
|
||||
self._known_errors = current_keys
|
||||
self._first_poll_done = True
|
||||
@@ -3356,6 +3448,41 @@ class PollingCollector:
|
||||
}
|
||||
return 'nvidia_driver_update_available', data
|
||||
|
||||
if item_type == 'coral_host':
|
||||
variant = update.get('_coral_variant') or item.get('_coral_variant') or ''
|
||||
if variant == 'pcie':
|
||||
variant_label = 'gasket-dkms (PCIe / M.2) driver'
|
||||
upgrade_reason = (
|
||||
'feranick/gasket-driver has published a newer release. '
|
||||
'The installer rebuilds the gasket + apex kernel modules '
|
||||
'via DKMS against the running kernel.'
|
||||
)
|
||||
reboot_note = (
|
||||
'Reinstalling rebuilds the DKMS module and requires a '
|
||||
'reboot to load the new driver.'
|
||||
)
|
||||
elif variant == 'usb':
|
||||
pkg = update.get('_coral_pkg') or item.get('_coral_pkg') or 'libedgetpu1'
|
||||
variant_label = f'{pkg} runtime (USB Accelerator)'
|
||||
upgrade_reason = (
|
||||
'A newer Edge TPU runtime is available from the Google '
|
||||
'Coral apt repository.'
|
||||
)
|
||||
reboot_note = (
|
||||
'The USB runtime upgrade does not require a reboot.'
|
||||
)
|
||||
else:
|
||||
variant_label = 'Coral TPU driver'
|
||||
upgrade_reason = 'A newer Coral driver is available.'
|
||||
reboot_note = ''
|
||||
data = {
|
||||
**common,
|
||||
'variant_label': variant_label,
|
||||
'upgrade_reason': upgrade_reason,
|
||||
'reboot_note': reboot_note,
|
||||
}
|
||||
return 'coral_driver_update_available', data
|
||||
|
||||
# Unknown type — don't notify (keeps the queue clean if a
|
||||
# future detector lands without a corresponding event mapping).
|
||||
return '', {}
|
||||
|
||||
@@ -627,9 +627,19 @@ class BurstAggregator:
|
||||
else:
|
||||
details = '\n'.join(detail_lines)
|
||||
|
||||
# The first event in the bucket was already sent individually on
|
||||
# ingest (see line 547 — "fast alert" path). The burst summary
|
||||
# must therefore describe the *additional* events that arrived
|
||||
# after that initial alert, otherwise the user receives both a
|
||||
# "1 system problem" individual notification AND a "2 system
|
||||
# problems" burst summary that double-counts the first event.
|
||||
# `count` reports the additional count; `total_count` is exposed
|
||||
# for templates that want to show "N more (X total in window)".
|
||||
additional_count = max(len(events) - 1, 1)
|
||||
data = {
|
||||
'hostname': first.data.get('hostname') or _resolve_display_hostname(self._config),
|
||||
'count': str(len(events)),
|
||||
'count': str(additional_count),
|
||||
'total_count': str(len(events)),
|
||||
'window': window_str,
|
||||
'entity_list': entity_list,
|
||||
'event_type': first.event_type,
|
||||
|
||||
@@ -1176,60 +1176,91 @@ TEMPLATES = {
|
||||
'group': 'updates',
|
||||
'default_enabled': True,
|
||||
},
|
||||
|
||||
# Sprint 14.7 follow-up: host-side Coral TPU driver. Mirrors the
|
||||
# NVIDIA flow — there's no in-dashboard "Apply update" button; the
|
||||
# operator reruns the installer from the post-install menu. The
|
||||
# PCIe (gasket-dkms) and USB (libedgetpu1-*) variants share one
|
||||
# template and use {variant_label} to surface which is moving so
|
||||
# the body stays readable in either case.
|
||||
'coral_driver_update_available': {
|
||||
'title': '{hostname}: Coral TPU driver update available — {latest_version}',
|
||||
'body': (
|
||||
'A newer {variant_label} is available.\n'
|
||||
'🔹 Currently installed: {current_version}\n'
|
||||
'🟢 Latest available: {latest_version}\n\n'
|
||||
'{upgrade_reason}\n\n'
|
||||
'💡 To reinstall:\n'
|
||||
' • From the ProxMenux post-install menu: {menu_label}\n\n'
|
||||
'{reboot_note}'
|
||||
),
|
||||
'label': 'Coral TPU driver update available',
|
||||
'group': 'updates',
|
||||
'default_enabled': True,
|
||||
},
|
||||
|
||||
# ── Burst aggregation summaries (hidden -- auto-generated by BurstAggregator) ──
|
||||
# These inherit enabled state from their parent event type at dispatch time.
|
||||
#
|
||||
# IMPORTANT — `{count}` here is the count of *additional* events that
|
||||
# arrived AFTER the first one was already sent individually on the
|
||||
# fast-alert path (see notification_manager.py:_create_summary). It is
|
||||
# NOT the total event count in the window; that lives in `{total_count}`.
|
||||
# The wording must reflect "more / additional" so the user does not
|
||||
# mistake a 2-event burst for a duplicate of the initial individual
|
||||
# notification. The first event has already been delivered when this
|
||||
# summary fires.
|
||||
'burst_auth_fail': {
|
||||
'title': '{hostname}: {count} auth failures in {window}',
|
||||
'body': '{count} authentication failures detected in {window}.\nSources: {entity_list}',
|
||||
'title': '{hostname}: +{count} more auth failures in {window}',
|
||||
'body': '+{count} additional authentication failures detected in {window} ({total_count} total).\nSources: {entity_list}',
|
||||
'label': 'Auth failures burst',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_ip_block': {
|
||||
'title': '{hostname}: Fail2Ban banned {count} IPs in {window}',
|
||||
'body': '{count} IPs banned by Fail2Ban in {window}.\nIPs: {entity_list}',
|
||||
'title': '{hostname}: Fail2Ban banned +{count} more IPs in {window}',
|
||||
'body': '+{count} additional IPs banned by Fail2Ban in {window} ({total_count} total).\nIPs: {entity_list}',
|
||||
'label': 'IP block burst',
|
||||
'group': 'security',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_disk_io': {
|
||||
'title': '{hostname}: {count} disk I/O errors on {entity_list}',
|
||||
'body': '{count} I/O errors detected in {window}.\nDevices: {entity_list}',
|
||||
'title': '{hostname}: +{count} more disk I/O errors on {entity_list}',
|
||||
'body': '+{count} additional I/O errors detected in {window} ({total_count} total).\nDevices: {entity_list}',
|
||||
'label': 'Disk I/O burst',
|
||||
'group': 'storage',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_cluster': {
|
||||
'title': '{hostname}: Cluster flapping detected ({count} changes)',
|
||||
'body': 'Cluster state changed {count} times in {window}.\nNodes: {entity_list}',
|
||||
'title': '{hostname}: Cluster flapping detected (+{count} more changes)',
|
||||
'body': 'Cluster state changed +{count} more times in {window} ({total_count} total).\nNodes: {entity_list}',
|
||||
'label': 'Cluster flapping burst',
|
||||
'group': 'cluster',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_service_fail': {
|
||||
'title': '{hostname}: {count} services failed in {window}',
|
||||
'body': '{count} service failures detected in {window}.\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
|
||||
'title': '{hostname}: +{count} more services failed in {window}',
|
||||
'body': '+{count} additional service failures detected in {window} ({total_count} total).\nThis typically indicates a node reboot or PVE service restart.\n\nAdditional failures:\n{details}',
|
||||
'label': 'Service fail burst',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_system': {
|
||||
'title': '{hostname}: {count} system problems in {window}',
|
||||
'body': '{count} system problems detected in {window}.\n\nAdditional issues:\n{details}',
|
||||
'title': '{hostname}: +{count} more system problems in {window}',
|
||||
'body': '+{count} additional system problems detected in {window} ({total_count} total).\n\nAdditional issues:\n{details}',
|
||||
'label': 'System problems burst',
|
||||
'group': 'services',
|
||||
'default_enabled': True,
|
||||
'hidden': True,
|
||||
},
|
||||
'burst_generic': {
|
||||
'title': '{hostname}: {count} {event_type} events in {window}',
|
||||
'body': '{count} events of type {event_type} in {window}.\n\nAdditional events:\n{details}',
|
||||
'title': '{hostname}: +{count} more {event_type} events in {window}',
|
||||
'body': '+{count} additional events of type {event_type} in {window} ({total_count} total).\n\nAdditional events:\n{details}',
|
||||
'label': 'Generic burst',
|
||||
'group': 'other',
|
||||
'default_enabled': True,
|
||||
@@ -1559,6 +1590,7 @@ EVENT_EMOJI = {
|
||||
'post_install_update': '✨', # sparkles
|
||||
'secure_gateway_update_available': '\U0001F510', # 🔐 closed lock with key
|
||||
'nvidia_driver_update_available': '\U0001F3AE', # 🎮 video game (GPU)
|
||||
'coral_driver_update_available': '\U0001F9E0', # 🧠 brain (TPU/inference)
|
||||
# AI
|
||||
'ai_model_migrated': '\U0001F504', # arrows counterclockwise (refresh/update)
|
||||
# GPU / PCIe
|
||||
|
||||
@@ -83,7 +83,7 @@ PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
|
||||
"category": "disks"
|
||||
},
|
||||
{
|
||||
"pattern": r"ata.*error|ATA.*bus.*error|Emask.*0x|DRDY.*ERR|UNC.*error",
|
||||
"pattern": r"\bata\d.*\berror\b|\bATA\b.*bus.*error|Emask.*0x|DRDY.*ERR|\bUNC\b.*error",
|
||||
"cause": "ATA communication error with disk",
|
||||
"cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
|
||||
"severity": "warning",
|
||||
|
||||
+1
-1
@@ -1 +1 @@
|
||||
1.2.1.1
|
||||
1.2.1.2
|
||||
|
||||
@@ -430,6 +430,181 @@ EOF
|
||||
# ============================================================
|
||||
# Final prompt
|
||||
# ============================================================
|
||||
# ============================================================
|
||||
# Install-state detection (Coral PCIe gasket DKMS / USB libedgetpu)
|
||||
# ============================================================
|
||||
# Sets the following globals so main() can branch into install vs
|
||||
# uninstall like nvidia_installer.sh does. We treat "installed" as
|
||||
# loosely as possible — even a half-installed DKMS or a stale
|
||||
# libedgetpu1-std package counts, because the uninstall path needs
|
||||
# to clean those up too.
|
||||
|
||||
# Install-state globals, filled in by detect_coral_install_state():
#   CORAL_PCIE_INSTALLED       true when any trace of the gasket driver is found
#   CORAL_USB_INSTALLED        true when a libedgetpu1 runtime package is installed
#   CORAL_PCIE_DKMS_VERSION    gasket version parsed from `dkms status` (may be empty)
#   CORAL_USB_RUNTIME_VERSION  dpkg version of the installed runtime (may be empty)
CORAL_PCIE_INSTALLED=false
CORAL_USB_INSTALLED=false
CORAL_PCIE_DKMS_VERSION=""
CORAL_USB_RUNTIME_VERSION=""

# Probe the host for an existing Coral driver/runtime and publish the result
# in the CORAL_* globals above. All four globals are reset first, so the
# function can be called again to refresh the state.
detect_coral_install_state() {
    CORAL_PCIE_INSTALLED=false
    CORAL_USB_INSTALLED=false
    CORAL_PCIE_DKMS_VERSION=""
    CORAL_USB_RUNTIME_VERSION=""

    # ---- PCIe / M.2: three probes, the first hit wins ----

    # Probe 1: a gasket entry registered with DKMS. This is the only probe
    # that also yields a version string.
    if command -v dkms >/dev/null 2>&1; then
        local gasket_entry
        gasket_entry=$(dkms status 2>/dev/null | grep -E '^gasket' | head -n1)
        if [[ -n "$gasket_entry" ]]; then
            CORAL_PCIE_INSTALLED=true
            # `dkms status` output differs between releases, e.g.
            #   "gasket, 1.0, 6.8.12-1-pve, x86_64: installed"
            #   "gasket/1.0, ..."
            # Capture everything between the module name and the next comma,
            # then strip the spaces left over from the ", " form.
            CORAL_PCIE_DKMS_VERSION=$(echo "$gasket_entry" \
                | sed -E 's|^gasket[, /]([^,]+).*|\1|' | tr -d ' ')
        fi
    fi

    # Probe 2: the gasket-dkms Debian package is installed.
    if [[ "$CORAL_PCIE_INSTALLED" != true ]]; then
        if dpkg-query -W -f='${Status}' gasket-dkms 2>/dev/null | grep -q 'ok installed'; then
            CORAL_PCIE_INSTALLED=true
        fi
    fi

    # Probe 3: /dev/apex_* device nodes exist (modules are loaded right now).
    if [[ "$CORAL_PCIE_INSTALLED" != true ]]; then
        if ls /dev/apex_* >/dev/null 2>&1; then
            CORAL_PCIE_INSTALLED=true
        fi
    fi

    # ---- USB: first installed runtime flavour wins (-std before -max) ----
    local runtime_pkg
    for runtime_pkg in libedgetpu1-std libedgetpu1-max; do
        if dpkg-query -W -f='${Status}' "$runtime_pkg" 2>/dev/null | grep -q 'ok installed'; then
            CORAL_USB_INSTALLED=true
            CORAL_USB_RUNTIME_VERSION=$(dpkg-query -W -f='${Version}' \
                "$runtime_pkg" 2>/dev/null)
            break
        fi
    done
}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Action menu (install vs uninstall) — only shown when something
|
||||
# is already installed. Mirrors nvidia_installer.sh::
|
||||
# show_action_menu_if_installed so the UX is consistent across
|
||||
# host driver scripts.
|
||||
# ============================================================
|
||||
show_coral_action_menu_if_installed() {
    # Sets the global ACTION to one of: install | remove | cancel.
    # Reads the CORAL_*_INSTALLED / *_VERSION globals, so
    # detect_coral_install_state must have run first.

    # Nothing installed yet: no menu, go straight to install.
    if [[ "$CORAL_PCIE_INSTALLED" != true && "$CORAL_USB_INSTALLED" != true ]]; then
        ACTION="install"
        return 0
    fi

    # One bullet per detected component; the version is appended in
    # parentheses only when it is known. The "\n" sequences stay literal here
    # and are passed through as part of the menu text.
    local hint=""
    local version_suffix
    if [[ "$CORAL_PCIE_INSTALLED" == true ]]; then
        version_suffix="${CORAL_PCIE_DKMS_VERSION:+ ($CORAL_PCIE_DKMS_VERSION)}"
        hint+=" • $(translate 'PCIe/M.2 gasket-dkms')${version_suffix}\n"
    fi
    if [[ "$CORAL_USB_INSTALLED" == true ]]; then
        version_suffix="${CORAL_USB_RUNTIME_VERSION:+ ($CORAL_USB_RUNTIME_VERSION)}"
        hint+=" • $(translate 'USB libedgetpu1')${version_suffix}\n"
    fi

    # tag/label pairs shared by both menu front-ends.
    local -a entries=(
        "install" "$(translate 'Reinstall / update Coral drivers')"
        "remove"  "$(translate 'Uninstall Coral drivers and configuration')"
    )

    # Use hybrid_menu when it is defined; otherwise fall back to plain
    # dialog. A non-zero exit from either front-end maps to ACTION="cancel".
    if command -v hybrid_menu >/dev/null 2>&1; then
        ACTION=$(hybrid_menu "ProxMenux" \
            "$(translate 'Coral TPU is already installed on this host:')\n\n${hint}\n$(translate 'Choose an action:')" \
            18 80 8 "${entries[@]}") || ACTION="cancel"
    else
        # dialog prints the selection on stderr; swap the streams so the
        # command substitution captures it.
        ACTION=$(dialog --backtitle "ProxMenux" \
            --title "$(translate 'Coral Actions')" \
            --menu "\n$(translate 'Coral TPU is already installed:')\n${hint}\n$(translate 'Choose an action:')" \
            18 80 8 \
            "${entries[@]}" \
            3>&1 1>&2 2>&3) || ACTION="cancel"
    fi
}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# complete_coral_uninstall — full removal of everything the
|
||||
# installer puts on the host. Mirrors complete_nvidia_uninstall.
|
||||
# Idempotent: missing pieces are no-ops, never errors.
|
||||
# ============================================================
|
||||
complete_coral_uninstall() {
    # Remove everything the Coral installer puts on the host: kernel modules,
    # DKMS registrations, Debian packages, udev rules, the apex group and the
    # Coral APT repo/keyring. Takes no arguments; command output is appended
    # to $LOG_FILE. Every step is best-effort, so running it on a partially
    # installed (or already clean) host does not abort.

    # 1) Unload the kernel modules (apex before gasket).
    msg_info "$(translate 'Stopping Coral kernel modules...')"
    modprobe -r apex 2>>"$LOG_FILE" || true
    modprobe -r gasket 2>>"$LOG_FILE" || true
    msg_ok "$(translate 'Coral kernel modules unloaded.')"

    # 2) Drop every gasket version registered with DKMS. The awk separator
    #    handles both "gasket, 1.0, ..." and "gasket/1.0, ..." output forms.
    if command -v dkms >/dev/null 2>&1; then
        local versions
        versions=$(dkms status 2>/dev/null \
            | awk -F'[,/ ]+' '/^gasket/ {print $2}' | sort -u)
        if [[ -n "$versions" ]]; then
            msg_info "$(translate 'Removing gasket DKMS modules...')"
            local v
            while IFS= read -r v; do
                [[ -z "$v" ]] && continue
                dkms remove -m gasket -v "$v" --all >>"$LOG_FILE" 2>&1 || true
            done <<<"$versions"
            msg_ok "$(translate 'gasket DKMS entries removed.')"
        fi
    fi

    # 3) Purge the Debian packages.
    #    apt-get aborts the whole transaction with "Unable to locate package"
    #    if any name is unknown to it. That is the normal case for
    #    libedgetpu1-* on a PCIe-only host, where the Coral repo was never
    #    added. Passing all three names at once would then purge nothing while
    #    the error is swallowed, so only request packages dpkg has a record of.
    msg_info "$(translate 'Removing Coral packages...')"
    local pkg
    local -a purge_list=()
    for pkg in gasket-dkms libedgetpu1-std libedgetpu1-max; do
        if dpkg-query -W "$pkg" >/dev/null 2>&1; then
            purge_list+=("$pkg")
        fi
    done
    if (( ${#purge_list[@]} > 0 )); then
        apt-get -y purge "${purge_list[@]}" >>"$LOG_FILE" 2>&1 || true
    fi
    apt-get -y autoremove --purge >>"$LOG_FILE" 2>&1 || true
    msg_ok "$(translate 'Coral packages purged.')"

    # 4) udev: delete our own rule, and put the packaged gasket rule (if it
    #    is still on disk) back to GROUP="plugdev", because the apex group
    #    may not exist if gasket-dkms is reinstalled later.
    rm -f /etc/udev/rules.d/99-coral-apex.rules
    if [[ -f /usr/lib/udev/rules.d/60-gasket-dkms.rules ]]; then
        sed -i 's/GROUP="apex"/GROUP="plugdev"/g' \
            /usr/lib/udev/rules.d/60-gasket-dkms.rules || true
    fi
    # Guarded like every other step so a udev hiccup cannot stop the cleanup.
    udevadm control --reload-rules 2>>"$LOG_FILE" || true
    udevadm trigger --subsystem-match=apex >/dev/null 2>&1 || true

    # 5) apex system group: delete it only when its member list is empty.
    if getent group apex >/dev/null 2>&1; then
        local apex_members
        apex_members=$(getent group apex | cut -d: -f4)
        if [[ -z "$apex_members" ]]; then
            groupdel apex >>"$LOG_FILE" 2>&1 || true
            msg_ok "$(translate 'apex group removed.')"
        else
            msg_warn "$(translate 'apex group still has members; left in place:') $apex_members"
        fi
    fi

    # 6) Google Coral APT repo + keyring (only added during USB install).
    rm -f /etc/apt/sources.list.d/coral-edgetpu.list \
        /etc/apt/sources.list.d/coral-cloud.list \
        /usr/share/keyrings/coral-edgetpu-archive-keyring.gpg \
        /etc/apt/trusted.gpg.d/coral-edgetpu-archive-keyring.gpg \
        2>/dev/null || true

    # 7) Record the removal when the status helper is available (older
    #    ProxMenux releases do not define it; uninstall must still work).
    if declare -f update_component_status >/dev/null 2>&1; then
        update_component_status "coral_driver" "removed" "" "gpu" '{}'
    fi

    msg_ok "$(translate 'Coral uninstallation completed.')"
}
|
||||
|
||||
|
||||
restart_prompt() {
|
||||
if whiptail --title "$(translate 'Coral TPU Installation')" --yesno \
|
||||
"$(translate 'The installation requires a server restart to apply changes. Do you want to restart now?')" 10 70; then
|
||||
@@ -449,46 +624,95 @@ main() {
|
||||
: >"$LOG_FILE"
|
||||
|
||||
detect_coral_hardware
|
||||
detect_coral_install_state
|
||||
|
||||
# Nothing plugged in — nothing to do.
|
||||
if [[ "$CORAL_PCIE_COUNT" -eq 0 && "$CORAL_USB_COUNT" -eq 0 ]]; then
|
||||
# No hardware AND no leftover install → nothing to do.
|
||||
if [[ "$CORAL_PCIE_COUNT" -eq 0 && "$CORAL_USB_COUNT" -eq 0 ]] \
|
||||
&& ! $CORAL_PCIE_INSTALLED && ! $CORAL_USB_INSTALLED; then
|
||||
no_hardware_dialog
|
||||
exit 0
|
||||
fi
|
||||
|
||||
pre_install_prompt
|
||||
# If something is already installed, offer reinstall/uninstall choice.
|
||||
# Same UX as nvidia_installer.sh. When nothing is installed yet,
|
||||
# ACTION="install" automatically.
|
||||
show_coral_action_menu_if_installed
|
||||
|
||||
show_proxmenux_logo
|
||||
msg_title "$(translate 'Coral TPU Installation')"
|
||||
case "$ACTION" in
|
||||
install)
|
||||
# No hardware but user picked install → bail out, can't install
|
||||
# for nothing. (The earlier "no hardware AND no install" exit
|
||||
# already handles the fully-empty case.)
|
||||
if [[ "$CORAL_PCIE_COUNT" -eq 0 && "$CORAL_USB_COUNT" -eq 0 ]]; then
|
||||
no_hardware_dialog
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Force non-interactive apt/dpkg for the whole run so cleanup_broken_gasket_dkms
|
||||
# and the two install paths never get blocked by package-maintainer prompts.
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
pre_install_prompt
|
||||
|
||||
# Branch 1 — PCIe / M.2 (kernel modules). Runs first so the reboot reminder
|
||||
# at the end only appears when we actually touched kernel modules.
|
||||
if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then
|
||||
msg_info2 "$(translate 'Coral M.2 / PCIe detected — installing gasket and apex kernel modules...')"
|
||||
install_gasket_apex_dkms
|
||||
fi
|
||||
show_proxmenux_logo
|
||||
msg_title "$(translate 'Coral TPU Installation')"
|
||||
|
||||
# Branch 2 — USB (user-space runtime).
|
||||
if [[ "$CORAL_USB_COUNT" -gt 0 ]]; then
|
||||
msg_info2 "$(translate 'Coral USB Accelerator detected — installing Edge TPU runtime...')"
|
||||
install_libedgetpu_runtime
|
||||
fi
|
||||
# Force non-interactive apt/dpkg for the whole run so cleanup_broken_gasket_dkms
|
||||
# and the two install paths never get blocked by package-maintainer prompts.
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
echo
|
||||
if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then
|
||||
msg_success "$(translate 'Coral TPU drivers installed and loaded successfully.')"
|
||||
restart_prompt
|
||||
else
|
||||
# USB-only install. No reboot required; the udev rules and runtime are
|
||||
# already active. Ready to passthrough the device to an LXC/VM.
|
||||
msg_success "$(translate 'Coral USB runtime installed. No reboot required.')"
|
||||
msg_success "$(translate 'Completed. Press Enter to return to menu...')"
|
||||
read -r
|
||||
fi
|
||||
# Branch 1 — PCIe / M.2 (kernel modules). Runs first so the reboot reminder
|
||||
# at the end only appears when we actually touched kernel modules.
|
||||
if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then
|
||||
msg_info2 "$(translate 'Coral M.2 / PCIe detected — installing gasket and apex kernel modules...')"
|
||||
install_gasket_apex_dkms
|
||||
fi
|
||||
|
||||
# Branch 2 — USB (user-space runtime).
|
||||
if [[ "$CORAL_USB_COUNT" -gt 0 ]]; then
|
||||
msg_info2 "$(translate 'Coral USB Accelerator detected — installing Edge TPU runtime...')"
|
||||
install_libedgetpu_runtime
|
||||
fi
|
||||
|
||||
echo
|
||||
if [[ "$CORAL_PCIE_COUNT" -gt 0 ]]; then
|
||||
msg_success "$(translate 'Coral TPU drivers installed and loaded successfully.')"
|
||||
restart_prompt
|
||||
else
|
||||
# USB-only install. No reboot required; the udev rules and runtime are
|
||||
# already active. Ready to passthrough the device to an LXC/VM.
|
||||
msg_success "$(translate 'Coral USB runtime installed. No reboot required.')"
|
||||
msg_success "$(translate 'Completed. Press Enter to return to menu...')"
|
||||
read -r
|
||||
fi
|
||||
;;
|
||||
|
||||
remove)
|
||||
# Confirm before purging — gasket-dkms uninstall is destructive
|
||||
# to LXC containers that have apex passthrough; warn the user.
|
||||
if ! dialog --backtitle "ProxMenux" \
|
||||
--title "$(translate 'Coral TPU Uninstall')" \
|
||||
--yesno "\n$(translate 'This will remove the Coral TPU drivers (gasket DKMS + libedgetpu) and related configuration. Any LXC container with apex passthrough will lose access to /dev/apex_* after reboot. Continue?')" \
|
||||
14 78; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
show_proxmenux_logo
|
||||
msg_title "$(translate 'Coral TPU Uninstall')"
|
||||
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
complete_coral_uninstall
|
||||
|
||||
# PCIe path created kernel modules → a reboot is the cleanest
|
||||
# way to flush them. USB-only uninstall doesn't need one.
|
||||
if $CORAL_PCIE_INSTALLED; then
|
||||
restart_prompt
|
||||
else
|
||||
msg_success "$(translate 'Completed. Press Enter to return to menu...')"
|
||||
read -r
|
||||
fi
|
||||
;;
|
||||
|
||||
cancel|*)
|
||||
exit 0
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
main
|
||||
|
||||
Reference in New Issue
Block a user