Update notification service

This commit is contained in:
MacRimi
2026-03-27 19:40:17 +01:00
parent 6bb9313b95
commit 0edc2cc3af
4 changed files with 64 additions and 5 deletions

View File

@@ -175,7 +175,7 @@ class HealthMonitor:
r'proxmenux-monitor.*failed at step exec',
r'proxmenux-monitor\.appimage',
# ── PVE scheduler operational noise ──
# ─ PVE scheduler operational noise ──
# pvescheduler emits "could not update job state" every minute
# when a scheduled job reference is stale. This is cosmetic,
# not a system problem.
@@ -2118,7 +2118,7 @@ class HealthMonitor:
except Exception:
pass
# ── Record disk observation (always, even if transient) ─
# ── Record disk observation (always, even if transient)
# Signature must be stable across cycles: strip volatile
# data (hex values, counts, timestamps) to dedup properly.
# e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000"

View File

@@ -79,7 +79,7 @@ class _SharedState:
_shared_state = _SharedState()
# ─── Event Object ────────────────────────────────────────────────
# ─── Event Object ────────────────────────────────────────────────
class NotificationEvent:
"""Represents a detected event ready for notification dispatch.
@@ -2154,6 +2154,9 @@ class PollingCollector:
- Journal errors (for AI enrichment)
Emits a single "system_startup" notification with full report data.
IMPORTANT: Only emits if this is a REAL system boot, not a service restart.
Checks system uptime to distinguish between the two cases.
"""
# Wait until health grace period is over (5 min) for complete picture
if startup_grace.is_startup_health_grace():
@@ -2163,6 +2166,14 @@ class PollingCollector:
if startup_grace.was_startup_aggregated():
return
# CRITICAL: Check if this is a real system boot
# If the system was already running for > 10 min when service started,
# this is just a service restart, not a system boot - skip notification
if not startup_grace.is_real_system_boot():
# Mark as aggregated to prevent future checks, but don't send notification
startup_grace.mark_startup_aggregated()
return
# Collect comprehensive startup report
report = startup_grace.collect_startup_report()
@@ -2332,7 +2343,7 @@ class PollingCollector:
for pkg in all_pkgs:
if pkg['name'] in self._IMPORTANT_PKGS and pkg['cur']:
important_lines.append(
f"{pkg['name']} ({pkg['cur']} -> {pkg['new']})"
f"{pkg['name']} ({pkg['cur']} {pkg['new']})"
)
# ── Emit structured update_summary ─────────────────────
@@ -2358,7 +2369,7 @@ class PollingCollector:
'current_version': pve_manager_info['cur'],
'new_version': pve_manager_info['new'],
'version': pve_manager_info['new'],
'details': f"pve-manager {pve_manager_info['cur']} -> {pve_manager_info['new']}",
'details': f"pve-manager {pve_manager_info['cur']} {pve_manager_info['new']}",
}
self._queue.put(NotificationEvent(
'pve_update', 'INFO', pve_data,

View File

@@ -28,6 +28,22 @@ STARTUP_VM_GRACE_SECONDS = 180 # 3 minutes for VM/CT start aggregation
STARTUP_HEALTH_GRACE_SECONDS = 300 # 5 minutes for health warning suppression
SHUTDOWN_GRACE_SECONDS = 120 # 2 minutes for VM/CT stop suppression
# Maximum system uptime to consider this a real server boot (not just service restart)
# If system uptime > this value when service starts, skip startup notification
MAX_BOOT_UPTIME_SECONDS = 600 # 10 minutes - if system was up longer, it's a service restart
def _get_system_uptime() -> float:
"""
Get actual system uptime in seconds from /proc/uptime.
Returns 0 if unable to read (will default to treating as new boot).
"""
try:
with open('/proc/uptime', 'r') as f:
return float(f.readline().split()[0])
except Exception:
return 0
# Categories to suppress during startup grace period
# These categories typically have transient issues during boot
STARTUP_GRACE_CATEGORIES: Set[str] = {
@@ -68,6 +84,11 @@ class _StartupGraceState:
# Startup time = when service started (module load time)
self._startup_time: float = time.time()
# Check if this is a REAL system boot or just a service restart
# by comparing system uptime to our threshold
system_uptime = _get_system_uptime()
self._is_real_boot: bool = system_uptime < MAX_BOOT_UPTIME_SECONDS
# Shutdown tracking
self._shutdown_time: float = 0
@@ -115,6 +136,19 @@ class _StartupGraceState:
return self.is_startup_health_grace()
return False
def is_real_system_boot(self) -> bool:
    """Report whether the service came up as part of a real system boot.

    The answer is the ``_is_real_boot`` flag stored on the instance; it is
    read while holding ``self._lock``.

    Returns:
        True when the flag says the service started during a system boot,
        False when it says the system was already running (a service
        restart). Callers use a False result to skip the
        "System startup completed" notification.
    """
    with self._lock:
        real_boot = self._is_real_boot
    return real_boot
def get_startup_elapsed(self) -> float:
"""Get seconds elapsed since service startup."""
with self._lock:
@@ -230,6 +264,19 @@ def was_startup_aggregated() -> bool:
"""Check if startup aggregation has already been processed."""
return _state.was_startup_aggregated()
def is_real_system_boot() -> bool:
    """Tell whether the service start coincided with a real system boot.

    Module-level convenience wrapper: the answer comes straight from the
    module's ``_state`` object.

    Returns:
        True for a real system boot, False when only the service was
        restarted. Check this before emitting a "System startup completed"
        notification so that restarting the ProxMenux Monitor service does
        not trigger one.
    """
    real_boot = _state.is_real_system_boot()
    return real_boot
# ─── Startup Report Collection ───────────────────────────────────────────────