Update notification service

2026-06-01 21:14:49 +00:00 · 2026-03-27 19:40:17 +01:00
parent 6bb9313b95
commit 0edc2cc3af
4 changed files with 64 additions and 5 deletions
@@ -175,7 +175,7 @@ class HealthMonitor:
        r'proxmenux-monitor.*failed at step exec',
        r'proxmenux-monitor\.appimage',
        
-        # ─��� PVE scheduler operational noise ──
+        # ── PVE scheduler operational noise ──
        # pvescheduler emits "could not update job state" every minute
        # when a scheduled job reference is stale.  This is cosmetic,
        # not a system problem.
@@ -2118,7 +2118,7 @@ class HealthMonitor:
                            except Exception:
                                pass
                        
-                        # ── Record disk observation (always, even if transient) ���─
+                        # ── Record disk observation (always, even if transient) ──
                        # Signature must be stable across cycles: strip volatile
                        # data (hex values, counts, timestamps) to dedup properly.
                        # e.g. "ata8.00: exception Emask 0x1 SAct 0xc1000000"
@@ -79,7 +79,7 @@ class _SharedState:
 _shared_state = _SharedState()


-# ─── Event Object ──────────────��──────────────────────────────────
+# ─── Event Object ─────────────────────────────────────────────────

 class NotificationEvent:
    """Represents a detected event ready for notification dispatch.
@@ -2154,6 +2154,9 @@ class PollingCollector:
        - Journal errors (for AI enrichment)
        
        Emits a single "system_startup" notification with full report data.
+        
+        IMPORTANT: Only emits if this is a REAL system boot, not a service restart.
+        Checks system uptime to distinguish between the two cases.
        """
        # Wait until health grace period is over (5 min) for complete picture
        if startup_grace.is_startup_health_grace():
@@ -2163,6 +2166,14 @@ class PollingCollector:
        if startup_grace.was_startup_aggregated():
            return
        
+        # CRITICAL: Check if this is a real system boot
+        # If the system had already been up for >= 10 min when the service started,
+        # this is just a service restart, not a system boot - skip notification
+        if not startup_grace.is_real_system_boot():
+            # Mark as aggregated to prevent future checks, but don't send notification
+            startup_grace.mark_startup_aggregated()
+            return
+        
        # Collect comprehensive startup report
        report = startup_grace.collect_startup_report()
        
@@ -2332,7 +2343,7 @@ class PollingCollector:
            for pkg in all_pkgs:
                if pkg['name'] in self._IMPORTANT_PKGS and pkg['cur']:
                    important_lines.append(
-                        f"{pkg['name']} ({pkg['cur']} -> {pkg['new']})"
+                        f"{pkg['name']} ({pkg['cur']} → {pkg['new']})"
                    )
            
            # ── Emit structured update_summary ─────────────────────
@@ -2358,7 +2369,7 @@ class PollingCollector:
                    'current_version': pve_manager_info['cur'],
                    'new_version': pve_manager_info['new'],
                    'version': pve_manager_info['new'],
-                    'details': f"pve-manager {pve_manager_info['cur']} -> {pve_manager_info['new']}",
+                    'details': f"pve-manager {pve_manager_info['cur']} → {pve_manager_info['new']}",
                }
                self._queue.put(NotificationEvent(
                    'pve_update', 'INFO', pve_data,
@@ -28,6 +28,22 @@ STARTUP_VM_GRACE_SECONDS = 180      # 3 minutes for VM/CT start aggregation
 STARTUP_HEALTH_GRACE_SECONDS = 300  # 5 minutes for health warning suppression
 SHUTDOWN_GRACE_SECONDS = 120        # 2 minutes for VM/CT stop suppression

+# Maximum system uptime to consider this a real server boot (not just service restart)
+# If system uptime >= this value when the service starts, skip the startup notification
+MAX_BOOT_UPTIME_SECONDS = 600       # 10 minutes - if system was up longer, it's a service restart
+
+
+def _get_system_uptime() -> float:
+    """
+    Get actual system uptime in seconds from /proc/uptime.
+    Returns 0 if the file cannot be read or parsed, so the caller treats it as a new boot.
+    """
+    try:
+        with open('/proc/uptime', 'r') as f:
+            return float(f.readline().split()[0])
+    except Exception:
+        return 0
+
 # Categories to suppress during startup grace period
 # These categories typically have transient issues during boot
 STARTUP_GRACE_CATEGORIES: Set[str] = {
@@ -68,6 +84,11 @@ class _StartupGraceState:
        # Startup time = when service started (module load time)
        self._startup_time: float = time.time()
        
+        # Check if this is a REAL system boot or just a service restart
+        # by comparing system uptime to our threshold
+        system_uptime = _get_system_uptime()
+        self._is_real_boot: bool = system_uptime < MAX_BOOT_UPTIME_SECONDS
+        
        # Shutdown tracking
        self._shutdown_time: float = 0
        
@@ -115,6 +136,19 @@ class _StartupGraceState:
            return self.is_startup_health_grace()
        return False
    
+    def is_real_system_boot(self) -> bool:
+        """
+        Check if the service started during a real system boot.
+        
+        Returns False if the system had already been up for 10 minutes or more
+        when the service started (indicates a service restart, not a system boot).
+        
+        This prevents sending "System startup completed" notifications when
+        just restarting the ProxMenux Monitor service.
+        """
+        with self._lock:
+            return self._is_real_boot
+    
    def get_startup_elapsed(self) -> float:
        """Get seconds elapsed since service startup."""
        with self._lock:
@@ -230,6 +264,19 @@ def was_startup_aggregated() -> bool:
    """Check if startup aggregation has already been processed."""
    return _state.was_startup_aggregated()

+def is_real_system_boot() -> bool:
+    """
+    Check if this is a real system boot (not just a service restart).
+    
+    Returns True if the system uptime was less than 10 minutes when the
+    service started. Returns False if the system was already running
+    longer (indicates the service was restarted, not the whole system).
+    
+    Use this to prevent sending "System startup completed" notifications
+    when just restarting the ProxMenux Monitor service.
+    """
+    return _state.is_real_system_boot()
+

 # ─── Startup Report Collection ───────────────────────────────────────────────