Update notification service

2026-06-03 13:54:41 +00:00 · 2026-03-25 22:43:42 +01:00
parent 6da20aab05
commit 8b6755d866
3 changed files with 33 additions and 17 deletions
@@ -90,7 +90,7 @@ cp "$SCRIPT_DIR/hardware_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "
 cp "$SCRIPT_DIR/proxmox_storage_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  proxmox_storage_monitor.py not found"
 cp "$SCRIPT_DIR/flask_script_runner.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  flask_script_runner.py not found"
 cp "$SCRIPT_DIR/security_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  security_manager.py not found"
-cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠���  flask_security_routes.py not found"
+cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  flask_security_routes.py not found"
 cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  notification_manager.py not found"
 cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  notification_channels.py not found"
 cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  notification_templates.py not found"
@@ -33,6 +33,19 @@ except ImportError:
 # ============================================================================
 DEBUG_PERF = False

+# Startup grace period: suppress transient health issues right after start-up.
+# The reference timestamp below is captured when this module loads (service start).
+_MODULE_START_TIME = time.time()
+_STARTUP_HEALTH_GRACE_SECONDS = 300  # 5 minutes
+
+def _is_startup_health_grace() -> bool:
+    """Check if we're within the startup health grace period (5 min).
+    
+    Used to downgrade transient errors (high latency, storage not ready)
+    to INFO level for the first few minutes after the service starts,
+    preventing false CRITICAL/WARNING alerts.
+    """
+    return (time.time() - _MODULE_START_TIME) < _STARTUP_HEALTH_GRACE_SECONDS
+
 def _perf_log(section: str, elapsed_ms: float):
    """Log performance timing for a section. Only logs if DEBUG_PERF is True."""
    if DEBUG_PERF:
@@ -2512,12 +2525,24 @@ class HealthMonitor:
                    return loss_result
                
                # Evaluate latency thresholds
+                # During startup grace period, downgrade CRITICAL/WARNING to INFO
+                # to avoid false alerts from transient boot-time latency spikes
+                in_grace_period = _is_startup_health_grace()
+                
                if avg_latency > self.NETWORK_LATENCY_CRITICAL:
-                    status = 'CRITICAL'
-                    reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
+                    if in_grace_period:
+                        status = 'INFO'
+                        reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
+                    else:
+                        status = 'CRITICAL'
+                        reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
                elif avg_latency > self.NETWORK_LATENCY_WARNING:
-                    status = 'WARNING'
-                    reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
+                    if in_grace_period:
+                        status = 'INFO'
+                        reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
+                    else:
+                        status = 'WARNING'
+                        reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
                else:
                    status = 'OK'
                    reason = None
@@ -221,7 +221,7 @@ def capture_journal_context(keywords: list, lines: int = 30,
        return ""


-# ─── Journal Watcher (Real-time) ─────────────────────────────────
+# ─── Journal Watcher (Real-time) ─────────────────────────────────

 class JournalWatcher:
    """Watches journald in real-time for critical system events.
@@ -1640,13 +1640,9 @@ class TaskWatcher:
        # let PollingCollector emit one "System startup: X VMs, Y CTs started".
        _STARTUP_EVENTS = {'vm_start', 'ct_start'}
        if event_type in _STARTUP_EVENTS and not is_error:
-            is_startup = _shared_state.is_startup_period()
-            elapsed = time.time() - _shared_state._startup_time
-            print(f"[TaskWatcher] {event_type} for {vmid}: is_startup_period={is_startup}, elapsed={elapsed:.1f}s")
-            if is_startup:
+            if _shared_state.is_startup_period():
                vm_type = 'ct' if event_type == 'ct_start' else 'vm'
                _shared_state.add_startup_vm(vmid, vmname or f'ID {vmid}', vm_type)
-                print(f"[TaskWatcher] Aggregated {event_type} for {vmid}, total pending: {len(_shared_state._startup_vms)}")
                return
        
        self._queue.put(NotificationEvent(
@@ -2189,16 +2185,11 @@ class PollingCollector:
        if _shared_state.was_startup_aggregated():
            return
        
-        print(f"[PollingCollector] Startup period ended, checking for aggregated VMs...")
-        
        # Get all collected startup VMs/CTs
        startup_items = _shared_state.get_and_clear_startup_vms()
        if not startup_items:
-            print(f"[PollingCollector] No VMs/CTs collected during startup period")
            return
        
-        print(f"[PollingCollector] Emitting aggregated startup notification for {len(startup_items)} items")
-        
        # Count VMs and CTs
        vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm']
        cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct']
@@ -2289,7 +2280,7 @@ class PollingCollector:
            if total == 0:
                return
            
-            # ── Parse every Inst line ────────────────��─────────────
+            # ── Parse every Inst line ──────────────────────────────
            all_pkgs: list[dict] = []   # {name, cur, new}
            security_pkgs: list[dict] = []
            pve_pkgs: list[dict] = []