Update notification service

This commit is contained in:
MacRimi
2026-03-25 22:43:42 +01:00
parent 6da20aab05
commit 8b6755d866
3 changed files with 33 additions and 17 deletions

View File

@@ -90,7 +90,7 @@ cp "$SCRIPT_DIR/hardware_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "
cp "$SCRIPT_DIR/proxmox_storage_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ proxmox_storage_monitor.py not found"
cp "$SCRIPT_DIR/flask_script_runner.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_script_runner.py not found"
cp "$SCRIPT_DIR/security_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ security_manager.py not found"
cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠<EFBFBD><EFBFBD><EFBFBD> flask_security_routes.py not found"
cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠ flask_security_routes.py not found"
cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_manager.py not found"
cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found"
cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found"

View File

@@ -33,6 +33,19 @@ except ImportError:
# ============================================================================
DEBUG_PERF = False
# Startup grace period: suppress transient issues during boot.
# Both timestamps are captured when the module loads (service start).
# _MODULE_START_TIME is the wall-clock timestamp and is kept unchanged for any
# code that reads it; _MODULE_START_MONOTONIC is the reference the grace check
# measures against, because the wall clock may be stepped while booting.
_MODULE_START_TIME = time.time()
_MODULE_START_MONOTONIC = time.monotonic()
_STARTUP_HEALTH_GRACE_SECONDS = 300  # 5 minutes


def _is_startup_health_grace() -> bool:
    """Check if we're within the startup health grace period (5 min).

    Used to downgrade transient errors (high latency, storage not ready)
    to INFO level during system boot, preventing false CRITICAL alerts.

    Elapsed time is measured with the monotonic clock, so a wall-clock
    adjustment after service start (e.g. a time sync stepping the clock)
    can neither end the grace period early nor extend it.

    Returns:
        True while fewer than ``_STARTUP_HEALTH_GRACE_SECONDS`` seconds have
        elapsed since this module was loaded, False afterwards.
    """
    elapsed = time.monotonic() - _MODULE_START_MONOTONIC
    return elapsed < _STARTUP_HEALTH_GRACE_SECONDS
def _perf_log(section: str, elapsed_ms: float):
"""Log performance timing for a section. Only logs if DEBUG_PERF is True."""
if DEBUG_PERF:
@@ -2512,12 +2525,24 @@ class HealthMonitor:
return loss_result
# Evaluate latency thresholds
# During startup grace period, downgrade CRITICAL/WARNING to INFO
# to avoid false alerts from transient boot-time latency spikes
in_grace_period = _is_startup_health_grace()
if avg_latency > self.NETWORK_LATENCY_CRITICAL:
status = 'CRITICAL'
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
if in_grace_period:
status = 'INFO'
reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
else:
status = 'CRITICAL'
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
elif avg_latency > self.NETWORK_LATENCY_WARNING:
status = 'WARNING'
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
if in_grace_period:
status = 'INFO'
reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
else:
status = 'WARNING'
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
else:
status = 'OK'
reason = None

View File

@@ -221,7 +221,7 @@ def capture_journal_context(keywords: list, lines: int = 30,
return ""
# ─── Journal Watcher (Real-time) ────────────────────────────────
# ─── Journal Watcher (Real-time) ────────────────────────────────
class JournalWatcher:
"""Watches journald in real-time for critical system events.
@@ -1640,13 +1640,9 @@ class TaskWatcher:
# let PollingCollector emit one "System startup: X VMs, Y CTs started".
_STARTUP_EVENTS = {'vm_start', 'ct_start'}
if event_type in _STARTUP_EVENTS and not is_error:
is_startup = _shared_state.is_startup_period()
elapsed = time.time() - _shared_state._startup_time
print(f"[TaskWatcher] {event_type} for {vmid}: is_startup_period={is_startup}, elapsed={elapsed:.1f}s")
if is_startup:
if _shared_state.is_startup_period():
vm_type = 'ct' if event_type == 'ct_start' else 'vm'
_shared_state.add_startup_vm(vmid, vmname or f'ID {vmid}', vm_type)
print(f"[TaskWatcher] Aggregated {event_type} for {vmid}, total pending: {len(_shared_state._startup_vms)}")
return
self._queue.put(NotificationEvent(
@@ -2189,16 +2185,11 @@ class PollingCollector:
if _shared_state.was_startup_aggregated():
return
print(f"[PollingCollector] Startup period ended, checking for aggregated VMs...")
# Get all collected startup VMs/CTs
startup_items = _shared_state.get_and_clear_startup_vms()
if not startup_items:
print(f"[PollingCollector] No VMs/CTs collected during startup period")
return
print(f"[PollingCollector] Emitting aggregated startup notification for {len(startup_items)} items")
# Count VMs and CTs
vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm']
cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct']
@@ -2289,7 +2280,7 @@ class PollingCollector:
if total == 0:
return
# ── Parse every Inst line ─────────────────────────────
# ── Parse every Inst line ─────────────────────────────
all_pkgs: list[dict] = [] # {name, cur, new}
security_pkgs: list[dict] = []
pve_pkgs: list[dict] = []