mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-17 17:42:19 +00:00
Update notification service
This commit is contained in:
@@ -90,7 +90,7 @@ cp "$SCRIPT_DIR/hardware_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "
|
||||
cp "$SCRIPT_DIR/proxmox_storage_monitor.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ proxmox_storage_monitor.py not found"
|
||||
cp "$SCRIPT_DIR/flask_script_runner.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_script_runner.py not found"
|
||||
cp "$SCRIPT_DIR/security_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ security_manager.py not found"
|
||||
cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_security_routes.py not found"
|
||||
cp "$SCRIPT_DIR/flask_security_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_security_routes.py not found"
|
||||
cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_manager.py not found"
|
||||
cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found"
|
||||
cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found"
|
||||
|
||||
@@ -33,6 +33,19 @@ except ImportError:
|
||||
# ============================================================================
|
||||
DEBUG_PERF = False
|
||||
|
||||
# Startup grace period: suppress transient issues during boot
|
||||
# This is set when the module loads (service start)
|
||||
_MODULE_START_TIME = time.time()
|
||||
_STARTUP_HEALTH_GRACE_SECONDS = 300 # 5 minutes
|
||||
|
||||
def _is_startup_health_grace() -> bool:
|
||||
"""Check if we're within the startup health grace period (5 min).
|
||||
|
||||
Used to downgrade transient errors (high latency, storage not ready)
|
||||
to INFO level during system boot, preventing false CRITICAL alerts.
|
||||
"""
|
||||
return (time.time() - _MODULE_START_TIME) < _STARTUP_HEALTH_GRACE_SECONDS
|
||||
|
||||
def _perf_log(section: str, elapsed_ms: float):
|
||||
"""Log performance timing for a section. Only logs if DEBUG_PERF is True."""
|
||||
if DEBUG_PERF:
|
||||
@@ -2512,12 +2525,24 @@ class HealthMonitor:
|
||||
return loss_result
|
||||
|
||||
# Evaluate latency thresholds
|
||||
# During startup grace period, downgrade CRITICAL/WARNING to INFO
|
||||
# to avoid false alerts from transient boot-time latency spikes
|
||||
in_grace_period = _is_startup_health_grace()
|
||||
|
||||
if avg_latency > self.NETWORK_LATENCY_CRITICAL:
|
||||
status = 'CRITICAL'
|
||||
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
|
||||
if in_grace_period:
|
||||
status = 'INFO'
|
||||
reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
|
||||
else:
|
||||
status = 'CRITICAL'
|
||||
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_CRITICAL}ms'
|
||||
elif avg_latency > self.NETWORK_LATENCY_WARNING:
|
||||
status = 'WARNING'
|
||||
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
|
||||
if in_grace_period:
|
||||
status = 'INFO'
|
||||
reason = f'Latency {avg_latency:.1f}ms (startup grace, will stabilize)'
|
||||
else:
|
||||
status = 'WARNING'
|
||||
reason = f'Latency {avg_latency:.1f}ms to gateway >{self.NETWORK_LATENCY_WARNING}ms'
|
||||
else:
|
||||
status = 'OK'
|
||||
reason = None
|
||||
|
||||
@@ -221,7 +221,7 @@ def capture_journal_context(keywords: list, lines: int = 30,
|
||||
return ""
|
||||
|
||||
|
||||
# ─── Journal Watcher (Real-time) ─────────────────────────────────
|
||||
# ─── Journal Watcher (Real-time) ─────────────────────────────────
|
||||
|
||||
class JournalWatcher:
|
||||
"""Watches journald in real-time for critical system events.
|
||||
@@ -1640,13 +1640,9 @@ class TaskWatcher:
|
||||
# let PollingCollector emit one "System startup: X VMs, Y CTs started".
|
||||
_STARTUP_EVENTS = {'vm_start', 'ct_start'}
|
||||
if event_type in _STARTUP_EVENTS and not is_error:
|
||||
is_startup = _shared_state.is_startup_period()
|
||||
elapsed = time.time() - _shared_state._startup_time
|
||||
print(f"[TaskWatcher] {event_type} for {vmid}: is_startup_period={is_startup}, elapsed={elapsed:.1f}s")
|
||||
if is_startup:
|
||||
if _shared_state.is_startup_period():
|
||||
vm_type = 'ct' if event_type == 'ct_start' else 'vm'
|
||||
_shared_state.add_startup_vm(vmid, vmname or f'ID {vmid}', vm_type)
|
||||
print(f"[TaskWatcher] Aggregated {event_type} for {vmid}, total pending: {len(_shared_state._startup_vms)}")
|
||||
return
|
||||
|
||||
self._queue.put(NotificationEvent(
|
||||
@@ -2189,16 +2185,11 @@ class PollingCollector:
|
||||
if _shared_state.was_startup_aggregated():
|
||||
return
|
||||
|
||||
print(f"[PollingCollector] Startup period ended, checking for aggregated VMs...")
|
||||
|
||||
# Get all collected startup VMs/CTs
|
||||
startup_items = _shared_state.get_and_clear_startup_vms()
|
||||
if not startup_items:
|
||||
print(f"[PollingCollector] No VMs/CTs collected during startup period")
|
||||
return
|
||||
|
||||
print(f"[PollingCollector] Emitting aggregated startup notification for {len(startup_items)} items")
|
||||
|
||||
# Count VMs and CTs
|
||||
vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm']
|
||||
cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct']
|
||||
@@ -2289,7 +2280,7 @@ class PollingCollector:
|
||||
if total == 0:
|
||||
return
|
||||
|
||||
# ── Parse every Inst line ──────────────────────────────
|
||||
# ── Parse every Inst line ──────────────────────────────
|
||||
all_pkgs: list[dict] = [] # {name, cur, new}
|
||||
security_pkgs: list[dict] = []
|
||||
pve_pkgs: list[dict] = []
|
||||
|
||||
Reference in New Issue
Block a user