Update notification service

This commit is contained in:
MacRimi
2026-03-03 13:40:46 +01:00
parent f0b8ed20a2
commit da3f99a254
5 changed files with 475 additions and 141 deletions

View File

@@ -575,13 +575,31 @@ def _temperature_collector_loop():
def _health_collector_loop():
"""Background thread: run full health checks every 5 minutes.
Keeps the health cache always fresh and records events/errors in the DB
so the future notification service can consume them."""
Keeps the health cache always fresh and records events/errors in the DB.
Also emits notifications when a health category degrades (OK -> WARNING/CRITICAL)."""
from health_monitor import health_monitor
# Wait 30s after startup to let other services initialize
time.sleep(30)
# Track previous status per category to detect transitions
_prev_statuses = {}
# Severity ranking for comparison
_SEV_RANK = {'OK': 0, 'INFO': 0, 'UNKNOWN': 1, 'WARNING': 2, 'CRITICAL': 3}
# Human-readable category names
_CAT_NAMES = {
'cpu': 'CPU Usage & Temperature',
'memory': 'Memory & Swap',
'storage': 'Storage Mounts & Space',
'disks': 'Disk I/O & Errors',
'network': 'Network Interfaces',
'vms': 'VMs & Containers',
'services': 'PVE Services',
'logs': 'System Logs',
'updates': 'System Updates',
'security': 'Security',
}
while True:
try:
# Run full health check (results get cached internally + recorded in DB)
@@ -598,6 +616,64 @@ def _health_collector_loop():
health_monitor.cached_results['_bg_detailed'] = result
health_monitor.last_check_times['_bg_overall'] = time.time()
health_monitor.last_check_times['_bg_detailed'] = time.time()
# ── Health degradation notifications ──
# Compare each category's current status to previous cycle.
# Notify when a category DEGRADES (OK->WARNING, WARNING->CRITICAL, etc.)
# Include the detailed 'reason' so the user knows exactly what triggered it.
details = result.get('details', {})
degraded = []
for cat_key, cat_data in details.items():
cur_status = cat_data.get('status', 'OK')
prev_status = _prev_statuses.get(cat_key, 'OK')
cur_rank = _SEV_RANK.get(cur_status, 0)
prev_rank = _SEV_RANK.get(prev_status, 0)
if cur_rank > prev_rank and cur_rank >= 2: # WARNING or CRITICAL
reason = cat_data.get('reason', f'{cat_key} status changed to {cur_status}')
cat_name = _CAT_NAMES.get(cat_key, cat_key)
degraded.append({
'category': cat_name,
'status': cur_status,
'reason': reason,
})
_prev_statuses[cat_key] = cur_status
# Send grouped notification if any categories degraded
if degraded and notification_manager._enabled:
hostname = result.get('hostname', '')
if not hostname:
import socket as _sock
hostname = _sock.gethostname()
if len(degraded) == 1:
d = degraded[0]
title = f"{hostname}: Health {d['status']} - {d['category']}"
body = d['reason']
severity = d['status']
else:
# Multiple categories degraded at once -- group them
max_sev = max(degraded, key=lambda x: _SEV_RANK.get(x['status'], 0))['status']
title = f"{hostname}: {len(degraded)} health checks degraded"
lines = []
for d in degraded:
lines.append(f" [{d['status']}] {d['category']}: {d['reason']}")
body = '\n'.join(lines)
severity = max_sev
try:
notification_manager.send_notification(
event_type='health_degraded',
severity=severity,
title=title,
message=body,
data={'hostname': hostname, 'count': str(len(degraded))},
source='health_monitor',
)
except Exception as e:
print(f"[ProxMenux] Health notification error: {e}")
except Exception as e:
print(f"[ProxMenux] Health collector error: {e}")