Update notification service

This commit is contained in:
MacRimi
2026-03-26 20:04:53 +01:00
parent 839a20df97
commit 7c5e7208b9
4 changed files with 530 additions and 276 deletions

View File

@@ -2144,60 +2144,98 @@ class PollingCollector:
self._first_poll_done = True
def _check_startup_aggregation(self):
"""Check if startup period ended and emit aggregated VM/CT start message.
"""Check if startup period ended and emit comprehensive startup report.
During the startup grace period, TaskWatcher collects VM/CT starts instead
of emitting individual notifications. Once the period ends, this method
emits a single aggregated "System startup" notification.
At the end of the health grace period, collects:
- VMs/CTs that started successfully
- VMs/CTs that failed to start
- Service status
- Storage status
- Journal errors (for AI enrichment)
Emits a single "system_startup" notification with full report data.
"""
# Only check once startup period is over
if _shared_state.is_startup_period():
# Wait until health grace period is over (5 min) for complete picture
if startup_grace.is_startup_health_grace():
return
# Only emit once
if _shared_state.was_startup_aggregated():
if startup_grace.was_startup_aggregated():
return
# Get all collected startup VMs/CTs
startup_items = _shared_state.get_and_clear_startup_vms()
if not startup_items:
return
# Collect comprehensive startup report
report = startup_grace.collect_startup_report()
# Count VMs and CTs
vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm']
cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct']
# Generate human-readable summary
summary = startup_grace.format_startup_summary(report)
vm_count = len(vms)
ct_count = len(cts)
total = vm_count + ct_count
# Count totals
vms_ok = len(report.get('vms_started', []))
cts_ok = len(report.get('cts_started', []))
vms_fail = len(report.get('vms_failed', []))
cts_fail = len(report.get('cts_failed', []))
total_ok = vms_ok + cts_ok
total_fail = vms_fail + cts_fail
# Build entity list (max 10 items for readability)
# Build entity list for backwards compatibility
entity_names = []
for vmid, name in (vms + cts)[:10]:
entity_names.append(f'{name} ({vmid})')
if total > 10:
entity_names.append(f'...and {total - 10} more')
for vm in report.get('vms_started', [])[:5]:
entity_names.append(f"{vm['name']} ({vm['vmid']})")
for ct in report.get('cts_started', [])[:5]:
entity_names.append(f"{ct['name']} ({ct['vmid']})")
if total_ok > 10:
entity_names.append(f"...and {total_ok - 10} more")
# Build summary text
parts = []
if vm_count:
parts.append(f'{vm_count} VM{"s" if vm_count != 1 else ""}')
if ct_count:
parts.append(f'{ct_count} CT{"s" if ct_count != 1 else ""}')
summary = ' and '.join(parts) + ' started'
# Determine severity based on issues
has_issues = (
total_fail > 0 or
not report.get('services_ok', True) or
not report.get('storage_ok', True) or
report.get('health_status') in ['CRITICAL', 'WARNING']
)
severity = 'WARNING' if has_issues else 'INFO'
# Build notification data
data = {
'hostname': self._hostname,
'summary': summary,
'vm_count': vm_count,
'ct_count': ct_count,
'total_count': total,
# VM/CT counts (backwards compatible)
'vm_count': vms_ok,
'ct_count': cts_ok,
'total_count': total_ok,
'entity_list': ', '.join(entity_names),
'reason': f'System startup completed: {summary}',
# New: failure counts
'vms_failed_count': vms_fail,
'cts_failed_count': cts_fail,
'total_failed': total_fail,
# New: detailed lists
'vms_started': report.get('vms_started', []),
'cts_started': report.get('cts_started', []),
'vms_failed': report.get('vms_failed', []),
'cts_failed': report.get('cts_failed', []),
# New: system status
'services_ok': report.get('services_ok', True),
'services_failed': report.get('services_failed', []),
'storage_ok': report.get('storage_ok', True),
'storage_unavailable': report.get('storage_unavailable', []),
'health_status': report.get('health_status', 'UNKNOWN'),
'health_issues': report.get('health_issues', []),
# For AI enrichment
'_journal_context': report.get('_journal_context', ''),
# Metadata
'startup_duration_seconds': report.get('startup_duration_seconds', 0),
'has_issues': has_issues,
'reason': summary.split('\n')[0], # First line as reason
}
self._queue.put(NotificationEvent(
'system_startup', 'INFO', data, source='polling',
'system_startup', severity, data, source='polling',
entity='node', entity_id='',
))
@@ -2500,7 +2538,7 @@ class PollingCollector:
except Exception as e:
print(f"[PollingCollector] AI model check failed: {e}")
# ── Persistence helpers ───────────────────────────────────
# ── Persistence helpers ──────────────────────────────<EFBFBD><EFBFBD>─────
def _load_last_notified(self):
"""Load per-error notification timestamps from DB on startup."""