diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index 70a81170..77c7a631 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -1093,7 +1093,7 @@ class HealthPersistence: conn.commit() conn.close() - # ─── System Capabilities Cache ─────────────────────────────── + # ─── System Capabilities Cache ─────────────────────────────── def get_capability(self, cap_key: str) -> Optional[str]: """ diff --git a/AppImage/scripts/notification_events.py b/AppImage/scripts/notification_events.py index 8962d86b..642eb9e7 100644 --- a/AppImage/scripts/notification_events.py +++ b/AppImage/scripts/notification_events.py @@ -2144,60 +2144,98 @@ class PollingCollector: self._first_poll_done = True def _check_startup_aggregation(self): - """Check if startup period ended and emit aggregated VM/CT start message. + """Check if startup period ended and emit comprehensive startup report. - During the startup grace period, TaskWatcher collects VM/CT starts instead - of emitting individual notifications. Once the period ends, this method - emits a single aggregated "System startup" notification. + At the end of the health grace period, collects: + - VMs/CTs that started successfully + - VMs/CTs that failed to start + - Service status + - Storage status + - Journal errors (for AI enrichment) + + Emits a single "system_startup" notification with full report data. 
""" - # Only check once startup period is over - if _shared_state.is_startup_period(): + # Wait until health grace period is over (5 min) for complete picture + if startup_grace.is_startup_health_grace(): return # Only emit once - if _shared_state.was_startup_aggregated(): + if startup_grace.was_startup_aggregated(): return - # Get all collected startup VMs/CTs - startup_items = _shared_state.get_and_clear_startup_vms() - if not startup_items: - return + # Collect comprehensive startup report + report = startup_grace.collect_startup_report() - # Count VMs and CTs - vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm'] - cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct'] + # Generate human-readable summary + summary = startup_grace.format_startup_summary(report) - vm_count = len(vms) - ct_count = len(cts) - total = vm_count + ct_count + # Count totals + vms_ok = len(report.get('vms_started', [])) + cts_ok = len(report.get('cts_started', [])) + vms_fail = len(report.get('vms_failed', [])) + cts_fail = len(report.get('cts_failed', [])) + total_ok = vms_ok + cts_ok + total_fail = vms_fail + cts_fail - # Build entity list (max 10 items for readability) + # Build entity list for backwards compatibility entity_names = [] - for vmid, name in (vms + cts)[:10]: - entity_names.append(f'{name} ({vmid})') - if total > 10: - entity_names.append(f'...and {total - 10} more') + for vm in report.get('vms_started', [])[:5]: + entity_names.append(f"{vm['name']} ({vm['vmid']})") + for ct in report.get('cts_started', [])[:5]: + entity_names.append(f"{ct['name']} ({ct['vmid']})") + if total_ok > 10: + entity_names.append(f"...and {total_ok - 10} more") - # Build summary text - parts = [] - if vm_count: - parts.append(f'{vm_count} VM{"s" if vm_count != 1 else ""}') - if ct_count: - parts.append(f'{ct_count} CT{"s" if ct_count != 1 else ""}') - summary = ' and '.join(parts) + ' started' + # Determine severity based on issues + has_issues = ( 
+ total_fail > 0 or + not report.get('services_ok', True) or + not report.get('storage_ok', True) or + report.get('health_status') in ['CRITICAL', 'WARNING'] + ) + severity = 'WARNING' if has_issues else 'INFO' + # Build notification data data = { 'hostname': self._hostname, 'summary': summary, - 'vm_count': vm_count, - 'ct_count': ct_count, - 'total_count': total, + + # VM/CT counts (backwards compatible) + 'vm_count': vms_ok, + 'ct_count': cts_ok, + 'total_count': total_ok, 'entity_list': ', '.join(entity_names), - 'reason': f'System startup completed: {summary}', + + # New: failure counts + 'vms_failed_count': vms_fail, + 'cts_failed_count': cts_fail, + 'total_failed': total_fail, + + # New: detailed lists + 'vms_started': report.get('vms_started', []), + 'cts_started': report.get('cts_started', []), + 'vms_failed': report.get('vms_failed', []), + 'cts_failed': report.get('cts_failed', []), + + # New: system status + 'services_ok': report.get('services_ok', True), + 'services_failed': report.get('services_failed', []), + 'storage_ok': report.get('storage_ok', True), + 'storage_unavailable': report.get('storage_unavailable', []), + 'health_status': report.get('health_status', 'UNKNOWN'), + 'health_issues': report.get('health_issues', []), + + # For AI enrichment + '_journal_context': report.get('_journal_context', ''), + + # Metadata + 'startup_duration_seconds': report.get('startup_duration_seconds', 0), + 'has_issues': has_issues, + 'reason': summary.split('\n')[0], # First line as reason } self._queue.put(NotificationEvent( - 'system_startup', 'INFO', data, source='polling', + 'system_startup', severity, data, source='polling', entity='node', entity_id='', )) @@ -2500,7 +2538,7 @@ class PollingCollector: except Exception as e: print(f"[PollingCollector] AI model check failed: {e}") - # ── Persistence helpers ──────────────────────────────────── + # ── Persistence helpers ──────────────────────────────────── def _load_last_notified(self): """Load per-error 
notification timestamps from DB on startup.""" diff --git a/AppImage/scripts/notification_templates.py b/AppImage/scripts/notification_templates.py index 897556f0..57ee75b8 100644 --- a/AppImage/scripts/notification_templates.py +++ b/AppImage/scripts/notification_templates.py @@ -17,7 +17,7 @@ import socket import time import urllib.request import urllib.error -from typing import Dict, Any, Optional, List +from typing import Dict, Any, Optional, List, Tuple # ─── vzdump message parser ─────────────────────────────────────── @@ -314,6 +314,90 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str: return '\n'.join(parts) +def _format_system_startup(data: Dict[str, Any]) -> Tuple[str, str]: + """ + Format comprehensive system startup report. + + Returns (title, body) tuple for the notification. + Handles both simple startups (all OK) and those with issues. + """ + hostname = data.get('hostname', 'unknown') + has_issues = data.get('has_issues', False) + + # Build title + if has_issues: + total_issues = ( + data.get('total_failed', 0) + + len(data.get('services_failed', [])) + + len(data.get('storage_unavailable', [])) + ) + title = f"{hostname}: System startup - {total_issues} issue(s) detected" + else: + title = f"{hostname}: System startup completed" + + # Build body + parts = [] + + # Overall status + if not has_issues: + parts.append("All systems operational.") + + # VMs/CTs started + vms_ok = len(data.get('vms_started', [])) + cts_ok = len(data.get('cts_started', [])) + if vms_ok or cts_ok: + count_parts = [] + if vms_ok: + count_parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}") + if cts_ok: + count_parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}") + + # List names (up to 5) + names = [] + for vm in data.get('vms_started', [])[:3]: + names.append(f"{vm['name']} ({vm['vmid']})") + for ct in data.get('cts_started', [])[:3]: + names.append(f"{ct['name']} ({ct['vmid']})") + + line = f"\u2705 {' and '.join(count_parts)} started" + if 
names: + if len(names) <= 5: + line += f": {', '.join(names)}" + else: + line += f": {', '.join(names[:5])}..." + parts.append(line) + + # Failed VMs/CTs + for vm in data.get('vms_failed', []): + reason = vm.get('reason', 'unknown error') + parts.append(f"\u274C VM failed: {vm['name']} - {reason}") + + for ct in data.get('cts_failed', []): + reason = ct.get('reason', 'unknown error') + parts.append(f"\u274C CT failed: {ct['name']} - {reason}") + + # Storage issues + storage_unavailable = data.get('storage_unavailable', []) + if storage_unavailable: + names = [s['name'] for s in storage_unavailable[:3]] + parts.append(f"\u26A0\uFE0F Storage: {len(storage_unavailable)} unavailable ({', '.join(names)})") + + # Service issues + services_failed = data.get('services_failed', []) + if services_failed: + names = [s['name'] for s in services_failed[:3]] + parts.append(f"\u26A0\uFE0F Services: {len(services_failed)} failed ({', '.join(names)})") + + # Startup duration + duration = data.get('startup_duration_seconds', 0) + if duration: + minutes = int(duration // 60) + parts.append(f"\u23F1\uFE0F Startup completed in {minutes} min") + + body = '\n'.join(parts) + return title, body + + # ─── Severity Icons ────────────────────────────────────────────── SEVERITY_ICONS = { @@ -645,11 +729,12 @@ TEMPLATES = { # ── Services events ── 'system_startup': { - 'title': '{hostname}: System startup — {summary}', - 'body': 'System startup completed.\n{summary}\n\nGuests: {entity_list}', - 'label': 'System startup', + 'title': '{hostname}: {reason}', + 'body': '{summary}', + 'label': 'System startup report', 'group': 'services', 'default_enabled': True, + 'formatter': '_format_system_startup', }, 'system_shutdown': { 'title': '{hostname}: System shutting down', @@ -959,7 +1044,19 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]: pve_message = data.get('pve_message', '') pve_title = data.get('pve_title', '') - if event_type in ('backup_complete', 
'backup_fail') and pve_message: + # Check for custom formatter function + formatter_name = template.get('formatter') + if formatter_name and formatter_name in globals(): + formatter_func = globals()[formatter_name] + try: + title, body_text = formatter_func(data) + except Exception: + # Fallback to standard formatting if formatter fails + try: + body_text = template['body'].format(**variables) + except (KeyError, ValueError): + body_text = template['body'] + elif event_type in ('backup_complete', 'backup_fail') and pve_message: parsed = _parse_vzdump_message(pve_message) if parsed: is_success = (event_type == 'backup_complete') @@ -1288,134 +1385,165 @@ AI_DETAIL_TOKENS = { # System prompt template - informative, no recommendations AI_SYSTEM_PROMPT = """You are a system notification formatter for ProxMenux Monitor, a Proxmox VE monitoring tool. -Your task is to translate and reformat incoming server alert messages into {language}. +Your task is to translate and lightly reformat incoming server alert messages into {language}. + +═══ CORE ROLE ═══ +You are a formatter, not an analyst. +Translate, clean, and present the message clearly. +Do NOT reinterpret the event, do NOT add meaning, and do NOT rebuild the message from scratch. ═══ ABSOLUTE RULES ═══ - 1. Translate BOTH title and body to {language}. Every word, label, and unit must be in {language}. - 2. NO markdown: no **bold**, no *italic*, no `code`, no headers (#), no bullet lists (- or *) - 3. Plain text only — the output is sent to chat apps and email which handle their own formatting - 4. Tone: factual, concise, technical. No greetings, no closings, no apologies - 5. DO NOT add recommendations, action items, or suggestions ("you should…", "consider…") - 6. Present ONLY the facts already in the input — do not invent or assume information - 7. OUTPUT ONLY THE FINAL RESULT — never include both original and processed versions. 
- Do NOT append "Original message:", "Original:", "Source:", or any before/after comparison. - Return ONLY the single, final formatted message in {language}. - 8. PLAIN NARRATIVE LINES — if a line in the input is a complete sentence (not a "Label: value" - pair), translate it as-is. Never prepend "Message:", "Note:", or any other label to a sentence. - 9. Detail level to apply: {detail_level} - - brief → 2-3 lines, essential data only (status + key metric) - - standard → short paragraph covering who/what/where and the key value - - detailed → full technical breakdown of all available fields - 10. Keep the "hostname: " prefix in the title. Translate only the descriptive part. - Example: "pve01: Updates available" → "pve01: Actualizaciones disponibles" - 11. EMPTY LIST VALUES — if a list field is empty, "none", or "0": - Always write the translated word for "none" on the line after the label, never leave it blank. - Example: 🗂️ Important packages:\\n• none - Example (Spanish): 🗂️ Paquetes importantes:\\n• ninguno - Example (Français): 🗂️ Paquets importants:\\n• aucun - 12. DEDUPLICATION — input may contain redundant or repeated information from multiple monitoring sources: - - Identify and merge duplicate facts (same device, same error, same metric mentioned twice) - - Present each unique fact exactly once in a clear, consolidated form - - If the same data appears in different formats, choose the most informative version - 13. PROXMOX CONTEXT — silently translate Proxmox technical references into plain language. - Never explain what the term means — just use the human-readable equivalent directly. +1. Translate BOTH title and body into {language}. 
- Service / process name mapping (replace the raw name with the friendly form): - - "pve-container@XXXX.service" → "Container CT XXXX" - - "qemu-server@XXXX.service" → "Virtual Machine VM XXXX" - - "pvesr-XXXX" → "storage replication job for XXXX" - - "vzdump" → "backup process" - - "pveproxy" → "Proxmox web proxy" - - "pvedaemon" → "Proxmox daemon" - - "pvestatd" → "Proxmox statistics service" - - "pvescheduler" → "Proxmox task scheduler" - - "pve-cluster" → "Proxmox cluster service" - - "corosync" → "cluster communication service" - - "ceph-osd@N" → "Ceph storage disk N" - - "ceph-mon" → "Ceph monitor service" +2. Translate human-readable text only. + Do NOT translate: + - hostnames + - device paths (/dev/sdX, /dev/nvmeXnX) + - filesystem paths + - IDs, VMIDs, CTIDs, UUIDs + - timestamps, dates, archive names, PBS paths + - version numbers + - technical units (B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, %, ms, s) - systemd message patterns (rewrite the whole phrase, not just the service name): - - "systemd[1]: pve-container@9000.service: Failed" - → "Container CT 9000 service failed" - - "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'" - → "Virtual Machine VM 100 failed to start" - - "systemd[1]: Started pve-container@9000.service" - → "Container CT 9000 started" +3. Plain text only. + No markdown: no **bold**, no *italic*, no `code`, no headers (#), no markdown lists (- or *). + The bullet character "•" is allowed only where explicitly required. - ATA / SMART / kernel error patterns (replace raw kernel log with plain description): - - "ata8.00: exception Emask 0x1 SAct 0x4ce0 SErr 0x40000 action 0x0" - → "ATA controller error on port 8" - - "blk_update_request: I/O error, dev sdX, sector NNNN" - → "I/O error on disk /dev/sdX at sector NNNN" - - "SCSI error: return code = 0x08000002" - → "SCSI communication error" +4. Tone: factual, concise, technical. + No greetings, no closings, no apologies, no conversational filler. + +5. 
Do NOT add recommendations, action items, remediation, or suggestions. + +6. Present ONLY the facts already present in the input. + Do NOT invent, assume, explain, soften, or escalate anything. + +7. Do NOT change severity or status meaning. + For example: + - "failed" must stay a failure + - "warning" must stay a warning + - "degraded" must stay degraded + +8. Preserve structure whenever possible. + Keep the same fields, lines, and data already present in the input. + Do NOT remove important lines such as storage, archive path, totals, durations, target node, reason, or summaries. + +9. Reordering must be minimal. + Only reorder lines if it clearly improves readability without changing meaning. + +10. PLAIN NARRATIVE LINES: + If a line is already a complete sentence, translate it as a sentence. + Do NOT prepend labels like "Message:", "Note:", or "Details:" unless they already exist in the input. + +11. Detail level to apply: {detail_level} + - brief → compact output, keep only essential lines, but never remove critical facts + - standard → preserve structure with moderate cleanup + - detailed → preserve all available technical details + +12. DEDUPLICATION: + Remove ONLY exact duplicates or obviously duplicated repeated lines. + Do NOT merge distinct facts just because they look similar. + Do NOT summarize multiple separate events into one. + +13. Keep the "hostname: " prefix in the title. + Translate only the descriptive part. + Example: "pve01: Updates available" → "pve01: Actualizaciones disponibles" + +14. EMPTY VALUES: + If a list field is empty, "none", "0", or equivalent, write the translated word for "none". + Never leave a declared field blank. + +15. UNKNOWN INPUT: + If the message format is unfamiliar, preserve it as closely as possible and translate faithfully. + Do NOT force it into another template. + +═══ PROXMOX CONTEXT ═══ +Silently replace raw Proxmox technical references with the clearer forms below. +Do NOT explain them. 
Just use the friendly equivalent directly. + +Service / process mappings: +- "pve-container@XXXX.service" → "Container CT XXXX" +- "qemu-server@XXXX.service" → "Virtual Machine VM XXXX" +- "pvesr-XXXX" → "storage replication job for XXXX" +- "vzdump" → "backup process" +- "pveproxy" → "Proxmox web proxy" +- "pvedaemon" → "Proxmox daemon" +- "pvestatd" → "Proxmox statistics service" +- "pvescheduler" → "Proxmox task scheduler" +- "pve-cluster" → "Proxmox cluster service" +- "corosync" → "cluster communication service" +- "ceph-osd@N" → "Ceph storage disk N" +- "ceph-mon" → "Ceph monitor service" + +Systemd-style patterns: +- "systemd[1]: pve-container@9000.service: Failed" + → "Container CT 9000 service failed" +- "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'" + → "Virtual Machine VM 100 failed to start" +- "systemd[1]: Started pve-container@9000.service" + → "Container CT 9000 started" + +Kernel / storage patterns: +- "ata8.00: exception Emask ..." + → "ATA controller error on port 8" +- "blk_update_request: I/O error, dev sdX, sector NNNN" + → "I/O error on disk /dev/sdX at sector NNNN" +- "SCSI error: return code = 0x08000002" + → "SCSI communication error" + +Apply these mappings in titles, field values, and body text when the raw technical string appears. - Apply these mappings everywhere: in the body narrative, in field values, and when - the raw technical string appears inside a longer sentence. {emoji_instructions} -═══ MESSAGE TYPES — FORMAT RULES ═══ +═══ MESSAGE-TYPE GUIDANCE ═══ BACKUP (backup_complete / backup_fail / backup_start): - Input contains: VM/CT names, IDs, size, duration, storage location, status per VM - Output body: first line is plain text (no emoji) describing the event briefly. - Then list each VM/CT with its fields. End with a summary line. 
- PARTIAL FAILURE RULE: if some VMs succeeded and at least one failed, use a combined title - like "Backup partially failed" / "Copia de seguridad parcialmente fallida" — never say - "backup failed" when there are also successful VMs in the same job. - NEVER omit the storage/archive line or the summary line — always include them even for long jobs. +- Preserve per-VM / per-CT detail if present. +- Preserve size, duration, storage/archive path, and final summary if present. +- If both successes and failures are present in the same backup job, use a title equivalent to "Backup partially failed". +- Do NOT collapse multi-guest backup results into a single generic sentence. UPDATES (update_summary): - - Each count on its own line with its label. - - Package list uses "• " (bullet + space) per package, NOT the 🗂️ emoji on each line. - - The 🗂️ emoji goes only on the "Important packages:" header line. - - NEVER add a redundant summary line repeating the total count. - -PVE UPDATE (pve_update): - - First line: plain sentence announcing the new version (no emoji on this line). - - Blank line after intro. - - Current version: 🔹 prefix | New version: 🟢 prefix - - Blank line before packages block. - - Packages header: 🗂️ | Package lines: 📌 prefix with version arrow v{{old}} ➜ v{{new}} +- Keep each count on its own line. +- Keep the important packages block if present. +- Use "• " for package items. +- Do NOT add a redundant summary line repeating totals already shown. -DISK / SMART ERRORS (disk_io_error / storage_unavailable): - Input contains: device name, error type, SMART values or I/O error codes - Output body: device, then the specific error or failing attribute - DEDUPLICATION: Input may contain repeated or similar information from multiple sources. - If you see the same device, error count, or technical details mentioned multiple times, - consolidate them into a single, clear statement. Never repeat the same information twice. 
+PVE UPDATE (pve_update): +- Preserve current version, new version, and package list if present. +- Keep the announcement concise. + +DISK / SMART / STORAGE (disk_io_error / storage_unavailable): +- Preserve device, specific error, failing attribute, and counts if present. +- Do NOT repeat the same disk fact twice. RESOURCES (cpu_high / ram_high / temp_high / load_high): - Input contains: current value, threshold, core count - Output: current value vs threshold, context if available +- Preserve current value, threshold, and context if present. SECURITY (auth_fail / ip_block): - Input contains: source IP, user, service, jail, failure count - Output: list each field on its own line +- Keep source IP, user, service, jail, and failure count on separate clear lines if present. -VM/CT LIFECYCLE (vm_start, vm_stop, vm_fail, ct_*, migration_*, replication_*): - Input contains: VM name, ID, target node (migrations), reason (failures) - Output: one or two lines confirming the event with key facts +VM / CT LIFECYCLE (vm_*, ct_*, migration_*, replication_*): +- Keep name, ID, state, reason, and target node if present. +- Keep lifecycle messages compact unless detail_level is detailed. -CLUSTER (split_brain / node_disconnect / node_reconnect): - Input: node name, quorum status - Output: state change + quorum value +CLUSTER / HEALTH: +- Preserve node name, quorum, category, severity, duration, and reason if present. 
-HEALTH (new_error / error_resolved / health_persistent / health_degraded): - Input: category, severity, duration, reason - Output: what changed, in which category, for how long (if resolved) - -CRITICAL: -- [TITLE] on its own line, title text on the very next line — no blank line between them -- [BODY] on its own line, body text starting on the very next line — no blank line between them -- Do NOT write "Title:", "Body:", or any label substituting the markers -- Do NOT include the literal words TITLE or BODY anywhere in the translated content - -═══ OUTPUT FORMAT (follow exactly — parsers rely on these markers) ═══ +═══ OUTPUT FORMAT ═══ [TITLE] translated title here [BODY] -translated body here""" +translated body here + +CRITICAL OUTPUT RULES: +- Write [TITLE] on its own line +- Write the title on the next line +- Write [BODY] on its own line +- Write the body starting on the next line +- Do NOT replace these markers with "Title:" or "Body:" +- Do NOT include any extra text before or after the formatted result +- Do NOT add blank lines between [TITLE] and the title +- Do NOT add blank lines between [BODY] and the first body line""" # Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover) AI_EMOJI_INSTRUCTIONS = """ @@ -1485,135 +1613,10 @@ A blank line must be completely empty — no emoji, no spaces. 🟢 new version (pve_update) - BLANK LINES FOR READABILITY — insert ONE blank line between logical sections within the body. - Blank lines go BETWEEN groups, not before the first line or after the last line. - A blank line must be completely empty — no emoji, no spaces. 
- - When to add a blank line: - - Updates: after the last count line, before the packages block - - Backup multi-VM: one blank line between each VM entry; one blank line before the summary line - - Disk/SMART errors: after the device line, before the error description lines - - VM events with a reason: after the main status line, before Reason / Node / Target lines - - Health events: after the category/status line, before duration or detail lines - - EXAMPLE — CT shutdown: - [TITLE] - 🔽 amd: CT alpine (101) shut down - [BODY] - 🏷️ Container alpine (ID: 101) - ✔️ Cleanly shut down - - EXAMPLE — VM started: - [TITLE] - 🚀 pve01: VM arch-linux (100) started - [BODY] - 🏷️ Virtual machine arch-linux (ID: 100) - ✔️ Now running - - EXAMPLE — migration complete: - [TITLE] - 🚚 amd: Migration complete — web01 (100) - [BODY] - 🏷️ Virtual machine web01 (ID: 100) - ✔️ Successfully migrated - - 🎯 Target: node02 - - EXAMPLE — updates message (no important packages): - [TITLE] - 📦 amd: Updates available - [BODY] - 📦 Total updates: 24 - 🔒 Security updates: 6 - 🔄 Proxmox updates: 0 - ⚙️ Kernel updates: 0 - - 🗂️ Important packages: - • none - - EXAMPLE — updates message (with important packages): - [TITLE] - 📦 amd: Updates available - [BODY] - 📦 Total updates: 90 - 🔒 Security updates: 6 - 🔄 Proxmox updates: 14 - ⚙️ Kernel updates: 1 - - 🗂️ Important packages: - • pve-manager (9.1.4 -> 9.1.6) - • qemu-server (9.1.3 -> 9.1.4) - • pve-container (6.0.18 -> 6.1.2) - - EXAMPLE — pve_update (new Proxmox VE version): - [TITLE] - 🆕 pve01: Proxmox VE 9.1.6 available - [BODY] - 🚀 A new Proxmox VE release is available. - - 🔹 Current: 9.1.4 - 🟢 New: 9.1.6 - - 🗂️ Important packages: - 📌 pve-manager (v9.1.4 ➜ v9.1.6) - - EXAMPLE — backup complete with multiple VMs: - [TITLE] - 💾✅ pve01: Backup complete - [BODY] - Backup job finished on storage local-bak. 
- - 🏷️ VM web01 (ID: 100) - ✔️ Status: ok - 💽 Size: 12.3 GiB - ⏱️ Duration: 00:04:21 - 🗄️ Storage: vm/100/2026-03-17T22:00:08Z - - 🏷️ CT db (ID: 101) - ✔️ Status: ok - 💽 Size: 4.1 GiB - ⏱️ Duration: 00:01:10 - 🗄️ Storage: ct/101/2026-03-17T22:04:29Z - - 📊 Total: 2 backups | 💾 16.4 GiB | ⏱️ 00:05:31 - - EXAMPLE — backup partially failed (some ok, some failed): - [TITLE] - 💾❌ pve01: Backup partially failed - [BODY] - Backup job finished with errors on storage PBS2. - - 🏷️ VM web01 (ID: 100) - ✔️ Status: ok - 💽 Size: 12.3 GiB - ⏱️ Duration: 00:04:21 - 🗄️ Storage: vm/100/2026-03-17T22:00:08Z - - 🏷️ VM broken (ID: 102) - ❌ Status: error - 💽 Size: 0 B - ⏱️ Duration: 00:00:37 - - 📊 Total: 2 backups | ❌ 1 failed | 💾 12.3 GiB | ⏱️ 00:04:58 - - EXAMPLE — disk I/O health warning: - [TITLE] - 💥 amd: Health warning — Disk I/O errors - [BODY] - 💿 Device: /dev/sda - - ⚠️ 1 sector currently unreadable (pending) - 📝 Disk reports sectors in pending reallocation state - - EXAMPLE — health degraded (multiple issues): - [TITLE] - ⚠️ amd: 2 health checks degraded - [BODY] - 💥 Disk I/O error on /dev/sda: 1 sector currently unreadable (pending) - - 🏷️ Container CT 9005: ❌ failed to start - 🏷️ Container CT 9004: ❌ failed to start - 🏷️ Container CT 9002: ❌ failed to start""" + BLANK LINES: + Insert one blank line only between logical sections inside the body. + Do not add a blank line before the first body line or after the last one. 
+ """ # No emoji instructions for email/plain text channels diff --git a/AppImage/scripts/startup_grace.py b/AppImage/scripts/startup_grace.py index 2a6f395f..4c573754 100644 --- a/AppImage/scripts/startup_grace.py +++ b/AppImage/scripts/startup_grace.py @@ -120,7 +120,7 @@ class _StartupGraceState: with self._lock: return time.time() - self._startup_time - # ─── Shutdown Tracking ─────────────────────────────────────────────────── + # ─── Shutdown Tracking ─────────────────────────────────────────────────── def mark_shutdown(self): """ @@ -231,6 +231,219 @@ def was_startup_aggregated() -> bool: return _state.was_startup_aggregated() +# ─── Startup Report Collection ─────────────────────────────────────────────── + +def collect_startup_report() -> dict: + """ + Collect comprehensive startup report data. + + Called at the end of the grace period to generate a complete + startup report including: + - VMs/CTs that started successfully + - VMs/CTs that failed to start + - Service status + - Storage status + - Journal errors during boot (for AI enrichment) + + Returns: + Dictionary with startup report data + """ + import subprocess + + report = { + # VMs/CTs + 'vms_started': [], + 'cts_started': [], + 'vms_failed': [], + 'cts_failed': [], + + # System status + 'services_ok': True, + 'services_failed': [], + 'storage_ok': True, + 'storage_unavailable': [], + + # Health summary + 'health_status': 'OK', + 'health_issues': [], + + # For AI enrichment + '_journal_context': '', + '_startup_errors': [], + + # Metadata + 'startup_duration_seconds': get_startup_elapsed(), + 'timestamp': int(time.time()), + } + + # Get VMs/CTs that started during boot + startup_vms = get_and_clear_startup_vms() + for vmid, vmname, vm_type in startup_vms: + if vm_type == 'vm': + report['vms_started'].append({'vmid': vmid, 'name': vmname}) + else: + report['cts_started'].append({'vmid': vmid, 'name': vmname}) + + # Try to get health status from health_monitor + try: + import health_monitor + 
health_data = health_monitor.get_detailed_status() + + if health_data: + report['health_status'] = health_data.get('overall_status', 'UNKNOWN') + + # Check storage + storage_cat = health_data.get('categories', {}).get('storage', {}) + if storage_cat.get('status') in ['CRITICAL', 'WARNING']: + report['storage_ok'] = False + for check in storage_cat.get('checks', []): + if check.get('status') in ['CRITICAL', 'WARNING', 'error']: + report['storage_unavailable'].append({ + 'name': check.get('name', 'unknown'), + 'reason': check.get('reason', check.get('message', '')) + }) + + # Check services + services_cat = health_data.get('categories', {}).get('services', {}) + if services_cat.get('status') in ['CRITICAL', 'WARNING']: + report['services_ok'] = False + for check in services_cat.get('checks', []): + if check.get('status') in ['CRITICAL', 'WARNING', 'error']: + report['services_failed'].append({ + 'name': check.get('name', 'unknown'), + 'reason': check.get('reason', check.get('message', '')) + }) + + # Check VMs category for failed VMs + vms_cat = health_data.get('categories', {}).get('vms', {}) + for check in vms_cat.get('checks', []): + if check.get('status') in ['CRITICAL', 'WARNING', 'error']: + # Determine if VM or CT based on name/type + check_name = check.get('name', '') + check_reason = check.get('reason', check.get('message', '')) + if 'error al iniciar' in check_reason.lower() or 'failed to start' in check_reason.lower(): + if 'CT' in check_name or 'Container' in check_name: + report['cts_failed'].append({ + 'name': check_name, + 'reason': check_reason + }) + else: + report['vms_failed'].append({ + 'name': check_name, + 'reason': check_reason + }) + + # Collect all health issues for summary + for cat_name, cat_data in health_data.get('categories', {}).items(): + if cat_data.get('status') in ['CRITICAL', 'WARNING']: + report['health_issues'].append({ + 'category': cat_name, + 'status': cat_data.get('status'), + 'reason': cat_data.get('reason', '') + }) + 
except Exception as e: + report['_startup_errors'].append(f"Error getting health data: {e}") + + # Get journal errors during startup (for AI enrichment) + try: + boot_time = int(_state._startup_time) + result = subprocess.run( + ['journalctl', '-p', 'err', '--since', f'@{boot_time}', '--no-pager', '-n', '50'], + capture_output=True, + text=True, + timeout=10 + ) + if result.returncode == 0 and result.stdout.strip(): + report['_journal_context'] = result.stdout.strip() + except Exception as e: + report['_startup_errors'].append(f"Error getting journal: {e}") + + return report + + +def format_startup_summary(report: dict) -> str: + """ + Format a human-readable startup summary from report data. + + Args: + report: Dictionary from collect_startup_report() + + Returns: + Formatted summary string + """ + lines = [] + + # Count totals + vms_ok = len(report.get('vms_started', [])) + cts_ok = len(report.get('cts_started', [])) + vms_fail = len(report.get('vms_failed', [])) + cts_fail = len(report.get('cts_failed', [])) + + total_ok = vms_ok + cts_ok + total_fail = vms_fail + cts_fail + + # Determine overall status + has_issues = ( + total_fail > 0 or + not report.get('services_ok', True) or + not report.get('storage_ok', True) or + report.get('health_status') in ['CRITICAL', 'WARNING'] + ) + + # Header + if has_issues: + issue_count = total_fail + len(report.get('services_failed', [])) + len(report.get('storage_unavailable', [])) + lines.append(f"System startup - {issue_count} issue(s) detected") + else: + lines.append("System startup completed") + lines.append("All systems operational.") + + # VMs/CTs started + if total_ok > 0: + parts = [] + if vms_ok > 0: + parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}") + if cts_ok > 0: + parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}") + + # List names + names = [] + for vm in report.get('vms_started', []): + names.append(f"{vm['name']} ({vm['vmid']})") + for ct in report.get('cts_started', []): + 
names.append(f"{ct['name']} ({ct['vmid']})") + + line = f"{' and '.join(parts)} started" + if names and len(names) <= 5: + line += f": {', '.join(names)}" + elif names: + line += f": {', '.join(names[:3])}... (+{len(names)-3} more)" + lines.append(line) + + # Failed VMs/CTs + if total_fail > 0: + for vm in report.get('vms_failed', []): + lines.append(f"VM failed: {vm['name']} - {vm.get('reason', 'unknown error')}") + for ct in report.get('cts_failed', []): + lines.append(f"CT failed: {ct['name']} - {ct.get('reason', 'unknown error')}") + + # Storage issues + if not report.get('storage_ok', True): + unavailable = report.get('storage_unavailable', []) + if unavailable: + names = [s['name'] for s in unavailable] + lines.append(f"Storage: {len(unavailable)} unavailable ({', '.join(names[:3])})") + + # Service issues + if not report.get('services_ok', True): + failed = report.get('services_failed', []) + if failed: + names = [s['name'] for s in failed] + lines.append(f"Services: {len(failed)} failed ({', '.join(names[:3])})") + + return '\n'.join(lines) + + # ─── For backwards compatibility ───────────────────────────────────────────── # Expose constants for external use