Update notification service

2026-06-02 13:34:41 +00:00 · 2026-03-26 20:04:53 +01:00
parent 839a20df97
commit 7c5e7208b9
4 changed files with 530 additions and 276 deletions
@@ -1093,7 +1093,7 @@ class HealthPersistence:
        conn.commit()
        conn.close()
    
-    # ─── System Capabilities Cache ───────────────────────────────
+    # ─── System Capabilities Cache ───────────────────────────────
    
    def get_capability(self, cap_key: str) -> Optional[str]:
        """
@@ -2144,60 +2144,98 @@ class PollingCollector:
        self._first_poll_done = True
    
    def _check_startup_aggregation(self):
-        """Check if startup period ended and emit aggregated VM/CT start message.
+        """Check if startup period ended and emit comprehensive startup report.
        
-        During the startup grace period, TaskWatcher collects VM/CT starts instead
-        of emitting individual notifications. Once the period ends, this method
-        emits a single aggregated "System startup" notification.
+        At the end of the health grace period, collects:
+        - VMs/CTs that started successfully
+        - VMs/CTs that failed to start
+        - Service status
+        - Storage status
+        - Journal errors (for AI enrichment)
+        
+        Emits a single "system_startup" notification with full report data.
        """
-        # Only check once startup period is over
-        if _shared_state.is_startup_period():
+        # Wait until health grace period is over (5 min) for complete picture
+        if startup_grace.is_startup_health_grace():
            return
        
        # Only emit once
-        if _shared_state.was_startup_aggregated():
+        if startup_grace.was_startup_aggregated():
            return
        
-        # Get all collected startup VMs/CTs
-        startup_items = _shared_state.get_and_clear_startup_vms()
-        if not startup_items:
-            return
+        # Collect comprehensive startup report
+        report = startup_grace.collect_startup_report()
        
-        # Count VMs and CTs
-        vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm']
-        cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct']
+        # Generate human-readable summary
+        summary = startup_grace.format_startup_summary(report)
        
-        vm_count = len(vms)
-        ct_count = len(cts)
-        total = vm_count + ct_count
+        # Count totals
+        vms_ok = len(report.get('vms_started', []))
+        cts_ok = len(report.get('cts_started', []))
+        vms_fail = len(report.get('vms_failed', []))
+        cts_fail = len(report.get('cts_failed', []))
+        total_ok = vms_ok + cts_ok
+        total_fail = vms_fail + cts_fail
        
-        # Build entity list (max 10 items for readability)
+        # Build entity list for backwards compatibility
        entity_names = []
-        for vmid, name in (vms + cts)[:10]:
-            entity_names.append(f'{name} ({vmid})')
-        if total > 10:
-            entity_names.append(f'...and {total - 10} more')
+        for vm in report.get('vms_started', [])[:5]:
+            entity_names.append(f"{vm['name']} ({vm['vmid']})")
+        for ct in report.get('cts_started', [])[:5]:
+            entity_names.append(f"{ct['name']} ({ct['vmid']})")
+        if total_ok > 10:
+            entity_names.append(f"...and {total_ok - 10} more")
        
-        # Build summary text
-        parts = []
-        if vm_count:
-            parts.append(f'{vm_count} VM{"s" if vm_count != 1 else ""}')
-        if ct_count:
-            parts.append(f'{ct_count} CT{"s" if ct_count != 1 else ""}')
-        summary = ' and '.join(parts) + ' started'
+        # Determine severity based on issues
+        has_issues = (
+            total_fail > 0 or
+            not report.get('services_ok', True) or
+            not report.get('storage_ok', True) or
+            report.get('health_status') in ['CRITICAL', 'WARNING']
+        )
+        severity = 'WARNING' if has_issues else 'INFO'
        
+        # Build notification data
        data = {
            'hostname': self._hostname,
            'summary': summary,
-            'vm_count': vm_count,
-            'ct_count': ct_count,
-            'total_count': total,
+            
+            # VM/CT counts (backwards compatible)
+            'vm_count': vms_ok,
+            'ct_count': cts_ok,
+            'total_count': total_ok,
            'entity_list': ', '.join(entity_names),
-            'reason': f'System startup completed: {summary}',
+            
+            # New: failure counts
+            'vms_failed_count': vms_fail,
+            'cts_failed_count': cts_fail,
+            'total_failed': total_fail,
+            
+            # New: detailed lists
+            'vms_started': report.get('vms_started', []),
+            'cts_started': report.get('cts_started', []),
+            'vms_failed': report.get('vms_failed', []),
+            'cts_failed': report.get('cts_failed', []),
+            
+            # New: system status
+            'services_ok': report.get('services_ok', True),
+            'services_failed': report.get('services_failed', []),
+            'storage_ok': report.get('storage_ok', True),
+            'storage_unavailable': report.get('storage_unavailable', []),
+            'health_status': report.get('health_status', 'UNKNOWN'),
+            'health_issues': report.get('health_issues', []),
+            
+            # For AI enrichment
+            '_journal_context': report.get('_journal_context', ''),
+            
+            # Metadata
+            'startup_duration_seconds': report.get('startup_duration_seconds', 0),
+            'has_issues': has_issues,
+            'reason': summary.split('\n')[0],  # First line as reason
        }
        
        self._queue.put(NotificationEvent(
-            'system_startup', 'INFO', data, source='polling',
+            'system_startup', severity, data, source='polling',
            entity='node', entity_id='',
        ))
    
@@ -2500,7 +2538,7 @@ class PollingCollector:
        except Exception as e:
            print(f"[PollingCollector] AI model check failed: {e}")
    
-    # ── Persistence helpers ────────────────────────────────────
+    # ── Persistence helpers ────────────────────────────────────
    
    def _load_last_notified(self):
        """Load per-error notification timestamps from DB on startup."""
@@ -17,7 +17,7 @@ import socket
 import time
 import urllib.request
 import urllib.error
-from typing import Dict, Any, Optional, List
+from typing import Dict, Any, Optional, List, Tuple


 # ─── vzdump message parser ───────────────────────────────────────
@@ -314,6 +314,90 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
    return '\n'.join(parts)


+def _format_system_startup(data: Dict[str, Any]) -> Tuple[str, str]:
+    """
+    Format comprehensive system startup report.
+    
+    Returns (title, body) tuple for the notification.
+    Handles both simple startups (all OK) and those with issues.
+    """
+    hostname = data.get('hostname', 'unknown')
+    has_issues = data.get('has_issues', False)
+    
+    # Build title
+    if has_issues:
+        total_issues = (
+            data.get('total_failed', 0) +
+            len(data.get('services_failed', [])) +
+            len(data.get('storage_unavailable', []))
+        )
+        title = f"{hostname}: System startup - {total_issues} issue(s) detected"
+    else:
+        title = f"{hostname}: System startup completed"
+    
+    # Build body
+    parts = []
+    
+    # Overall status
+    if not has_issues:
+        parts.append("All systems operational.")
+    
+    # VMs/CTs started
+    vms_ok = len(data.get('vms_started', []))
+    cts_ok = len(data.get('cts_started', []))
+    if vms_ok or cts_ok:
+        count_parts = []
+        if vms_ok:
+            count_parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
+        if cts_ok:
+            count_parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")
+        
+        # List names (up to 3 VMs and 3 CTs; at most 5 are displayed below)
+        names = []
+        for vm in data.get('vms_started', [])[:3]:
+            names.append(f"{vm['name']} ({vm['vmid']})")
+        for ct in data.get('cts_started', [])[:3]:
+            names.append(f"{ct['name']} ({ct['vmid']})")
+        
+        line = f"\u2705 {' and '.join(count_parts)} started"
+        if names:
+            if len(names) <= 5:
+                line += f": {', '.join(names)}"
+            else:
+                line += f": {', '.join(names[:5])}..."
+        parts.append(line)
+    
+    # Failed VMs/CTs
+    for vm in data.get('vms_failed', []):
+        reason = vm.get('reason', 'unknown error')
+        parts.append(f"\u274C VM failed: {vm['name']} - {reason}")
+    
+    for ct in data.get('cts_failed', []):
+        reason = ct.get('reason', 'unknown error')
+        parts.append(f"\u274C CT failed: {ct['name']} - {reason}")
+    
+    # Storage issues
+    storage_unavailable = data.get('storage_unavailable', [])
+    if storage_unavailable:
+        names = [s['name'] for s in storage_unavailable[:3]]
+        parts.append(f"\u26A0\uFE0F Storage: {len(storage_unavailable)} unavailable ({', '.join(names)})")
+    
+    # Service issues  
+    services_failed = data.get('services_failed', [])
+    if services_failed:
+        names = [s['name'] for s in services_failed[:3]]
+        parts.append(f"\u26A0\uFE0F Services: {len(services_failed)} failed ({', '.join(names)})")
+    
+    # Startup duration
+    duration = data.get('startup_duration_seconds', 0)
+    if duration:
+        minutes = int(duration // 60)
+        parts.append(f"\u23F1\uFE0F Startup completed in {minutes} min")
+    
+    body = '\n'.join(parts)
+    return title, body
+
+
 # ─── Severity Icons ──────────────────────────────────────────────

 SEVERITY_ICONS = {
@@ -645,11 +729,12 @@ TEMPLATES = {
    
    # ── Services events ──
    'system_startup': {
-        'title': '{hostname}: System startup — {summary}',
-        'body': 'System startup completed.\n{summary}\n\nGuests: {entity_list}',
-        'label': 'System startup',
+        'title': '{hostname}: {reason}',
+        'body': '{summary}',
+        'label': 'System startup report',
        'group': 'services',
        'default_enabled': True,
+        'formatter': '_format_system_startup',
    },
    'system_shutdown': {
        'title': '{hostname}: System shutting down',
@@ -959,7 +1044,19 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
    pve_message = data.get('pve_message', '')
    pve_title = data.get('pve_title', '')
    
-    if event_type in ('backup_complete', 'backup_fail') and pve_message:
+    # Check for custom formatter function
+    formatter_name = template.get('formatter')
+    if formatter_name and formatter_name in globals():
+        formatter_func = globals()[formatter_name]
+        try:
+            title, body_text = formatter_func(data)
+        except Exception:
+            # Fallback to standard formatting if formatter fails
+            try:
+                body_text = template['body'].format(**variables)
+            except (KeyError, ValueError):
+                body_text = template['body']
+    elif event_type in ('backup_complete', 'backup_fail') and pve_message:
        parsed = _parse_vzdump_message(pve_message)
        if parsed:
            is_success = (event_type == 'backup_complete')
@@ -1288,134 +1385,165 @@ AI_DETAIL_TOKENS = {
 # System prompt template - informative, no recommendations
 AI_SYSTEM_PROMPT = """You are a system notification formatter for ProxMenux Monitor, a Proxmox VE monitoring tool.

-Your task is to translate and reformat incoming server alert messages into {language}.
+Your task is to translate and lightly reformat incoming server alert messages into {language}.
+
+═══ CORE ROLE ═══
+You are a formatter, not an analyst.
+Translate, clean, and present the message clearly.
+Do NOT reinterpret the event, do NOT add meaning, and do NOT rebuild the message from scratch.

 ═══ ABSOLUTE RULES ═══
-  1. Translate BOTH title and body to {language}. Every word, label, and unit must be in {language}.
-  2. NO markdown: no **bold**, no *italic*, no `code`, no headers (#), no bullet lists (- or *)
-  3. Plain text only — the output is sent to chat apps and email which handle their own formatting
-  4. Tone: factual, concise, technical. No greetings, no closings, no apologies
-  5. DO NOT add recommendations, action items, or suggestions ("you should…", "consider…")
-  6. Present ONLY the facts already in the input — do not invent or assume information
-  7. OUTPUT ONLY THE FINAL RESULT — never include both original and processed versions.
-     Do NOT append "Original message:", "Original:", "Source:", or any before/after comparison.
-     Return ONLY the single, final formatted message in {language}.
-  8. PLAIN NARRATIVE LINES — if a line in the input is a complete sentence (not a "Label: value"
-     pair), translate it as-is. Never prepend "Message:", "Note:", or any other label to a sentence.
-  9. Detail level to apply: {detail_level}
-     - brief    → 2-3 lines, essential data only (status + key metric)
-     - standard → short paragraph covering who/what/where and the key value
-     - detailed → full technical breakdown of all available fields
-  10. Keep the "hostname: " prefix in the title. Translate only the descriptive part.
-      Example: "pve01: Updates available" → "pve01: Actualizaciones disponibles"
-  11. EMPTY LIST VALUES — if a list field is empty, "none", or "0":
-     Always write the translated word for "none" on the line after the label, never leave it blank.
-     Example: 🗂️ Important packages:\\n• none
-     Example (Spanish): 🗂️ Paquetes importantes:\\n• ninguno
-     Example (Français): 🗂️ Paquets importants:\\n• aucun
-  12. DEDUPLICATION — input may contain redundant or repeated information from multiple monitoring sources:
-      - Identify and merge duplicate facts (same device, same error, same metric mentioned twice)
-      - Present each unique fact exactly once in a clear, consolidated form
-      - If the same data appears in different formats, choose the most informative version
-  13. PROXMOX CONTEXT — silently translate Proxmox technical references into plain language.
-    Never explain what the term means — just use the human-readable equivalent directly.
+1. Translate BOTH title and body into {language}.

-    Service / process name mapping (replace the raw name with the friendly form):
-    - "pve-container@XXXX.service"  → "Container CT XXXX"
-    - "qemu-server@XXXX.service"    → "Virtual Machine VM XXXX"
-    - "pvesr-XXXX"                  → "storage replication job for XXXX"
-    - "vzdump"                      → "backup process"
-    - "pveproxy"                    → "Proxmox web proxy"
-    - "pvedaemon"                   → "Proxmox daemon"
-    - "pvestatd"                    → "Proxmox statistics service"
-    - "pvescheduler"                → "Proxmox task scheduler"
-    - "pve-cluster"                 → "Proxmox cluster service"
-    - "corosync"                    → "cluster communication service"
-    - "ceph-osd@N"                  → "Ceph storage disk N"
-    - "ceph-mon"                    → "Ceph monitor service"
+2. Translate human-readable text only.
+   Do NOT translate:
+   - hostnames
+   - device paths (/dev/sdX, /dev/nvmeXnX)
+   - filesystem paths
+   - IDs, VMIDs, CTIDs, UUIDs
+   - timestamps, dates, archive names, PBS paths
+   - version numbers
+   - technical units (B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, %, ms, s)

-    systemd message patterns (rewrite the whole phrase, not just the service name):
-    - "systemd[1]: pve-container@9000.service: Failed"
-      → "Container CT 9000 service failed"
-    - "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'"
-      → "Virtual Machine VM 100 failed to start"
-    - "systemd[1]: Started pve-container@9000.service"
-      → "Container CT 9000 started"
+3. Plain text only.
+   No markdown: no **bold**, no *italic*, no `code`, no headers (#), no markdown lists (- or *).
+   The bullet character "•" is allowed only where explicitly required.

-    ATA / SMART / kernel error patterns (replace raw kernel log with plain description):
-    - "ata8.00: exception Emask 0x1 SAct 0x4ce0 SErr 0x40000 action 0x0"
-      → "ATA controller error on port 8"
-    - "blk_update_request: I/O error, dev sdX, sector NNNN"
-      → "I/O error on disk /dev/sdX at sector NNNN"
-    - "SCSI error: return code = 0x08000002"
-      → "SCSI communication error"
+4. Tone: factual, concise, technical.
+   No greetings, no closings, no apologies, no conversational filler.
+
+5. Do NOT add recommendations, action items, remediation, or suggestions.
+
+6. Present ONLY the facts already present in the input.
+   Do NOT invent, assume, explain, soften, or escalate anything.
+
+7. Do NOT change severity or status meaning.
+   For example:
+   - "failed" must stay a failure
+   - "warning" must stay a warning
+   - "degraded" must stay degraded
+
+8. Preserve structure whenever possible.
+   Keep the same fields, lines, and data already present in the input.
+   Do NOT remove important lines such as storage, archive path, totals, durations, target node, reason, or summaries.
+
+9. Reordering must be minimal.
+   Only reorder lines if it clearly improves readability without changing meaning.
+
+10. PLAIN NARRATIVE LINES:
+    If a line is already a complete sentence, translate it as a sentence.
+    Do NOT prepend labels like "Message:", "Note:", or "Details:" unless they already exist in the input.
+
+11. Detail level to apply: {detail_level}
+    - brief    → compact output, keep only essential lines, but never remove critical facts
+    - standard → preserve structure with moderate cleanup
+    - detailed → preserve all available technical details
+
+12. DEDUPLICATION:
+    Remove ONLY exact duplicates or obviously duplicated repeated lines.
+    Do NOT merge distinct facts just because they look similar.
+    Do NOT summarize multiple separate events into one.
+
+13. Keep the "hostname: " prefix in the title.
+    Translate only the descriptive part.
+    Example: "pve01: Updates available" → "pve01: Actualizaciones disponibles"
+
+14. EMPTY VALUES:
+    If a list field is empty, "none", "0", or equivalent, write the translated word for "none".
+    Never leave a declared field blank.
+
+15. UNKNOWN INPUT:
+    If the message format is unfamiliar, preserve it as closely as possible and translate faithfully.
+    Do NOT force it into another template.
+
+═══ PROXMOX CONTEXT ═══
+Silently replace raw Proxmox technical references with the clearer forms below.
+Do NOT explain them. Just use the friendly equivalent directly.
+
+Service / process mappings:
+- "pve-container@XXXX.service"  → "Container CT XXXX"
+- "qemu-server@XXXX.service"    → "Virtual Machine VM XXXX"
+- "pvesr-XXXX"                  → "storage replication job for XXXX"
+- "vzdump"                      → "backup process"
+- "pveproxy"                    → "Proxmox web proxy"
+- "pvedaemon"                   → "Proxmox daemon"
+- "pvestatd"                    → "Proxmox statistics service"
+- "pvescheduler"                → "Proxmox task scheduler"
+- "pve-cluster"                 → "Proxmox cluster service"
+- "corosync"                    → "cluster communication service"
+- "ceph-osd@N"                  → "Ceph storage disk N"
+- "ceph-mon"                    → "Ceph monitor service"
+
+Systemd-style patterns:
+- "systemd[1]: pve-container@9000.service: Failed"
+  → "Container CT 9000 service failed"
+- "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'"
+  → "Virtual Machine VM 100 failed to start"
+- "systemd[1]: Started pve-container@9000.service"
+  → "Container CT 9000 started"
+
+Kernel / storage patterns:
+- "ata8.00: exception Emask ..."
+  → "ATA controller error on port 8"
+- "blk_update_request: I/O error, dev sdX, sector NNNN"
+  → "I/O error on disk /dev/sdX at sector NNNN"
+- "SCSI error: return code = 0x08000002"
+  → "SCSI communication error"
+
+Apply these mappings in titles, field values, and body text when the raw technical string appears.

-    Apply these mappings everywhere: in the body narrative, in field values, and when
-    the raw technical string appears inside a longer sentence.
 {emoji_instructions}

-═══ MESSAGE TYPES — FORMAT RULES ═══
+═══ MESSAGE-TYPE GUIDANCE ═══

 BACKUP (backup_complete / backup_fail / backup_start):
-  Input contains: VM/CT names, IDs, size, duration, storage location, status per VM
-  Output body: first line is plain text (no emoji) describing the event briefly.
-  Then list each VM/CT with its fields. End with a summary line.
-  PARTIAL FAILURE RULE: if some VMs succeeded and at least one failed, use a combined title
-  like "Backup partially failed" / "Copia de seguridad parcialmente fallida" — never say
-  "backup failed" when there are also successful VMs in the same job.
-  NEVER omit the storage/archive line or the summary line — always include them even for long jobs.
+- Preserve per-VM / per-CT detail if present.
+- Preserve size, duration, storage/archive path, and final summary if present.
+- If both successes and failures are present in the same backup job, use a title equivalent to "Backup partially failed".
+- Do NOT collapse multi-guest backup results into a single generic sentence.

 UPDATES (update_summary):
-  - Each count on its own line with its label.
-  - Package list uses "• " (bullet + space) per package, NOT the 🗂️ emoji on each line.
-  - The 🗂️ emoji goes only on the "Important packages:" header line.
-  - NEVER add a redundant summary line repeating the total count.
- 
-PVE UPDATE (pve_update):
-  - First line: plain sentence announcing the new version (no emoji on this line).
-  - Blank line after intro.
-  - Current version: 🔹 prefix  |  New version: 🟢 prefix
-  - Blank line before packages block.
-  - Packages header: 🗂️  |  Package lines: 📌 prefix with version arrow v{{old}} ➜ v{{new}}
+- Keep each count on its own line.
+- Keep the important packages block if present.
+- Use "• " for package items.
+- Do NOT add a redundant summary line repeating totals already shown.

-DISK / SMART ERRORS (disk_io_error / storage_unavailable):
-  Input contains: device name, error type, SMART values or I/O error codes
-  Output body: device, then the specific error or failing attribute
-  DEDUPLICATION: Input may contain repeated or similar information from multiple sources.
-  If you see the same device, error count, or technical details mentioned multiple times,
-  consolidate them into a single, clear statement. Never repeat the same information twice.
+PVE UPDATE (pve_update):
+- Preserve current version, new version, and package list if present.
+- Keep the announcement concise.
+
+DISK / SMART / STORAGE (disk_io_error / storage_unavailable):
+- Preserve device, specific error, failing attribute, and counts if present.
+- Do NOT repeat the same disk fact twice.

 RESOURCES (cpu_high / ram_high / temp_high / load_high):
-  Input contains: current value, threshold, core count
-  Output: current value vs threshold, context if available
+- Preserve current value, threshold, and context if present.

 SECURITY (auth_fail / ip_block):
-  Input contains: source IP, user, service, jail, failure count
-  Output: list each field on its own line
+- Keep source IP, user, service, jail, and failure count on separate clear lines if present.

-VM/CT LIFECYCLE (vm_start, vm_stop, vm_fail, ct_*, migration_*, replication_*):
-  Input contains: VM name, ID, target node (migrations), reason (failures)
-  Output: one or two lines confirming the event with key facts
+VM / CT LIFECYCLE (vm_*, ct_*, migration_*, replication_*):
+- Keep name, ID, state, reason, and target node if present.
+- Keep lifecycle messages compact unless detail_level is detailed.

-CLUSTER (split_brain / node_disconnect / node_reconnect):
-  Input: node name, quorum status
-  Output: state change + quorum value
+CLUSTER / HEALTH:
+- Preserve node name, quorum, category, severity, duration, and reason if present.

-HEALTH (new_error / error_resolved / health_persistent / health_degraded):
-  Input: category, severity, duration, reason
-  Output: what changed, in which category, for how long (if resolved)
-
-CRITICAL:
- [TITLE] on its own line, title text on the very next line — no blank line between them
- [BODY] on its own line, body text starting on the very next line — no blank line between them
- Do NOT write "Title:", "Body:", or any label substituting the markers
- Do NOT include the literal words TITLE or BODY anywhere in the translated content
-
-═══ OUTPUT FORMAT (follow exactly — parsers rely on these markers) ═══
+═══ OUTPUT FORMAT ═══
 [TITLE]
 translated title here
 [BODY]
-translated body here"""
+translated body here
+
+CRITICAL OUTPUT RULES:
+- Write [TITLE] on its own line
+- Write the title on the next line
+- Write [BODY] on its own line
+- Write the body starting on the next line
+- Do NOT replace these markers with "Title:" or "Body:"
+- Do NOT include any extra text before or after the formatted result
+- Do NOT add blank lines between [TITLE] and the title
+- Do NOT add blank lines between [BODY] and the first body line"""

 # Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover)
 AI_EMOJI_INSTRUCTIONS = """
@@ -1485,135 +1613,10 @@ A blank line must be completely empty — no emoji, no spaces.
    🟢  new version (pve_update)


-   BLANK LINES FOR READABILITY — insert ONE blank line between logical sections within the body.
-   Blank lines go BETWEEN groups, not before the first line or after the last line.
-   A blank line must be completely empty — no emoji, no spaces.
-
-   When to add a blank line:
-   - Updates: after the last count line, before the packages block
-   - Backup multi-VM: one blank line between each VM entry; one blank line before the summary line
-   - Disk/SMART errors: after the device line, before the error description lines
-   - VM events with a reason: after the main status line, before Reason / Node / Target lines
-   - Health events: after the category/status line, before duration or detail lines
-
-    EXAMPLE — CT shutdown:
-    [TITLE]
-    🔽 amd: CT alpine (101) shut down
-    [BODY]
-    🏷️ Container alpine (ID: 101)
-    ✔️ Cleanly shut down
-
-    EXAMPLE — VM started:
-    [TITLE]
-    🚀 pve01: VM arch-linux (100) started
-    [BODY]
-    🏷️ Virtual machine arch-linux (ID: 100)
-    ✔️ Now running
-
-    EXAMPLE — migration complete:
-    [TITLE]
-    🚚 amd: Migration complete — web01 (100)
-    [BODY]
-    🏷️ Virtual machine web01 (ID: 100)
-    ✔️ Successfully migrated
-    
-    🎯 Target: node02
-
-    EXAMPLE — updates message (no important packages):
-    [TITLE]
-    📦 amd: Updates available
-    [BODY]
-    📦 Total updates: 24
-    🔒 Security updates: 6
-    🔄 Proxmox updates: 0
-    ⚙️ Kernel updates: 0
-
-    🗂️ Important packages:
-    • none
-
-    EXAMPLE — updates message (with important packages):
-    [TITLE]
-    📦 amd: Updates available
-    [BODY]
-    📦 Total updates: 90
-    🔒 Security updates: 6
-    🔄 Proxmox updates: 14
-    ⚙️ Kernel updates: 1
-
-    🗂️ Important packages:
-    • pve-manager (9.1.4 -> 9.1.6)
-    • qemu-server (9.1.3 -> 9.1.4)
-    • pve-container (6.0.18 -> 6.1.2)
-    
-    EXAMPLE — pve_update (new Proxmox VE version):
-    [TITLE]
-    🆕 pve01: Proxmox VE 9.1.6 available
-    [BODY]
-    🚀 A new Proxmox VE release is available.
-
-    🔹 Current: 9.1.4
-    🟢 New: 9.1.6
-
-    🗂️ Important packages:
-    📌 pve-manager (v9.1.4 ➜ v9.1.6)
-
-    EXAMPLE — backup complete with multiple VMs:
-    [TITLE]
-    💾✅ pve01: Backup complete
-    [BODY]
-    Backup job finished on storage local-bak.
-
-    🏷️ VM web01 (ID: 100)
-    ✔️ Status: ok
-    💽 Size: 12.3 GiB
-    ⏱️ Duration: 00:04:21
-    🗄️ Storage: vm/100/2026-03-17T22:00:08Z
-
-    🏷️ CT db (ID: 101)
-    ✔️ Status: ok
-    💽 Size: 4.1 GiB
-    ⏱️ Duration: 00:01:10
-    🗄️ Storage: ct/101/2026-03-17T22:04:29Z
-
-    📊 Total: 2 backups | 💾 16.4 GiB | ⏱️ 00:05:31
-
-    EXAMPLE — backup partially failed (some ok, some failed):
-    [TITLE]
-    💾❌ pve01: Backup partially failed
-    [BODY]
-    Backup job finished with errors on storage PBS2.
-
-    🏷️ VM web01 (ID: 100)
-    ✔️ Status: ok
-    💽 Size: 12.3 GiB
-    ⏱️ Duration: 00:04:21
-    🗄️ Storage: vm/100/2026-03-17T22:00:08Z
-
-    🏷️ VM broken (ID: 102)
-    ❌ Status: error
-    💽 Size: 0 B
-    ⏱️ Duration: 00:00:37
-
-    📊 Total: 2 backups | ❌ 1 failed | 💾 12.3 GiB | ⏱️ 00:04:58
-
-    EXAMPLE — disk I/O health warning:
-    [TITLE]
-    💥 amd: Health warning — Disk I/O errors
-    [BODY]
-    💿 Device: /dev/sda
-
-    ⚠️ 1 sector currently unreadable (pending)
-    📝 Disk reports sectors in pending reallocation state
-
-    EXAMPLE — health degraded (multiple issues):
-    [TITLE]
-    ⚠️ amd: 2 health checks degraded
-    [BODY]
-    💥 Disk I/O error on /dev/sda: 1 sector currently unreadable (pending)
-
-    🏷️ Container CT 9005: ❌ failed to start
-    🏷️ Container CT 9004: ❌ failed to start
-    🏷️ Container CT 9002: ❌ failed to start"""
+    BLANK LINES:
+    Insert one blank line only between logical sections inside the body.
+    Do not add a blank line before the first body line or after the last one.
+    """


 # No emoji instructions for email/plain text channels
@@ -120,7 +120,7 @@ class _StartupGraceState:
        with self._lock:
            return time.time() - self._startup_time
    
-    # ─── Shutdown Tracking ───────────────────────────────────────────────────
+    # ─── Shutdown Tracking ───────────────────────────────────────────────────
    
    def mark_shutdown(self):
        """
@@ -231,6 +231,219 @@ def was_startup_aggregated() -> bool:
    return _state.was_startup_aggregated()


+# ─── Startup Report Collection ───────────────────────────────────────────────
+
+def collect_startup_report() -> dict:
+    """
+    Collect comprehensive startup report data.
+    
+    Called at the end of the grace period to generate a complete
+    startup report including:
+    - VMs/CTs that started successfully
+    - VMs/CTs that failed to start
+    - Service status
+    - Storage status
+    - Journal errors during boot (for AI enrichment)
+    
+    Failures while querying health_monitor or journalctl are not raised:
+    each one is appended as a message to the '_startup_errors' list and
+    the affected fields keep their default values.
+    
+    Side effects: calls get_and_clear_startup_vms() and runs a
+    'journalctl' subprocess with a 10 second timeout.
+    
+    Returns:
+        Dictionary with startup report data. Keys:
+        - 'vms_started' / 'cts_started': lists of {'vmid', 'name'}
+        - 'vms_failed' / 'cts_failed': lists of {'name', 'reason'}
+          (no 'vmid' is recorded for failed guests)
+        - 'services_ok' / 'storage_ok': booleans, True by default
+        - 'services_failed' / 'storage_unavailable': lists of
+          {'name', 'reason'}
+        - 'health_status': overall status string, 'OK' by default
+        - 'health_issues': lists of {'category', 'status', 'reason'}
+        - '_journal_context': journalctl output, '' when unavailable
+        - '_startup_errors': messages for collection steps that failed
+        - 'startup_duration_seconds': value of get_startup_elapsed()
+        - 'timestamp': integer Unix time at collection
+    """
+    import subprocess
+    
+    # Every key is pre-populated so consumers can rely on it being present
+    # even when a collection step below fails.
+    report = {
+        # VMs/CTs
+        'vms_started': [],
+        'cts_started': [],
+        'vms_failed': [],
+        'cts_failed': [],
+        
+        # System status
+        'services_ok': True,
+        'services_failed': [],
+        'storage_ok': True,
+        'storage_unavailable': [],
+        
+        # Health summary
+        'health_status': 'OK',
+        'health_issues': [],
+        
+        # For AI enrichment
+        '_journal_context': '',
+        '_startup_errors': [],
+        
+        # Metadata
+        'startup_duration_seconds': get_startup_elapsed(),
+        'timestamp': int(time.time()),
+    }
+    
+    # Get VMs/CTs that started during boot
+    # Any vm_type other than 'vm' is counted as a container.
+    # NOTE(review): presumably get_and_clear_startup_vms() drains the
+    # collected list, so this report can only be built once - confirm.
+    startup_vms = get_and_clear_startup_vms()
+    for vmid, vmname, vm_type in startup_vms:
+        if vm_type == 'vm':
+            report['vms_started'].append({'vmid': vmid, 'name': vmname})
+        else:
+            report['cts_started'].append({'vmid': vmid, 'name': vmname})
+    
+    # Try to get health status from health_monitor
+    # The import is local and inside the try block, so a missing or broken
+    # health_monitor module degrades to an entry in '_startup_errors'.
+    try:
+        import health_monitor
+        health_data = health_monitor.get_detailed_status()
+        
+        if health_data:
+            report['health_status'] = health_data.get('overall_status', 'UNKNOWN')
+            
+            # Check storage
+            # Individual checks are only inspected when the category as a
+            # whole is CRITICAL or WARNING.
+            # NOTE(review): assumes 'checks' is a list of dicts; if
+            # health_monitor returns a mapping keyed by check name, the
+            # loop would yield strings and check.get() would raise (caught
+            # by the except below) - confirm against health_monitor.
+            storage_cat = health_data.get('categories', {}).get('storage', {})
+            if storage_cat.get('status') in ['CRITICAL', 'WARNING']:
+                report['storage_ok'] = False
+                for check in storage_cat.get('checks', []):
+                    if check.get('status') in ['CRITICAL', 'WARNING', 'error']:
+                        report['storage_unavailable'].append({
+                            'name': check.get('name', 'unknown'),
+                            # Prefer 'reason'; fall back to 'message', then ''
+                            'reason': check.get('reason', check.get('message', ''))
+                        })
+            
+            # Check services
+            # Same gating and extraction as the storage category above.
+            services_cat = health_data.get('categories', {}).get('services', {})
+            if services_cat.get('status') in ['CRITICAL', 'WARNING']:
+                report['services_ok'] = False
+                for check in services_cat.get('checks', []):
+                    if check.get('status') in ['CRITICAL', 'WARNING', 'error']:
+                        report['services_failed'].append({
+                            'name': check.get('name', 'unknown'),
+                            'reason': check.get('reason', check.get('message', ''))
+                        })
+            
+            # Check VMs category for failed VMs
+            # Unlike storage/services, the checks are scanned regardless of
+            # the category-level status.
+            vms_cat = health_data.get('categories', {}).get('vms', {})
+            for check in vms_cat.get('checks', []):
+                if check.get('status') in ['CRITICAL', 'WARNING', 'error']:
+                    # Only start failures are recorded: the reason must
+                    # contain 'error al iniciar' (Spanish wording) or
+                    # 'failed to start', compared case-insensitively.
+                    check_name = check.get('name', '')
+                    check_reason = check.get('reason', check.get('message', ''))
+                    if 'error al iniciar' in check_reason.lower() or 'failed to start' in check_reason.lower():
+                        # Classified as a container when the check name
+                        # contains 'CT' or 'Container' (case-sensitive
+                        # substring match); anything else counts as a VM.
+                        if 'CT' in check_name or 'Container' in check_name:
+                            report['cts_failed'].append({
+                                'name': check_name,
+                                'reason': check_reason
+                            })
+                        else:
+                            report['vms_failed'].append({
+                                'name': check_name,
+                                'reason': check_reason
+                            })
+            
+            # Collect all health issues for summary
+            # One entry per category whose status is CRITICAL or WARNING.
+            for cat_name, cat_data in health_data.get('categories', {}).items():
+                if cat_data.get('status') in ['CRITICAL', 'WARNING']:
+                    report['health_issues'].append({
+                        'category': cat_name,
+                        'status': cat_data.get('status'),
+                        'reason': cat_data.get('reason', '')
+                    })
+    except Exception as e:
+        # Best effort: record the failure and keep whatever was collected.
+        report['_startup_errors'].append(f"Error getting health data: {e}")
+    
+    # Get journal errors during startup (for AI enrichment)
+    try:
+        # NOTE(review): reads the private _startup_time field directly
+        # rather than through a locked accessor - confirm this is safe.
+        # NOTE(review): despite the name, this is the recorded startup time
+        # of _state, which may differ from the kernel boot time - confirm.
+        boot_time = int(_state._startup_time)
+        # '-p err' filters by priority, '--since @<epoch>' takes a Unix
+        # timestamp, '-n 50' caps the number of lines returned.
+        result = subprocess.run(
+            ['journalctl', '-p', 'err', '--since', f'@{boot_time}', '--no-pager', '-n', '50'],
+            capture_output=True,
+            text=True,
+            timeout=10
+        )
+        # A non-zero exit or empty output leaves '_journal_context' as ''.
+        if result.returncode == 0 and result.stdout.strip():
+            report['_journal_context'] = result.stdout.strip()
+    except Exception as e:
+        # Covers a missing journalctl binary and the timeout, among others.
+        report['_startup_errors'].append(f"Error getting journal: {e}")
+    
+    return report
+
+
+def format_startup_summary(report: dict) -> str:
+    """
+    Format a human-readable startup summary from report data.
+    
+    The first line is a header ("System startup completed" followed by
+    "All systems operational.", or an issue count). After it come, when
+    applicable: one line for started VMs/CTs, one line per failed VM/CT,
+    one line for unavailable storage and one line for failed services.
+    
+    The 'health_issues', '_journal_context' and '_startup_errors' entries
+    of the report are not rendered here.
+    
+    Args:
+        report: Dictionary from collect_startup_report(). Missing
+            top-level keys fall back to empty/OK defaults, but every
+            started entry must carry 'name' and 'vmid', and every failed,
+            storage or service entry must carry 'name'.
+    
+    Returns:
+        Formatted summary string, lines joined with '\\n'
+    
+    Raises:
+        KeyError: If a list entry lacks one of the keys named above.
+    """
+    lines = []
+    
+    # Count totals
+    vms_ok = len(report.get('vms_started', []))
+    cts_ok = len(report.get('cts_started', []))
+    vms_fail = len(report.get('vms_failed', []))
+    cts_fail = len(report.get('cts_failed', []))
+    
+    total_ok = vms_ok + cts_ok
+    total_fail = vms_fail + cts_fail
+    
+    # Determine overall status
+    # Any failed guest, a cleared services/storage flag, or an overall
+    # health status of CRITICAL/WARNING marks the startup as having issues.
+    has_issues = (
+        total_fail > 0 or
+        not report.get('services_ok', True) or
+        not report.get('storage_ok', True) or
+        report.get('health_status') in ['CRITICAL', 'WARNING']
+    )
+    
+    # Header
+    if has_issues:
+        # NOTE(review): the count only sums failed guests, failed services
+        # and unavailable storage entries. When has_issues is triggered by
+        # health_status alone, or by a cleared flag with an empty list, the
+        # header reads "0 issue(s) detected".
+        issue_count = total_fail + len(report.get('services_failed', [])) + len(report.get('storage_unavailable', []))
+        lines.append(f"System startup - {issue_count} issue(s) detected")
+    else:
+        lines.append("System startup completed")
+        lines.append("All systems operational.")
+    
+    # VMs/CTs started
+    if total_ok > 0:
+        # Build the count phrase, e.g. "2 VMs and 1 CT".
+        parts = []
+        if vms_ok > 0:
+            parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
+        if cts_ok > 0:
+            parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")
+        
+        # List names
+        # VMs first, then CTs, each rendered as "name (vmid)".
+        names = []
+        for vm in report.get('vms_started', []):
+            names.append(f"{vm['name']} ({vm['vmid']})")
+        for ct in report.get('cts_started', []):
+            names.append(f"{ct['name']} ({ct['vmid']})")
+        
+        # Up to five guests are listed in full; beyond that only the first
+        # three are shown, followed by the number of remaining ones.
+        line = f"{' and '.join(parts)} started"
+        if names and len(names) <= 5:
+            line += f": {', '.join(names)}"
+        elif names:
+            line += f": {', '.join(names[:3])}... (+{len(names)-3} more)"
+        lines.append(line)
+    
+    # Failed VMs/CTs
+    # One line per failed guest; a missing 'reason' key renders as
+    # 'unknown error'.
+    if total_fail > 0:
+        for vm in report.get('vms_failed', []):
+            lines.append(f"VM failed: {vm['name']} - {vm.get('reason', 'unknown error')}")
+        for ct in report.get('cts_failed', []):
+            lines.append(f"CT failed: {ct['name']} - {ct.get('reason', 'unknown error')}")
+    
+    # Storage issues
+    # Nothing is printed when the flag is cleared but the list is empty.
+    # NOTE(review): only the first three names are shown, with no marker
+    # that the list was truncated.
+    if not report.get('storage_ok', True):
+        unavailable = report.get('storage_unavailable', [])
+        if unavailable:
+            names = [s['name'] for s in unavailable]
+            lines.append(f"Storage: {len(unavailable)} unavailable ({', '.join(names[:3])})")
+    
+    # Service issues
+    # Same rendering rules as the storage line above.
+    if not report.get('services_ok', True):
+        failed = report.get('services_failed', [])
+        if failed:
+            names = [s['name'] for s in failed]
+            lines.append(f"Services: {len(failed)} failed ({', '.join(names[:3])})")
+    
+    return '\n'.join(lines)
+
+
 # ─── For backwards compatibility ─────────────────────────────────────────────

 # Expose constants for external use