Update notification service

This commit is contained in:
MacRimi
2026-03-26 20:04:53 +01:00
parent 839a20df97
commit 7c5e7208b9
4 changed files with 530 additions and 276 deletions

View File

@@ -1093,7 +1093,7 @@ class HealthPersistence:
conn.commit()
conn.close()
# ─── System Capabilities Cache ──────────────────────────────
# ─── System Capabilities Cache ──────────────────────────────
def get_capability(self, cap_key: str) -> Optional[str]:
"""

View File

@@ -2144,60 +2144,98 @@ class PollingCollector:
self._first_poll_done = True
def _check_startup_aggregation(self):
"""Check if startup period ended and emit aggregated VM/CT start message.
"""Check if startup period ended and emit comprehensive startup report.
During the startup grace period, TaskWatcher collects VM/CT starts instead
of emitting individual notifications. Once the period ends, this method
emits a single aggregated "System startup" notification.
At the end of the health grace period, collects:
- VMs/CTs that started successfully
- VMs/CTs that failed to start
- Service status
- Storage status
- Journal errors (for AI enrichment)
Emits a single "system_startup" notification with full report data.
"""
# Only check once startup period is over
if _shared_state.is_startup_period():
# Wait until health grace period is over (5 min) for complete picture
if startup_grace.is_startup_health_grace():
return
# Only emit once
if _shared_state.was_startup_aggregated():
if startup_grace.was_startup_aggregated():
return
# Get all collected startup VMs/CTs
startup_items = _shared_state.get_and_clear_startup_vms()
if not startup_items:
return
# Collect comprehensive startup report
report = startup_grace.collect_startup_report()
# Count VMs and CTs
vms = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'vm']
cts = [(vmid, name) for vmid, name, vtype in startup_items if vtype == 'ct']
# Generate human-readable summary
summary = startup_grace.format_startup_summary(report)
vm_count = len(vms)
ct_count = len(cts)
total = vm_count + ct_count
# Count totals
vms_ok = len(report.get('vms_started', []))
cts_ok = len(report.get('cts_started', []))
vms_fail = len(report.get('vms_failed', []))
cts_fail = len(report.get('cts_failed', []))
total_ok = vms_ok + cts_ok
total_fail = vms_fail + cts_fail
# Build entity list (max 10 items for readability)
# Build entity list for backwards compatibility
entity_names = []
for vmid, name in (vms + cts)[:10]:
entity_names.append(f'{name} ({vmid})')
if total > 10:
entity_names.append(f'...and {total - 10} more')
for vm in report.get('vms_started', [])[:5]:
entity_names.append(f"{vm['name']} ({vm['vmid']})")
for ct in report.get('cts_started', [])[:5]:
entity_names.append(f"{ct['name']} ({ct['vmid']})")
if total_ok > 10:
entity_names.append(f"...and {total_ok - 10} more")
# Build summary text
parts = []
if vm_count:
parts.append(f'{vm_count} VM{"s" if vm_count != 1 else ""}')
if ct_count:
parts.append(f'{ct_count} CT{"s" if ct_count != 1 else ""}')
summary = ' and '.join(parts) + ' started'
# Determine severity based on issues
has_issues = (
total_fail > 0 or
not report.get('services_ok', True) or
not report.get('storage_ok', True) or
report.get('health_status') in ['CRITICAL', 'WARNING']
)
severity = 'WARNING' if has_issues else 'INFO'
# Build notification data
data = {
'hostname': self._hostname,
'summary': summary,
'vm_count': vm_count,
'ct_count': ct_count,
'total_count': total,
# VM/CT counts (backwards compatible)
'vm_count': vms_ok,
'ct_count': cts_ok,
'total_count': total_ok,
'entity_list': ', '.join(entity_names),
'reason': f'System startup completed: {summary}',
# New: failure counts
'vms_failed_count': vms_fail,
'cts_failed_count': cts_fail,
'total_failed': total_fail,
# New: detailed lists
'vms_started': report.get('vms_started', []),
'cts_started': report.get('cts_started', []),
'vms_failed': report.get('vms_failed', []),
'cts_failed': report.get('cts_failed', []),
# New: system status
'services_ok': report.get('services_ok', True),
'services_failed': report.get('services_failed', []),
'storage_ok': report.get('storage_ok', True),
'storage_unavailable': report.get('storage_unavailable', []),
'health_status': report.get('health_status', 'UNKNOWN'),
'health_issues': report.get('health_issues', []),
# For AI enrichment
'_journal_context': report.get('_journal_context', ''),
# Metadata
'startup_duration_seconds': report.get('startup_duration_seconds', 0),
'has_issues': has_issues,
'reason': summary.split('\n')[0], # First line as reason
}
self._queue.put(NotificationEvent(
'system_startup', 'INFO', data, source='polling',
'system_startup', severity, data, source='polling',
entity='node', entity_id='',
))
@@ -2500,7 +2538,7 @@ class PollingCollector:
except Exception as e:
print(f"[PollingCollector] AI model check failed: {e}")
# ── Persistence helpers ───────────────────────────────────
# ── Persistence helpers ───────────────────────────────────
def _load_last_notified(self):
"""Load per-error notification timestamps from DB on startup."""

View File

@@ -17,7 +17,7 @@ import socket
import time
import urllib.request
import urllib.error
from typing import Dict, Any, Optional, List
from typing import Dict, Any, Optional, List, Tuple
# ─── vzdump message parser ───────────────────────────────────────
@@ -314,6 +314,90 @@ def _format_vzdump_body(parsed: Dict[str, Any], is_success: bool) -> str:
return '\n'.join(parts)
def _format_system_startup(data: Dict[str, Any]) -> Tuple[str, str]:
"""
Format comprehensive system startup report.
Returns (title, body) tuple for the notification.
Handles both simple startups (all OK) and those with issues.
"""
hostname = data.get('hostname', 'unknown')
has_issues = data.get('has_issues', False)
# Build title
if has_issues:
total_issues = (
data.get('total_failed', 0) +
len(data.get('services_failed', [])) +
len(data.get('storage_unavailable', []))
)
title = f"{hostname}: System startup - {total_issues} issue(s) detected"
else:
title = f"{hostname}: System startup completed"
# Build body
parts = []
# Overall status
if not has_issues:
parts.append("All systems operational.")
# VMs/CTs started
vms_ok = len(data.get('vms_started', []))
cts_ok = len(data.get('cts_started', []))
if vms_ok or cts_ok:
count_parts = []
if vms_ok:
count_parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
if cts_ok:
count_parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")
# List names (up to 5)
names = []
for vm in data.get('vms_started', [])[:3]:
names.append(f"{vm['name']} ({vm['vmid']})")
for ct in data.get('cts_started', [])[:3]:
names.append(f"{ct['name']} ({ct['vmid']})")
line = f"\u2705 {' and '.join(count_parts)} started"
if names:
if len(names) <= 5:
line += f": {', '.join(names)}"
else:
line += f": {', '.join(names[:5])}..."
parts.append(line)
# Failed VMs/CTs
for vm in data.get('vms_failed', []):
reason = vm.get('reason', 'unknown error')
parts.append(f"\u274C VM failed: {vm['name']} - {reason}")
for ct in data.get('cts_failed', []):
reason = ct.get('reason', 'unknown error')
parts.append(f"\u274C CT failed: {ct['name']} - {reason}")
# Storage issues
storage_unavailable = data.get('storage_unavailable', [])
if storage_unavailable:
names = [s['name'] for s in storage_unavailable[:3]]
parts.append(f"\u26A0\uFE0F Storage: {len(storage_unavailable)} unavailable ({', '.join(names)})")
# Service issues
services_failed = data.get('services_failed', [])
if services_failed:
names = [s['name'] for s in services_failed[:3]]
parts.append(f"\u26A0\uFE0F Services: {len(services_failed)} failed ({', '.join(names)})")
# Startup duration
duration = data.get('startup_duration_seconds', 0)
if duration:
minutes = int(duration // 60)
parts.append(f"\u23F1\uFE0F Startup completed in {minutes} min")
body = '\n'.join(parts)
return title, body
# ─── Severity Icons ──────────────────────────────────────────────
SEVERITY_ICONS = {
@@ -645,11 +729,12 @@ TEMPLATES = {
# ── Services events ──
'system_startup': {
'title': '{hostname}: System startup — {summary}',
'body': 'System startup completed.\n{summary}\n\nGuests: {entity_list}',
'label': 'System startup',
'title': '{hostname}: {reason}',
'body': '{summary}',
'label': 'System startup report',
'group': 'services',
'default_enabled': True,
'formatter': '_format_system_startup',
},
'system_shutdown': {
'title': '{hostname}: System shutting down',
@@ -959,7 +1044,19 @@ def render_template(event_type: str, data: Dict[str, Any]) -> Dict[str, Any]:
pve_message = data.get('pve_message', '')
pve_title = data.get('pve_title', '')
if event_type in ('backup_complete', 'backup_fail') and pve_message:
# Check for custom formatter function
formatter_name = template.get('formatter')
if formatter_name and formatter_name in globals():
formatter_func = globals()[formatter_name]
try:
title, body_text = formatter_func(data)
except Exception:
# Fallback to standard formatting if formatter fails
try:
body_text = template['body'].format(**variables)
except (KeyError, ValueError):
body_text = template['body']
elif event_type in ('backup_complete', 'backup_fail') and pve_message:
parsed = _parse_vzdump_message(pve_message)
if parsed:
is_success = (event_type == 'backup_complete')
@@ -1288,134 +1385,165 @@ AI_DETAIL_TOKENS = {
# System prompt template - informative, no recommendations
AI_SYSTEM_PROMPT = """You are a system notification formatter for ProxMenux Monitor, a Proxmox VE monitoring tool.
Your task is to translate and reformat incoming server alert messages into {language}.
Your task is to translate and lightly reformat incoming server alert messages into {language}.
═══ CORE ROLE ═══
You are a formatter, not an analyst.
Translate, clean, and present the message clearly.
Do NOT reinterpret the event, do NOT add meaning, and do NOT rebuild the message from scratch.
═══ ABSOLUTE RULES ═══
1. Translate BOTH title and body to {language}. Every word, label, and unit must be in {language}.
2. NO markdown: no **bold**, no *italic*, no `code`, no headers (#), no bullet lists (- or *)
3. Plain text only — the output is sent to chat apps and email which handle their own formatting
4. Tone: factual, concise, technical. No greetings, no closings, no apologies
5. DO NOT add recommendations, action items, or suggestions ("you should…", "consider…")
6. Present ONLY the facts already in the input — do not invent or assume information
7. OUTPUT ONLY THE FINAL RESULT — never include both original and processed versions.
Do NOT append "Original message:", "Original:", "Source:", or any before/after comparison.
Return ONLY the single, final formatted message in {language}.
8. PLAIN NARRATIVE LINES — if a line in the input is a complete sentence (not a "Label: value"
pair), translate it as-is. Never prepend "Message:", "Note:", or any other label to a sentence.
9. Detail level to apply: {detail_level}
- brief → 2-3 lines, essential data only (status + key metric)
- standard → short paragraph covering who/what/where and the key value
- detailed → full technical breakdown of all available fields
10. Keep the "hostname: " prefix in the title. Translate only the descriptive part.
Example: "pve01: Updates available""pve01: Actualizaciones disponibles"
11. EMPTY LIST VALUES — if a list field is empty, "none", or "0":
Always write the translated word for "none" on the line after the label, never leave it blank.
Example: 🗂️ Important packages:\\n• none
Example (Spanish): 🗂️ Paquetes importantes:\\n• ninguno
Example (Français): 🗂️ Paquets importants:\\n• aucun
12. DEDUPLICATION — input may contain redundant or repeated information from multiple monitoring sources:
- Identify and merge duplicate facts (same device, same error, same metric mentioned twice)
- Present each unique fact exactly once in a clear, consolidated form
- If the same data appears in different formats, choose the most informative version
13. PROXMOX CONTEXT — silently translate Proxmox technical references into plain language.
Never explain what the term means — just use the human-readable equivalent directly.
1. Translate BOTH title and body into {language}.
Service / process name mapping (replace the raw name with the friendly form):
- "pve-container@XXXX.service""Container CT XXXX"
- "qemu-server@XXXX.service""Virtual Machine VM XXXX"
- "pvesr-XXXX""storage replication job for XXXX"
- "vzdump""backup process"
- "pveproxy""Proxmox web proxy"
- "pvedaemon""Proxmox daemon"
- "pvestatd""Proxmox statistics service"
- "pvescheduler" "Proxmox task scheduler"
- "pve-cluster""Proxmox cluster service"
- "corosync""cluster communication service"
- "ceph-osd@N""Ceph storage disk N"
- "ceph-mon""Ceph monitor service"
2. Translate human-readable text only.
Do NOT translate:
- hostnames
- device paths (/dev/sdX, /dev/nvmeXnX)
- filesystem paths
- IDs, VMIDs, CTIDs, UUIDs
- timestamps, dates, archive names, PBS paths
- version numbers
- technical units (B, KB, MB, GB, TB, KiB, MiB, GiB, TiB, %, ms, s)
systemd message patterns (rewrite the whole phrase, not just the service name):
- "systemd[1]: pve-container@9000.service: Failed"
"Container CT 9000 service failed"
- "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'"
"Virtual Machine VM 100 failed to start"
- "systemd[1]: Started pve-container@9000.service"
"Container CT 9000 started"
3. Plain text only.
No markdown: no **bold**, no *italic*, no `code`, no headers (#), no markdown lists (- or *).
The bullet character "" is allowed only where explicitly required.
ATA / SMART / kernel error patterns (replace raw kernel log with plain description):
- "ata8.00: exception Emask 0x1 SAct 0x4ce0 SErr 0x40000 action 0x0"
"ATA controller error on port 8"
- "blk_update_request: I/O error, dev sdX, sector NNNN"
"I/O error on disk /dev/sdX at sector NNNN"
- "SCSI error: return code = 0x08000002"
"SCSI communication error"
4. Tone: factual, concise, technical.
No greetings, no closings, no apologies, no conversational filler.
5. Do NOT add recommendations, action items, remediation, or suggestions.
6. Present ONLY the facts already present in the input.
Do NOT invent, assume, explain, soften, or escalate anything.
7. Do NOT change severity or status meaning.
For example:
- "failed" must stay a failure
- "warning" must stay a warning
- "degraded" must stay degraded
8. Preserve structure whenever possible.
Keep the same fields, lines, and data already present in the input.
Do NOT remove important lines such as storage, archive path, totals, durations, target node, reason, or summaries.
9. Reordering must be minimal.
Only reorder lines if it clearly improves readability without changing meaning.
10. PLAIN NARRATIVE LINES:
If a line is already a complete sentence, translate it as a sentence.
Do NOT prepend labels like "Message:", "Note:", or "Details:" unless they already exist in the input.
11. Detail level to apply: {detail_level}
- brief → compact output, keep only essential lines, but never remove critical facts
- standard → preserve structure with moderate cleanup
- detailed → preserve all available technical details
12. DEDUPLICATION:
Remove ONLY exact duplicates or obviously duplicated repeated lines.
Do NOT merge distinct facts just because they look similar.
Do NOT summarize multiple separate events into one.
13. Keep the "hostname: " prefix in the title.
Translate only the descriptive part.
Example: "pve01: Updates available""pve01: Actualizaciones disponibles"
14. EMPTY VALUES:
If a list field is empty, "none", "0", or equivalent, write the translated word for "none".
Never leave a declared field blank.
15. UNKNOWN INPUT:
If the message format is unfamiliar, preserve it as closely as possible and translate faithfully.
Do NOT force it into another template.
═══ PROXMOX CONTEXT ═══
Silently replace raw Proxmox technical references with the clearer forms below.
Do NOT explain them. Just use the friendly equivalent directly.
Service / process mappings:
- "pve-container@XXXX.service""Container CT XXXX"
- "qemu-server@XXXX.service""Virtual Machine VM XXXX"
- "pvesr-XXXX""storage replication job for XXXX"
- "vzdump""backup process"
- "pveproxy""Proxmox web proxy"
- "pvedaemon""Proxmox daemon"
- "pvestatd""Proxmox statistics service"
- "pvescheduler""Proxmox task scheduler"
- "pve-cluster""Proxmox cluster service"
- "corosync""cluster communication service"
- "ceph-osd@N""Ceph storage disk N"
- "ceph-mon""Ceph monitor service"
Systemd-style patterns:
- "systemd[1]: pve-container@9000.service: Failed"
"Container CT 9000 service failed"
- "systemd[1]: qemu-server@100.service: Failed with result 'exit-code'"
"Virtual Machine VM 100 failed to start"
- "systemd[1]: Started pve-container@9000.service"
"Container CT 9000 started"
Kernel / storage patterns:
- "ata8.00: exception Emask ..."
"ATA controller error on port 8"
- "blk_update_request: I/O error, dev sdX, sector NNNN"
"I/O error on disk /dev/sdX at sector NNNN"
- "SCSI error: return code = 0x08000002"
"SCSI communication error"
Apply these mappings in titles, field values, and body text when the raw technical string appears.
Apply these mappings everywhere: in the body narrative, in field values, and when
the raw technical string appears inside a longer sentence.
{emoji_instructions}
═══ MESSAGE TYPES — FORMAT RULES ═══
═══ MESSAGE-TYPE GUIDANCE ═══
BACKUP (backup_complete / backup_fail / backup_start):
Input contains: VM/CT names, IDs, size, duration, storage location, status per VM
Output body: first line is plain text (no emoji) describing the event briefly.
Then list each VM/CT with its fields. End with a summary line.
PARTIAL FAILURE RULE: if some VMs succeeded and at least one failed, use a combined title
like "Backup partially failed" / "Copia de seguridad parcialmente fallida" — never say
"backup failed" when there are also successful VMs in the same job.
NEVER omit the storage/archive line or the summary line — always include them even for long jobs.
- Preserve per-VM / per-CT detail if present.
- Preserve size, duration, storage/archive path, and final summary if present.
- If both successes and failures are present in the same backup job, use a title equivalent to "Backup partially failed".
- Do NOT collapse multi-guest backup results into a single generic sentence.
UPDATES (update_summary):
- Each count on its own line with its label.
- Package list uses "" (bullet + space) per package, NOT the 🗂️ emoji on each line.
- The 🗂️ emoji goes only on the "Important packages:" header line.
- NEVER add a redundant summary line repeating the total count.
PVE UPDATE (pve_update):
- First line: plain sentence announcing the new version (no emoji on this line).
- Blank line after intro.
- Current version: 🔹 prefix | New version: 🟢 prefix
- Blank line before packages block.
- Packages header: 🗂️ | Package lines: 📌 prefix with version arrow v{{old}} ➜ v{{new}}
- Keep each count on its own line.
- Keep the important packages block if present.
- Use "" for package items.
- Do NOT add a redundant summary line repeating totals already shown.
DISK / SMART ERRORS (disk_io_error / storage_unavailable):
Input contains: device name, error type, SMART values or I/O error codes
Output body: device, then the specific error or failing attribute
DEDUPLICATION: Input may contain repeated or similar information from multiple sources.
If you see the same device, error count, or technical details mentioned multiple times,
consolidate them into a single, clear statement. Never repeat the same information twice.
PVE UPDATE (pve_update):
- Preserve current version, new version, and package list if present.
- Keep the announcement concise.
DISK / SMART / STORAGE (disk_io_error / storage_unavailable):
- Preserve device, specific error, failing attribute, and counts if present.
- Do NOT repeat the same disk fact twice.
RESOURCES (cpu_high / ram_high / temp_high / load_high):
Input contains: current value, threshold, core count
Output: current value vs threshold, context if available
- Preserve current value, threshold, and context if present.
SECURITY (auth_fail / ip_block):
Input contains: source IP, user, service, jail, failure count
Output: list each field on its own line
- Keep source IP, user, service, jail, and failure count on separate clear lines if present.
VM/CT LIFECYCLE (vm_start, vm_stop, vm_fail, ct_*, migration_*, replication_*):
Input contains: VM name, ID, target node (migrations), reason (failures)
Output: one or two lines confirming the event with key facts
VM / CT LIFECYCLE (vm_*, ct_*, migration_*, replication_*):
- Keep name, ID, state, reason, and target node if present.
- Keep lifecycle messages compact unless detail_level is detailed.
CLUSTER (split_brain / node_disconnect / node_reconnect):
Input: node name, quorum status
Output: state change + quorum value
CLUSTER / HEALTH:
- Preserve node name, quorum, category, severity, duration, and reason if present.
HEALTH (new_error / error_resolved / health_persistent / health_degraded):
Input: category, severity, duration, reason
Output: what changed, in which category, for how long (if resolved)
CRITICAL:
- [TITLE] on its own line, title text on the very next line — no blank line between them
- [BODY] on its own line, body text starting on the very next line — no blank line between them
- Do NOT write "Title:", "Body:", or any label substituting the markers
- Do NOT include the literal words TITLE or BODY anywhere in the translated content
═══ OUTPUT FORMAT (follow exactly — parsers rely on these markers) ═══
═══ OUTPUT FORMAT ═══
[TITLE]
translated title here
[BODY]
translated body here"""
translated body here
CRITICAL OUTPUT RULES:
- Write [TITLE] on its own line
- Write the title on the next line
- Write [BODY] on its own line
- Write the body starting on the next line
- Do NOT replace these markers with "Title:" or "Body:"
- Do NOT include any extra text before or after the formatted result
- Do NOT add blank lines between [TITLE] and the title
- Do NOT add blank lines between [BODY] and the first body line"""
# Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover)
AI_EMOJI_INSTRUCTIONS = """
@@ -1485,135 +1613,10 @@ A blank line must be completely empty — no emoji, no spaces.
🟢 new version (pve_update)
BLANK LINES FOR READABILITY — insert ONE blank line between logical sections within the body.
Blank lines go BETWEEN groups, not before the first line or after the last line.
A blank line must be completely empty — no emoji, no spaces.
When to add a blank line:
- Updates: after the last count line, before the packages block
- Backup multi-VM: one blank line between each VM entry; one blank line before the summary line
- Disk/SMART errors: after the device line, before the error description lines
- VM events with a reason: after the main status line, before Reason / Node / Target lines
- Health events: after the category/status line, before duration or detail lines
EXAMPLE — CT shutdown:
[TITLE]
🔽 amd: CT alpine (101) shut down
[BODY]
🏷️ Container alpine (ID: 101)
✔️ Cleanly shut down
EXAMPLE — VM started:
[TITLE]
🚀 pve01: VM arch-linux (100) started
[BODY]
🏷️ Virtual machine arch-linux (ID: 100)
✔️ Now running
EXAMPLE — migration complete:
[TITLE]
🚚 amd: Migration complete — web01 (100)
[BODY]
🏷️ Virtual machine web01 (ID: 100)
✔️ Successfully migrated
🎯 Target: node02
EXAMPLE — updates message (no important packages):
[TITLE]
📦 amd: Updates available
[BODY]
📦 Total updates: 24
🔒 Security updates: 6
🔄 Proxmox updates: 0
⚙️ Kernel updates: 0
🗂️ Important packages:
• none
EXAMPLE — updates message (with important packages):
[TITLE]
📦 amd: Updates available
[BODY]
📦 Total updates: 90
🔒 Security updates: 6
🔄 Proxmox updates: 14
⚙️ Kernel updates: 1
🗂️ Important packages:
• pve-manager (9.1.4 -> 9.1.6)
• qemu-server (9.1.3 -> 9.1.4)
• pve-container (6.0.18 -> 6.1.2)
EXAMPLE — pve_update (new Proxmox VE version):
[TITLE]
🆕 pve01: Proxmox VE 9.1.6 available
[BODY]
🚀 A new Proxmox VE release is available.
🔹 Current: 9.1.4
🟢 New: 9.1.6
🗂️ Important packages:
📌 pve-manager (v9.1.4 ➜ v9.1.6)
EXAMPLE — backup complete with multiple VMs:
[TITLE]
💾✅ pve01: Backup complete
[BODY]
Backup job finished on storage local-bak.
🏷️ VM web01 (ID: 100)
✔️ Status: ok
💽 Size: 12.3 GiB
⏱️ Duration: 00:04:21
🗄️ Storage: vm/100/2026-03-17T22:00:08Z
🏷️ CT db (ID: 101)
✔️ Status: ok
💽 Size: 4.1 GiB
⏱️ Duration: 00:01:10
🗄️ Storage: ct/101/2026-03-17T22:04:29Z
📊 Total: 2 backups | 💾 16.4 GiB | ⏱️ 00:05:31
EXAMPLE — backup partially failed (some ok, some failed):
[TITLE]
💾❌ pve01: Backup partially failed
[BODY]
Backup job finished with errors on storage PBS2.
🏷️ VM web01 (ID: 100)
✔️ Status: ok
💽 Size: 12.3 GiB
⏱️ Duration: 00:04:21
🗄️ Storage: vm/100/2026-03-17T22:00:08Z
🏷️ VM broken (ID: 102)
❌ Status: error
💽 Size: 0 B
⏱️ Duration: 00:00:37
📊 Total: 2 backups | ❌ 1 failed | 💾 12.3 GiB | ⏱️ 00:04:58
EXAMPLE — disk I/O health warning:
[TITLE]
💥 amd: Health warning — Disk I/O errors
[BODY]
💿 Device: /dev/sda
⚠️ 1 sector currently unreadable (pending)
📝 Disk reports sectors in pending reallocation state
EXAMPLE — health degraded (multiple issues):
[TITLE]
⚠️ amd: 2 health checks degraded
[BODY]
💥 Disk I/O error on /dev/sda: 1 sector currently unreadable (pending)
🏷️ Container CT 9005: ❌ failed to start
🏷️ Container CT 9004: ❌ failed to start
🏷️ Container CT 9002: ❌ failed to start"""
BLANK LINES:
Insert one blank line only between logical sections inside the body.
Do not add a blank line before the first body line or after the last one.
"""
# No emoji instructions for email/plain text channels

View File

@@ -120,7 +120,7 @@ class _StartupGraceState:
with self._lock:
return time.time() - self._startup_time
# ─── Shutdown Tracking ──────────────────────────────────────────────────
# ─── Shutdown Tracking ──────────────────────────────────────────────────
def mark_shutdown(self):
"""
@@ -231,6 +231,219 @@ def was_startup_aggregated() -> bool:
return _state.was_startup_aggregated()
# ─── Startup Report Collection ───────────────────────────────────────────────
def _failing_health_checks(category: dict) -> list:
    """Return ``{'name', 'reason'}`` entries for the failing checks of one health category."""
    # NOTE(review): assumes category['checks'] is a list of dicts — confirm
    # against health_monitor; any other shape raises and is handled by the caller.
    failing = []
    for check in category.get('checks', []):
        if check.get('status') not in ('CRITICAL', 'WARNING', 'error'):
            continue
        failing.append({
            'name': check.get('name') or 'unknown',
            # `or ''` keeps the reason a string even when the key holds None,
            # so later `.lower()` calls cannot raise.
            'reason': check.get('reason', check.get('message', '')) or '',
        })
    return failing


def collect_startup_report() -> dict:
    """Collect the data for the comprehensive startup report.

    Gathers, on a best-effort basis:
    - VMs/CTs recorded as started (drained via ``get_and_clear_startup_vms()``)
    - VMs/CTs whose health check reports a start failure
    - Failing service and storage checks
    - Overall health status and the categories in WARNING/CRITICAL
    - Up to 50 error-level journal lines since startup (for AI enrichment)

    Failures while reading health data or the journal do not raise; they are
    appended to ``report['_startup_errors']`` and the affected fields keep
    their defaults.

    Returns:
        Dictionary with the startup report data.
    """
    import subprocess

    report = {
        # VMs/CTs
        'vms_started': [],
        'cts_started': [],
        'vms_failed': [],
        'cts_failed': [],
        # System status
        'services_ok': True,
        'services_failed': [],
        'storage_ok': True,
        'storage_unavailable': [],
        # Health summary
        'health_status': 'OK',
        'health_issues': [],
        # For AI enrichment
        '_journal_context': '',
        '_startup_errors': [],
        # Metadata
        'startup_duration_seconds': get_startup_elapsed(),
        'timestamp': int(time.time()),
    }

    # Guests that started during boot; anything that is not a 'vm' counts as a CT.
    for vmid, vmname, vm_type in get_and_clear_startup_vms():
        bucket = 'vms_started' if vm_type == 'vm' else 'cts_started'
        report[bucket].append({'vmid': vmid, 'name': vmname})

    # Health status from health_monitor (imported lazily, best effort).
    try:
        import health_monitor
        health_data = health_monitor.get_detailed_status()
        if health_data:
            report['health_status'] = health_data.get('overall_status', 'UNKNOWN')
            categories = health_data.get('categories', {})

            # Storage: only inspected when the category itself is degraded.
            storage_cat = categories.get('storage', {})
            if storage_cat.get('status') in ('CRITICAL', 'WARNING'):
                report['storage_ok'] = False
                report['storage_unavailable'].extend(_failing_health_checks(storage_cat))

            # Services: same rule as storage.
            services_cat = categories.get('services', {})
            if services_cat.get('status') in ('CRITICAL', 'WARNING'):
                report['services_ok'] = False
                report['services_failed'].extend(_failing_health_checks(services_cat))

            # Guests: a failing check counts only if its reason mentions a
            # start failure (Spanish or English wording).
            for failed in _failing_health_checks(categories.get('vms', {})):
                reason_lc = failed['reason'].lower()
                if 'error al iniciar' not in reason_lc and 'failed to start' not in reason_lc:
                    continue
                # NOTE(review): substring match — a VM whose check name merely
                # contains "CT" is classified as a container; confirm naming.
                is_ct = 'CT' in failed['name'] or 'Container' in failed['name']
                report['cts_failed' if is_ct else 'vms_failed'].append(failed)

            # Every degraded category, for the summary.
            for cat_name, cat_data in categories.items():
                if cat_data.get('status') in ('CRITICAL', 'WARNING'):
                    report['health_issues'].append({
                        'category': cat_name,
                        'status': cat_data.get('status'),
                        'reason': cat_data.get('reason', ''),
                    })
    except Exception as e:
        report['_startup_errors'].append(f"Error getting health data: {e}")

    # Error-level journal entries since startup (for AI enrichment).
    try:
        boot_time = int(_state._startup_time)
        result = subprocess.run(
            # "@<epoch>" is the journalctl/systemd.time form for a UNIX timestamp.
            ['journalctl', '-p', 'err', '--since', f'@{boot_time}', '--no-pager', '-n', '50'],
            capture_output=True,
            text=True,
            timeout=10
        )
        if result.returncode == 0 and result.stdout.strip():
            report['_journal_context'] = result.stdout.strip()
    except Exception as e:
        report['_startup_errors'].append(f"Error getting journal: {e}")

    return report
def format_startup_summary(report: dict) -> str:
    """Format a human-readable startup summary from report data.

    The first line is the headline; callers may use it on its own.

    Args:
        report: Dictionary from collect_startup_report(). All keys are
            optional; missing ones are treated as "nothing to report".

    Returns:
        Newline-joined summary string.
    """
    # `or []` also covers keys that are present but hold None.
    vms_started = report.get('vms_started') or []
    cts_started = report.get('cts_started') or []
    vms_failed = report.get('vms_failed') or []
    cts_failed = report.get('cts_failed') or []
    services_failed = report.get('services_failed') or []
    storage_unavailable = report.get('storage_unavailable') or []
    health_issues = report.get('health_issues') or []

    vms_ok = len(vms_started)
    cts_ok = len(cts_started)
    total_ok = vms_ok + cts_ok
    total_fail = len(vms_failed) + len(cts_failed)

    services_ok = report.get('services_ok', True)
    storage_ok = report.get('storage_ok', True)

    # Determine overall status
    has_issues = (
        total_fail > 0
        or not services_ok
        or not storage_ok
        or report.get('health_status') in ('CRITICAL', 'WARNING')
    )

    lines = []

    # Header
    if has_issues:
        issue_count = total_fail + len(services_failed) + len(storage_unavailable)
        # A degraded health status alone also sets has_issues; fall back to
        # the health issue count so the headline never reads "0 issue(s)".
        if not issue_count:
            issue_count = len(health_issues)
        if issue_count:
            lines.append(f"System startup - {issue_count} issue(s) detected")
        else:
            lines.append("System startup - issues detected")
    else:
        lines.append("System startup completed")
        lines.append("All systems operational.")

    # VMs/CTs started
    if total_ok > 0:
        parts = []
        if vms_ok > 0:
            parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
        if cts_ok > 0:
            parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")

        names = [
            f"{guest.get('name', 'unknown')} ({guest.get('vmid', '?')})"
            for guest in list(vms_started) + list(cts_started)
        ]
        line = f"{' and '.join(parts)} started"
        if len(names) <= 5:
            line += f": {', '.join(names)}"
        else:
            # Long lists: show the first three and count the rest.
            line += f": {', '.join(names[:3])}... (+{len(names) - 3} more)"
        lines.append(line)

    issue_lines = []

    # Failed VMs/CTs
    for vm in vms_failed:
        issue_lines.append(
            f"VM failed: {vm.get('name', 'unknown')} - {vm.get('reason') or 'unknown error'}"
        )
    for ct in cts_failed:
        issue_lines.append(
            f"CT failed: {ct.get('name', 'unknown')} - {ct.get('reason') or 'unknown error'}"
        )

    # Storage issues
    if not storage_ok and storage_unavailable:
        names = [s.get('name', 'unknown') for s in storage_unavailable]
        issue_lines.append(
            f"Storage: {len(storage_unavailable)} unavailable ({', '.join(names[:3])})"
        )

    # Service issues
    if not services_ok and services_failed:
        names = [s.get('name', 'unknown') for s in services_failed]
        issue_lines.append(
            f"Services: {len(services_failed)} failed ({', '.join(names[:3])})"
        )

    # When nothing specific was reported, list the degraded health categories
    # so the summary still explains why issues were flagged.
    if has_issues and not issue_lines:
        for issue in health_issues:
            line = f"Health: {issue.get('category', 'unknown')} {issue.get('status', '')}".rstrip()
            if issue.get('reason'):
                line += f" - {issue['reason']}"
            issue_lines.append(line)

    lines.extend(issue_lines)
    return '\n'.join(lines)
# ─── For backwards compatibility ─────────────────────────────────────────────
# Expose constants for external use