Files
ProxMenux/AppImage/scripts/startup_grace.py
2026-03-28 22:19:33 +01:00

511 lines
19 KiB
Python

"""
Centralized Startup Grace Period Management
This module provides a single source of truth for startup grace period logic.
During system boot, various transient issues occur (high latency, storage not ready,
QMP timeouts, etc.) that shouldn't trigger notifications or critical alerts.
Grace Periods:
- VM/CT aggregation: 3 minutes - Aggregate multiple VM/CT starts into one notification
- Health suppression: 5 minutes - Suppress transient health warnings/errors
- Shutdown suppression: 2 minutes - Suppress VM/CT stops during system shutdown
Categories suppressed during startup:
- storage: NFS/CIFS mounts may take time to become available
- vms: VMs may have QMP timeouts or startup delays
- network: Latency spikes during boot are normal
- services: PVE services may take time to fully initialize
"""
import time
import threading
from typing import Set, List, Tuple, Optional
# ─── Configuration ───────────────────────────────────────────────────────────
# All durations are expressed in seconds.

# Window after service start during which VM/CT starts are aggregated.
STARTUP_VM_GRACE_SECONDS = 3 * 60

# Window after service start during which health warnings are suppressed.
STARTUP_HEALTH_GRACE_SECONDS = 5 * 60

# Window after a detected shutdown during which VM/CT stops are suppressed.
SHUTDOWN_GRACE_SECONDS = 2 * 60

# Upper bound on system uptime for the service start to count as a real
# server boot. If the system has been up longer than this when the service
# starts, it is treated as a plain service restart and the startup
# notification is skipped.
MAX_BOOT_UPTIME_SECONDS = 10 * 60
def _get_system_uptime() -> float:
    """
    Return the system uptime in seconds, read from /proc/uptime.

    The first whitespace-separated field of the first line is parsed as a
    float.

    Returns:
        The uptime in seconds, or 0.0 when the file cannot be opened or its
        content cannot be parsed. A 0.0 result makes callers that compare
        against a boot threshold treat the situation as a fresh boot.
    """
    try:
        with open('/proc/uptime', 'r', encoding='ascii') as f:
            return float(f.readline().split()[0])
    except (OSError, ValueError, IndexError):
        # OSError: file missing/unreadable (e.g. non-Linux host).
        # ValueError: non-numeric or undecodable content.
        # IndexError: empty first line.
        return 0.0
# Health categories whose notifications are suppressed while the startup
# grace period is active, because they tend to report transient problems
# during boot:
#   storage  - NFS/CIFS mounts may take time
#   vms      - VMs may have QMP timeouts
#   network  - latency spikes during boot
#   services - PVE services initialization
STARTUP_GRACE_CATEGORIES: Set[str] = {'storage', 'vms', 'network', 'services'}
# ─── Singleton State ─────────────────────────────────────────────────────────
class _StartupGraceState:
    """
    Thread-safe singleton managing all startup/shutdown grace period state.

    The first instantiation (normally at module load, i.e. service start)
    fixes the reference point used to decide whether the service is still
    within its startup period.

    Two clocks are recorded on purpose:
    - ``_startup_time`` / ``_shutdown_time`` hold wall-clock epoch seconds
      for consumers that need an absolute timestamp.
    - ``_startup_monotonic`` / ``_shutdown_monotonic`` hold monotonic clock
      readings and are used for every elapsed-time comparison, so a
      wall-clock step (e.g. a time sync shortly after boot) cannot shorten
      or extend a grace window.
    """

    _instance: Optional['_StartupGraceState'] = None
    _init_lock = threading.Lock()

    def __new__(cls) -> '_StartupGraceState':
        # Double-checked creation: the unlocked test keeps the common path
        # cheap, the locked re-test guarantees a single instance.
        if cls._instance is None:
            with cls._init_lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._initialized = False
                    # Publish only after the flag exists so no other thread
                    # can observe an instance without ``_initialized``.
                    cls._instance = instance
        return cls._instance

    def __init__(self):
        # __init__ runs on every ``_StartupGraceState()`` call; only the
        # first one may set up state.
        if self._initialized:
            return
        with self._init_lock:
            if self._initialized:
                return
            self._lock = threading.Lock()

            # Wall-clock moment the service started (epoch seconds).
            self._startup_time: float = time.time()
            # Monotonic reference for all "seconds since startup" maths.
            self._startup_monotonic: float = time.monotonic()

            # Real system boot vs. service restart: decided once, by
            # comparing the system uptime with the configured threshold.
            system_uptime = _get_system_uptime()
            self._is_real_boot: bool = system_uptime < MAX_BOOT_UPTIME_SECONDS

            # Shutdown tracking: 0 / None mean "no shutdown detected".
            self._shutdown_time: float = 0
            self._shutdown_monotonic: Optional[float] = None

            # VM/CT starts collected during startup:
            # [(vmid, vmname, 'vm'|'ct'), ...]
            self._startup_vms: List[Tuple[str, str, str]] = []
            self._startup_aggregated: bool = False

            self._initialized = True

    # ─── Startup Period Checks ───────────────────────────────────────────────

    def is_startup_vm_period(self) -> bool:
        """
        Check if we're within the VM/CT start aggregation period.

        Returns:
            True while fewer than STARTUP_VM_GRACE_SECONDS have elapsed
            since service startup.
        """
        with self._lock:
            elapsed = time.monotonic() - self._startup_monotonic
            return elapsed < STARTUP_VM_GRACE_SECONDS

    def is_startup_health_grace(self) -> bool:
        """
        Check if we're within the health suppression period.

        Returns:
            True while fewer than STARTUP_HEALTH_GRACE_SECONDS have elapsed
            since service startup.
        """
        with self._lock:
            elapsed = time.monotonic() - self._startup_monotonic
            return elapsed < STARTUP_HEALTH_GRACE_SECONDS

    def should_suppress_category(self, category: str) -> bool:
        """
        Check if notifications for a category should be suppressed.

        Args:
            category: Health category name (e.g., 'network', 'storage',
                'vms'). Matching is case-insensitive.

        Returns:
            True if the health grace period is active AND the category is in
            STARTUP_GRACE_CATEGORIES. False otherwise, including when
            ``category`` is not a string.
        """
        if not isinstance(category, str):
            return False
        if category.lower() in STARTUP_GRACE_CATEGORIES:
            return self.is_startup_health_grace()
        return False

    def is_real_system_boot(self) -> bool:
        """
        Check if the service started during a real system boot.

        Returns:
            True if the system uptime read at initialization was below
            MAX_BOOT_UPTIME_SECONDS; False if the system had already been
            running longer (a service restart rather than a system boot).
        """
        with self._lock:
            return self._is_real_boot

    def get_startup_elapsed(self) -> float:
        """Get seconds elapsed since service startup (monotonic clock)."""
        with self._lock:
            return time.monotonic() - self._startup_monotonic

    # ─── Shutdown Tracking ───────────────────────────────────────────────────

    def mark_shutdown(self):
        """
        Record that a system shutdown/reboot has been detected.

        Starts (or restarts) the shutdown grace period checked by
        is_host_shutting_down().
        """
        with self._lock:
            self._shutdown_time = time.time()
            self._shutdown_monotonic = time.monotonic()

    def is_host_shutting_down(self) -> bool:
        """
        Check if we're within the shutdown grace period.

        Returns:
            False if mark_shutdown() was never called; otherwise True while
            fewer than SHUTDOWN_GRACE_SECONDS have elapsed since the most
            recent mark_shutdown() call.
        """
        with self._lock:
            if self._shutdown_monotonic is None:
                return False
            elapsed = time.monotonic() - self._shutdown_monotonic
            return elapsed < SHUTDOWN_GRACE_SECONDS

    # ─── VM/CT Start Aggregation ─────────────────────────────────────────────

    def add_startup_vm(self, vmid: str, vmname: str, vm_type: str):
        """
        Record a VM/CT start for later aggregation.

        Args:
            vmid: VM/CT ID
            vmname: VM/CT name
            vm_type: 'vm' or 'ct'
        """
        with self._lock:
            self._startup_vms.append((vmid, vmname, vm_type))

    def get_and_clear_startup_vms(self) -> List[Tuple[str, str, str]]:
        """
        Return all recorded startup VMs/CTs and reset the list.

        Also marks startup aggregation as processed, so
        was_startup_aggregated() returns True afterwards.

        Returns:
            List of (vmid, vmname, vm_type) tuples, in recording order.
        """
        with self._lock:
            vms = self._startup_vms.copy()
            self._startup_vms = []
            self._startup_aggregated = True
            return vms

    def has_startup_vms(self) -> bool:
        """Check if there are any startup VMs recorded."""
        with self._lock:
            return len(self._startup_vms) > 0

    def was_startup_aggregated(self) -> bool:
        """Check if startup aggregation has already been processed."""
        with self._lock:
            return self._startup_aggregated

    def mark_startup_aggregated(self) -> None:
        """Mark startup aggregation as completed without returning VMs."""
        with self._lock:
            self._startup_aggregated = True
# ─── Module-level convenience functions ──────────────────────────────────────
# Global singleton instance, created when this module is first imported.
# The module-level functions below delegate to it.
_state = _StartupGraceState()
def is_startup_vm_period() -> bool:
    """
    Report whether the VM/CT start aggregation window is still open.

    Delegates to the module singleton.
    """
    return _state.is_startup_vm_period()
def is_startup_health_grace() -> bool:
    """
    Report whether the startup health suppression window is still open.

    Delegates to the module singleton.
    """
    return _state.is_startup_health_grace()
def should_suppress_category(category: str) -> bool:
    """
    Report whether notifications for ``category`` are currently suppressed.

    Args:
        category: Health category name, passed through to the module
            singleton.

    Returns:
        Whatever the singleton's should_suppress_category() returns.
    """
    return _state.should_suppress_category(category)
def get_startup_elapsed() -> float:
    """Return the number of seconds elapsed since the service started."""
    return _state.get_startup_elapsed()
def mark_shutdown():
    """Record on the module singleton that a shutdown/reboot was detected."""
    _state.mark_shutdown()
def is_host_shutting_down() -> bool:
    """Report whether the shutdown grace period is currently active."""
    return _state.is_host_shutting_down()
def add_startup_vm(vmid: str, vmname: str, vm_type: str):
    """
    Record a VM/CT start on the module singleton for later aggregation.

    Args:
        vmid: VM/CT ID
        vmname: VM/CT name
        vm_type: 'vm' or 'ct'
    """
    _state.add_startup_vm(vmid, vmname, vm_type)
def get_and_clear_startup_vms() -> List[Tuple[str, str, str]]:
    """
    Return every recorded startup VM/CT and empty the record.

    Returns:
        List of (vmid, vmname, vm_type) tuples from the module singleton.
    """
    return _state.get_and_clear_startup_vms()
def has_startup_vms() -> bool:
    """Report whether at least one startup VM/CT is currently recorded."""
    return _state.has_startup_vms()
def was_startup_aggregated() -> bool:
    """Report whether startup aggregation has already been processed."""
    return _state.was_startup_aggregated()
def mark_startup_aggregated() -> None:
    """
    Flag startup aggregation as done without processing any VMs.

    Intended for the case where the startup notification is skipped (e.g.
    a service restart rather than a real system boot), so that later checks
    see aggregation as already handled.
    """
    _state.mark_startup_aggregated()
def is_real_system_boot() -> bool:
    """
    Report whether the service started as part of a real system boot.

    Delegates to the module singleton, which compared the system uptime
    with MAX_BOOT_UPTIME_SECONDS when it was initialized. A False result
    means the system had already been running longer than that, i.e. only
    the service was restarted.

    Use this to avoid sending "System startup completed" notifications
    when just restarting the ProxMenux Monitor service.
    """
    return _state.is_real_system_boot()
# ─── Startup Report Collection ───────────────────────────────────────────────
def _check_reason(check: dict) -> str:
    """Return a check's 'reason', falling back to 'message'; never None."""
    return check.get('reason') or check.get('message') or ''


def _failing_checks(category: dict, default_name: str = 'unknown') -> list:
    """Return {'name', 'reason'} dicts for a category's failing checks."""
    failing = []
    for check in category.get('checks') or []:
        # Skip malformed entries instead of aborting the whole report.
        if not isinstance(check, dict):
            continue
        if check.get('status') in ('CRITICAL', 'WARNING', 'error'):
            failing.append({
                'name': check.get('name', default_name),
                'reason': _check_reason(check),
            })
    return failing


def _merge_health_data(report: dict, health_data: dict) -> None:
    """Copy health status, failing checks and issues into ``report``."""
    failing_statuses = ('CRITICAL', 'WARNING')
    categories = health_data.get('categories') or {}

    report['health_status'] = health_data.get('overall_status', 'UNKNOWN')

    # Storage and services share the same shape: a failing category flips
    # the *_ok flag and contributes its failing checks.
    for cat_key, ok_key, list_key in (
        ('storage', 'storage_ok', 'storage_unavailable'),
        ('services', 'services_ok', 'services_failed'),
    ):
        cat_data = categories.get(cat_key) or {}
        if cat_data.get('status') in failing_statuses:
            report[ok_key] = False
            report[list_key].extend(_failing_checks(cat_data))

    # VM category: only checks whose reason text mentions a start failure
    # are reported as failed VMs/CTs.
    vms_cat = categories.get('vms') or {}
    for entry in _failing_checks(vms_cat, default_name=''):
        reason_lc = str(entry['reason']).lower()
        if 'error al iniciar' not in reason_lc and 'failed to start' not in reason_lc:
            continue
        # Guest type is inferred from the check name.
        label = str(entry['name'])
        if 'CT' in label or 'Container' in label:
            report['cts_failed'].append(entry)
        else:
            report['vms_failed'].append(entry)

    # One summary entry per category that is in a failing state.
    for cat_name, cat_data in categories.items():
        if not isinstance(cat_data, dict):
            continue
        if cat_data.get('status') in failing_statuses:
            report['health_issues'].append({
                'category': cat_name,
                'status': cat_data.get('status'),
                'reason': cat_data.get('reason', ''),
            })


def collect_startup_report() -> dict:
    """
    Collect comprehensive startup report data.

    Called at the end of the grace period to generate a complete
    startup report including:
    - VMs/CTs that started successfully (drained from the startup record)
    - VMs/CTs that failed to start
    - Service status
    - Storage status
    - Journal errors since service start (for AI enrichment)

    Both the health lookup and the journal query are best-effort: any
    exception is recorded in ``report['_startup_errors']`` and the report
    is still returned.

    Returns:
        Dictionary with startup report data
    """
    import subprocess

    report = {
        # VMs/CTs
        'vms_started': [],
        'cts_started': [],
        'vms_failed': [],
        'cts_failed': [],
        # System status
        'services_ok': True,
        'services_failed': [],
        'storage_ok': True,
        'storage_unavailable': [],
        # Health summary
        'health_status': 'OK',
        'health_issues': [],
        # For AI enrichment
        '_journal_context': '',
        '_startup_errors': [],
        # Metadata
        'startup_duration_seconds': get_startup_elapsed(),
        'timestamp': int(time.time()),
    }

    # VMs/CTs that started during boot; anything that is not 'vm' is
    # treated as a container.
    for vmid, vmname, vm_type in get_and_clear_startup_vms():
        bucket = 'vms_started' if vm_type == 'vm' else 'cts_started'
        report[bucket].append({'vmid': vmid, 'name': vmname})

    # Try to get health status from health_monitor
    try:
        import health_monitor
        health_data = health_monitor.get_detailed_status()
        if health_data:
            _merge_health_data(report, health_data)
    except Exception as e:
        report['_startup_errors'].append(f"Error getting health data: {e}")

    # Journal entries of priority "err" since the service started
    # (journalctl is limited to 50 lines and a 10 second timeout).
    try:
        service_start = int(_state._startup_time)
        result = subprocess.run(
            ['journalctl', '-p', 'err', '--since', f'@{service_start}', '--no-pager', '-n', '50'],
            capture_output=True,
            text=True,
            timeout=10
        )
        journal_text = result.stdout.strip()
        if result.returncode == 0 and journal_text:
            report['_journal_context'] = journal_text
    except Exception as e:
        report['_startup_errors'].append(f"Error getting journal: {e}")

    return report
def format_startup_summary(report: dict) -> str:
    """
    Format a human-readable startup summary from report data.

    The first line is a header; further lines list started guests, failed
    guests, storage problems and service problems, in that order, when
    present.

    Args:
        report: Dictionary from collect_startup_report(). Missing keys are
            treated as "nothing to report".

    Returns:
        Formatted summary string (lines joined with newlines)
    """
    vms_started = report.get('vms_started') or []
    cts_started = report.get('cts_started') or []
    vms_failed = report.get('vms_failed') or []
    cts_failed = report.get('cts_failed') or []
    services_failed = report.get('services_failed') or []
    storage_unavailable = report.get('storage_unavailable') or []

    vms_ok = len(vms_started)
    cts_ok = len(cts_started)
    total_ok = vms_ok + cts_ok
    total_fail = len(vms_failed) + len(cts_failed)

    services_ok = report.get('services_ok', True)
    storage_ok = report.get('storage_ok', True)

    # Determine overall status
    has_issues = (
        total_fail > 0
        or not services_ok
        or not storage_ok
        or report.get('health_status') in ('CRITICAL', 'WARNING')
    )

    lines = []

    # Header
    if has_issues:
        issue_count = total_fail + len(services_failed) + len(storage_unavailable)
        if issue_count == 0:
            # Issues were flagged only through the health status or a
            # category flag without itemised checks; never announce
            # "0 issue(s)" in that case.
            issue_count = len(report.get('health_issues') or []) or 1
        lines.append(f"System startup - {issue_count} issue(s) detected")
    else:
        lines.append("System startup completed")
        lines.append("All systems operational.")

    # VMs/CTs started
    if total_ok > 0:
        parts = []
        if vms_ok > 0:
            parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
        if cts_ok > 0:
            parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")

        names = [f"{guest['name']} ({guest['vmid']})" for guest in vms_started]
        names += [f"{guest['name']} ({guest['vmid']})" for guest in cts_started]

        line = f"{' and '.join(parts)} started"
        if len(names) <= 5:
            # Short list: show every guest.
            line += f": {', '.join(names)}"
        else:
            # Long list: show the first three and summarise the rest.
            line += f": {', '.join(names[:3])}... (+{len(names)-3} more)"
        lines.append(line)

    # Failed VMs/CTs
    for vm in vms_failed:
        lines.append(f"VM failed: {vm['name']} - {vm.get('reason', 'unknown error')}")
    for ct in cts_failed:
        lines.append(f"CT failed: {ct['name']} - {ct.get('reason', 'unknown error')}")

    # Storage issues (at most three names are listed)
    if not storage_ok and storage_unavailable:
        names = [str(s['name']) for s in storage_unavailable]
        lines.append(f"Storage: {len(storage_unavailable)} unavailable ({', '.join(names[:3])})")

    # Service issues (at most three names are listed)
    if not services_ok and services_failed:
        names = [str(s['name']) for s in services_failed]
        lines.append(f"Services: {len(services_failed)} failed ({', '.join(names[:3])})")

    return '\n'.join(lines)
# ─── For backwards compatibility ─────────────────────────────────────────────
# Expose constants for external use.
# GRACE_CATEGORIES is bound to the same set object as
# STARTUP_GRACE_CATEGORIES (an alias, not a copy).
GRACE_CATEGORIES = STARTUP_GRACE_CATEGORIES