# Mirror of https://github.com/MacRimi/ProxMenux.git
# Synced 2026-04-05 20:03:48 +00:00 - 511 lines - 19 KiB - Python
"""
Centralized Startup Grace Period Management

This module provides a single source of truth for startup grace period logic.
During system boot, various transient issues occur (high latency, storage not ready,
QMP timeouts, etc.) that shouldn't trigger notifications or critical alerts.

Grace Periods:
    - VM/CT aggregation: 3 minutes - Aggregate multiple VM/CT starts into one notification
    - Health suppression: 5 minutes - Suppress transient health warnings/errors
    - Shutdown suppression: 2 minutes - Suppress VM/CT stops during system shutdown

Categories suppressed during startup:
    - storage: NFS/CIFS mounts may take time to become available
    - vms: VMs may have QMP timeouts or startup delays
    - network: Latency spikes during boot are normal
    - services: PVE services may take time to fully initialize
"""
|
||
import time
|
||
import threading
|
||
from typing import Set, List, Tuple, Optional
|
||
|
||
# ─── Configuration ───────────────────────────────────────────────────────────
|
||
|
||
# Grace period lengths, expressed in seconds
STARTUP_VM_GRACE_SECONDS = 3 * 60       # window for aggregating VM/CT start events
STARTUP_HEALTH_GRACE_SECONDS = 5 * 60   # window for suppressing health warnings
SHUTDOWN_GRACE_SECONDS = 2 * 60         # window for suppressing VM/CT stop events

# Upper bound on the system uptime observed at service start for that start
# to count as a real server boot. A larger uptime means only the service was
# restarted, so the startup notification is skipped.
MAX_BOOT_UPTIME_SECONDS = 10 * 60
|
||
|
||
def _get_system_uptime() -> float:
    """
    Return the system uptime in seconds, read from /proc/uptime.

    The first whitespace-separated field of the file's first line is parsed
    as a float.

    Returns:
        Uptime in seconds, or 0.0 when the file cannot be opened or its
        content cannot be parsed. Callers compare the result against
        MAX_BOOT_UPTIME_SECONDS, so the 0.0 fallback makes an unreadable
        uptime look like a fresh boot.
    """
    try:
        with open('/proc/uptime', 'r') as f:
            fields = f.readline().split()
        return float(fields[0])
    except (OSError, ValueError, IndexError):
        # OSError: file missing or unreadable; IndexError: empty first line;
        # ValueError: first field is not a number (or the file failed to decode).
        return 0.0
|
||
# Health categories whose alerts are held back while the startup grace
# period is active, because they tend to show transient problems during boot.
STARTUP_GRACE_CATEGORIES: Set[str] = {
    'network',   # latency spikes while the system boots
    'services',  # PVE services still initializing
    'storage',   # NFS/CIFS mounts may not be available yet
    'vms',       # VMs may hit QMP timeouts
}
|
||
|
||
# ─── Singleton State ─────────────────────────────────────────────────────────
|
||
|
||
class _StartupGraceState:
    """
    Process-wide singleton holding all startup/shutdown grace period state.

    The first construction records the service start time; every later
    construction returns the same object without resetting it. The module
    creates the shared instance when it is loaded, so "startup" means
    "service start". Mutable state is read and written while holding
    ``self._lock``.

    Elapsed-time checks are based on ``time.monotonic()`` so that a
    wall-clock step (for example a time-sync correction shortly after boot)
    can neither cut a grace period short nor stretch it. ``_startup_time``
    and ``_shutdown_time`` are still kept as wall-clock epoch timestamps for
    code that needs a calendar time.
    """

    _instance: Optional['_StartupGraceState'] = None
    _init_lock = threading.Lock()

    def __new__(cls) -> '_StartupGraceState':
        if cls._instance is None:
            with cls._init_lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    # Set the flag before publishing the instance so that no
                    # other thread can reach __init__ on an object lacking it.
                    instance._initialized = False
                    cls._instance = instance
        return cls._instance

    def __init__(self):
        # __init__ runs on every _StartupGraceState() call; the flag check is
        # done under the lock so only the first caller initializes the state.
        with self._init_lock:
            if self._initialized:
                return

            self._lock = threading.Lock()

            # Service start (module load) as wall-clock epoch seconds, plus a
            # monotonic reference used for all elapsed-time calculations.
            self._startup_time: float = time.time()
            self._startup_monotonic: float = time.monotonic()

            # A real system boot is assumed when the system uptime at service
            # start is below the threshold; otherwise it is a service restart.
            system_uptime = _get_system_uptime()
            self._is_real_boot: bool = system_uptime < MAX_BOOT_UPTIME_SECONDS

            # Shutdown tracking: 0 / None mean "no shutdown detected".
            self._shutdown_time: float = 0
            self._shutdown_monotonic: Optional[float] = None

            # VM/CT starts collected during startup: [(vmid, vmname, 'vm'|'ct'), ...]
            self._startup_vms: List[Tuple[str, str, str]] = []
            self._startup_aggregated: bool = False

            self._initialized = True

    # ─── Startup Period Checks ───────────────────────────────────────────────

    def _elapsed_since_startup(self) -> float:
        """Seconds since service start on the monotonic clock (no locking)."""
        return time.monotonic() - self._startup_monotonic

    def is_startup_vm_period(self) -> bool:
        """
        Check if we're within the VM/CT start aggregation period (3 min).

        During this period, individual VM/CT start notifications are collected
        and later sent as a single aggregated notification.
        """
        with self._lock:
            return self._elapsed_since_startup() < STARTUP_VM_GRACE_SECONDS

    def is_startup_health_grace(self) -> bool:
        """
        Check if we're within the health suppression period (5 min).

        During this period:
        - Transient health warnings (latency, storage, etc.) are suppressed
        - CRITICAL/WARNING may be downgraded to INFO for certain categories
        - Health degradation notifications are skipped for grace categories
        """
        with self._lock:
            return self._elapsed_since_startup() < STARTUP_HEALTH_GRACE_SECONDS

    def should_suppress_category(self, category: str) -> bool:
        """
        Check if notifications for a category should be suppressed.

        Args:
            category: Health category name (e.g., 'network', 'storage', 'vms');
                matched case-insensitively.

        Returns:
            True if we're in the health grace period AND the category is in
            STARTUP_GRACE_CATEGORIES.
        """
        if category.lower() in STARTUP_GRACE_CATEGORIES:
            return self.is_startup_health_grace()
        return False

    def is_real_system_boot(self) -> bool:
        """
        Check if the service started during a real system boot.

        Returns False if the system was already running for 10 minutes or
        more when the service started (indicates a service restart, not a
        system boot).

        This prevents sending "System startup completed" notifications when
        just restarting the ProxMenux Monitor service.
        """
        with self._lock:
            return self._is_real_boot

    def get_startup_elapsed(self) -> float:
        """Get seconds elapsed since service startup."""
        with self._lock:
            return self._elapsed_since_startup()

    # ─── Shutdown Tracking ───────────────────────────────────────────────────

    def mark_shutdown(self):
        """
        Called when system_shutdown or system_reboot is detected.

        After this, VM/CT stop notifications will be suppressed for the
        shutdown grace period (expected stops during system shutdown).
        """
        with self._lock:
            self._shutdown_time = time.time()
            self._shutdown_monotonic = time.monotonic()

    def is_host_shutting_down(self) -> bool:
        """
        Check if we're within the shutdown grace period.

        During this period, VM/CT stop events are expected and should not
        generate notifications. Returns False when no shutdown was marked.
        """
        with self._lock:
            if self._shutdown_monotonic is None:
                return False
            elapsed = time.monotonic() - self._shutdown_monotonic
            return elapsed < SHUTDOWN_GRACE_SECONDS

    # ─── VM/CT Start Aggregation ─────────────────────────────────────────────

    def add_startup_vm(self, vmid: str, vmname: str, vm_type: str):
        """
        Record a VM/CT start during startup period for later aggregation.

        Args:
            vmid: VM/CT ID
            vmname: VM/CT name
            vm_type: 'vm' or 'ct'
        """
        with self._lock:
            self._startup_vms.append((vmid, vmname, vm_type))

    def get_and_clear_startup_vms(self) -> List[Tuple[str, str, str]]:
        """
        Get all recorded startup VMs and clear the list.

        Should be called once after the VM aggregation grace period ends
        to get all VMs that started during boot for a single notification.
        Also marks startup aggregation as processed.

        Returns:
            List of (vmid, vmname, vm_type) tuples
        """
        with self._lock:
            # Hand the collected list to the caller and start a fresh one.
            vms, self._startup_vms = self._startup_vms, []
            self._startup_aggregated = True
            return vms

    def has_startup_vms(self) -> bool:
        """Check if there are any startup VMs recorded."""
        with self._lock:
            return len(self._startup_vms) > 0

    def was_startup_aggregated(self) -> bool:
        """Check if startup aggregation has already been processed."""
        with self._lock:
            return self._startup_aggregated

    def mark_startup_aggregated(self) -> None:
        """Mark startup aggregation as completed without returning VMs."""
        with self._lock:
            self._startup_aggregated = True
|
||
|
||
# ─── Module-level convenience functions ──────────────────────────────────────
|
||
|
||
# Shared state object that every module-level helper below delegates to
_state = _StartupGraceState()
|
||
def is_startup_vm_period() -> bool:
    """Return True while the 3-minute VM/CT start aggregation window is open."""
    in_window = _state.is_startup_vm_period()
    return in_window
|
||
def is_startup_health_grace() -> bool:
    """Return True while the 5-minute health suppression window is open."""
    in_window = _state.is_startup_health_grace()
    return in_window
|
||
def should_suppress_category(category: str) -> bool:
    """
    Tell whether startup suppression applies to a health category.

    Args:
        category: Health category name, passed through to the shared state.

    Returns:
        Whatever the shared state's should_suppress_category() reports.
    """
    suppress = _state.should_suppress_category(category)
    return suppress
|
||
def get_startup_elapsed() -> float:
    """Return how many seconds have passed since the service started."""
    elapsed = _state.get_startup_elapsed()
    return elapsed
|
||
def mark_shutdown():
    """Record on the shared state that a system shutdown/reboot was detected."""
    _state.mark_shutdown()
|
||
def is_host_shutting_down() -> bool:
    """Return True while the shutdown grace period is active."""
    shutting_down = _state.is_host_shutting_down()
    return shutting_down
|
||
def add_startup_vm(vmid: str, vmname: str, vm_type: str):
    """
    Queue a VM/CT start on the shared state for the aggregated notification.

    Args:
        vmid: VM/CT ID
        vmname: VM/CT name
        vm_type: 'vm' or 'ct'
    """
    _state.add_startup_vm(vmid, vmname, vm_type)
|
||
def get_and_clear_startup_vms() -> List[Tuple[str, str, str]]:
    """
    Return the recorded startup VMs/CTs and empty the shared list.

    Returns:
        List of (vmid, vmname, vm_type) tuples
    """
    recorded = _state.get_and_clear_startup_vms()
    return recorded
|
||
def has_startup_vms() -> bool:
    """Return True when at least one startup VM/CT has been recorded."""
    any_recorded = _state.has_startup_vms()
    return any_recorded
|
||
def was_startup_aggregated() -> bool:
    """Return True once startup aggregation has been processed."""
    already_done = _state.was_startup_aggregated()
    return already_done
|
||
def mark_startup_aggregated() -> None:
    """
    Flag startup aggregation as done without handing back any VMs.

    Intended for the case where the startup notification is skipped
    (e.g., a service restart rather than a real system boot), so that
    later checks see aggregation as already processed.
    """
    _state.mark_startup_aggregated()
|
||
def is_real_system_boot() -> bool:
    """
    Tell a real system boot apart from a mere service restart.

    Returns:
        True when the system uptime was below 10 minutes at the moment the
        service started; False when the system had already been running
        longer, which indicates that only the service was restarted.

    Callers use this to avoid sending "System startup completed"
    notifications when just restarting the ProxMenux Monitor service.
    """
    real_boot = _state.is_real_system_boot()
    return real_boot
|
||
|
||
# ─── Startup Report Collection ───────────────────────────────────────────────
|
||
def collect_startup_report() -> dict:
    """
    Collect comprehensive startup report data.

    Called at the end of the grace period to generate a complete
    startup report including:
    - VMs/CTs that started successfully (drained from the startup VM list)
    - VMs/CTs that failed to start
    - Service status
    - Storage status
    - Journal errors since service start (for AI enrichment)

    Health data and journal output are both best-effort: any failure while
    gathering them is recorded as a message in ``report['_startup_errors']``
    instead of being raised.

    Returns:
        Dictionary with startup report data
    """
    import subprocess

    report = {
        # VMs/CTs
        'vms_started': [],
        'cts_started': [],
        'vms_failed': [],
        'cts_failed': [],

        # System status
        'services_ok': True,
        'services_failed': [],
        'storage_ok': True,
        'storage_unavailable': [],

        # Health summary
        'health_status': 'OK',
        'health_issues': [],

        # For AI enrichment
        '_journal_context': '',
        '_startup_errors': [],

        # Metadata
        'startup_duration_seconds': get_startup_elapsed(),
        'timestamp': int(time.time()),
    }

    # VMs/CTs that started during boot; anything that is not 'vm' counts as CT.
    for vmid, vmname, vm_type in get_and_clear_startup_vms():
        bucket = 'vms_started' if vm_type == 'vm' else 'cts_started'
        report[bucket].append({'vmid': vmid, 'name': vmname})

    # Health status from health_monitor (imported lazily, best-effort).
    try:
        import health_monitor
        health_data = health_monitor.get_detailed_status()
        if health_data:
            _merge_health_into_report(report, health_data)
    except Exception as exc:
        report['_startup_errors'].append(f"Error getting health data: {exc}")

    # Error-priority journal entries since service start (for AI enrichment).
    try:
        boot_time = int(_state._startup_time)
        result = subprocess.run(
            ['journalctl', '-p', 'err', '--since', f'@{boot_time}', '--no-pager', '-n', '50'],
            capture_output=True,
            text=True,
            timeout=10
        )
        journal_text = result.stdout.strip()
        if result.returncode == 0 and journal_text:
            report['_journal_context'] = journal_text
    except Exception as exc:
        report['_startup_errors'].append(f"Error getting journal: {exc}")

    return report


# Category-level statuses that mark a category as degraded.
_DEGRADED_CATEGORY_STATUSES = ('CRITICAL', 'WARNING')

# Check-level statuses that mark an individual check as failing.
_FAILING_CHECK_STATUSES = ('CRITICAL', 'WARNING', 'error')


def _as_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _failing_checks(category: dict) -> list:
    """Return the dict entries of category['checks'] whose status is failing."""
    checks = category.get('checks', [])
    if not isinstance(checks, (list, tuple)):
        # NOTE(review): assumes 'checks' is a list of dicts, as the original
        # loop did; other shapes are skipped instead of raising.
        return []
    return [
        check for check in checks
        if isinstance(check, dict) and check.get('status') in _FAILING_CHECK_STATUSES
    ]


def _check_reason(check: dict) -> str:
    """Return a check's 'reason' (falling back to 'message') as a string."""
    reason = check.get('reason', check.get('message', ''))
    return '' if reason is None else str(reason)


def _merge_health_into_report(report: dict, health_data: dict) -> None:
    """Fill the storage/services/VM/health fields of *report* from *health_data*."""
    report['health_status'] = health_data.get('overall_status', 'UNKNOWN')
    categories = _as_dict(health_data.get('categories'))

    # Storage and services share the same shape: flag + list of failing checks.
    for cat_name, ok_key, failed_key in (
        ('storage', 'storage_ok', 'storage_unavailable'),
        ('services', 'services_ok', 'services_failed'),
    ):
        category = _as_dict(categories.get(cat_name))
        if category.get('status') not in _DEGRADED_CATEGORY_STATUSES:
            continue
        report[ok_key] = False
        for check in _failing_checks(category):
            report[failed_key].append({
                'name': check.get('name', 'unknown'),
                'reason': _check_reason(check)
            })

    # Failed guests: only checks whose reason mentions a start failure count.
    for check in _failing_checks(_as_dict(categories.get('vms'))):
        check_name = str(check.get('name') or '')
        check_reason = _check_reason(check)
        lowered = check_reason.lower()
        if 'error al iniciar' not in lowered and 'failed to start' not in lowered:
            continue
        # Determine VM vs CT from the check name.
        is_container = 'CT' in check_name or 'Container' in check_name
        bucket = 'cts_failed' if is_container else 'vms_failed'
        report[bucket].append({
            'name': check_name,
            'reason': check_reason
        })

    # Every degraded category goes into the summary list.
    for cat_name, cat_data in categories.items():
        cat_data = _as_dict(cat_data)
        if cat_data.get('status') in _DEGRADED_CATEGORY_STATUSES:
            report['health_issues'].append({
                'category': cat_name,
                'status': cat_data.get('status'),
                'reason': cat_data.get('reason', '')
            })
|
||
|
||
def format_startup_summary(report: dict) -> str:
    """
    Format a human-readable startup summary from report data.

    The summary starts with a header line (issue count, or "completed" plus
    "All systems operational."), followed by optional lines for started
    guests, failed guests, unavailable storage and failed services.

    Args:
        report: Dictionary from collect_startup_report(). Missing keys are
            treated as empty lists / healthy flags.

    Returns:
        Formatted summary string, lines joined with newlines
    """
    vms_started = report.get('vms_started') or []
    cts_started = report.get('cts_started') or []
    vms_failed = report.get('vms_failed') or []
    cts_failed = report.get('cts_failed') or []
    services_failed = report.get('services_failed') or []
    storage_unavailable = report.get('storage_unavailable') or []

    services_ok = report.get('services_ok', True)
    storage_ok = report.get('storage_ok', True)

    vms_ok = len(vms_started)
    cts_ok = len(cts_started)
    total_ok = vms_ok + cts_ok
    total_fail = len(vms_failed) + len(cts_failed)

    # Determine overall status
    has_issues = (
        total_fail > 0 or
        not services_ok or
        not storage_ok or
        report.get('health_status') in ['CRITICAL', 'WARNING']
    )

    lines = []

    # Header
    if has_issues:
        issue_count = total_fail + len(services_failed) + len(storage_unavailable)
        if issue_count == 0:
            # The problem was flagged only through health_status or an *_ok
            # flag without itemized entries: count degraded categories, and
            # never report "0 issue(s)" for an unhealthy startup.
            issue_count = len(report.get('health_issues') or []) or 1
        lines.append(f"System startup - {issue_count} issue(s) detected")
    else:
        lines.append("System startup completed")
        lines.append("All systems operational.")

    # VMs/CTs started
    if total_ok > 0:
        parts = []
        if vms_ok > 0:
            parts.append(f"{vms_ok} VM{'s' if vms_ok > 1 else ''}")
        if cts_ok > 0:
            parts.append(f"{cts_ok} CT{'s' if cts_ok > 1 else ''}")

        # "name (vmid)" labels, VMs first; placeholders avoid a KeyError on
        # entries that lack a field.
        names = [
            f"{guest.get('name', 'unknown')} ({guest.get('vmid', '?')})"
            for guest in (*vms_started, *cts_started)
        ]

        line = f"{' and '.join(parts)} started"
        if len(names) <= 5:
            line += f": {', '.join(names)}"
        else:
            # Long lists are cut to the first three names plus a remainder count.
            line += f": {', '.join(names[:3])}... (+{len(names)-3} more)"
        lines.append(line)

    # Failed VMs/CTs (an empty or missing reason is shown as "unknown error")
    for vm in vms_failed:
        lines.append(f"VM failed: {vm.get('name', 'unknown')} - {vm.get('reason') or 'unknown error'}")
    for ct in cts_failed:
        lines.append(f"CT failed: {ct.get('name', 'unknown')} - {ct.get('reason') or 'unknown error'}")

    # Storage issues (at most three names are listed)
    if not storage_ok and storage_unavailable:
        names = [str(s.get('name', 'unknown')) for s in storage_unavailable]
        lines.append(f"Storage: {len(storage_unavailable)} unavailable ({', '.join(names[:3])})")

    # Service issues (at most three names are listed)
    if not services_ok and services_failed:
        names = [str(s.get('name', 'unknown')) for s in services_failed]
        lines.append(f"Services: {len(services_failed)} failed ({', '.join(names[:3])})")

    return '\n'.join(lines)
|
||
|
||
# ─── For backwards compatibility ─────────────────────────────────────────────
|
||
|
||
# Legacy alias kept so external code can keep using the old constant name
GRACE_CATEGORIES = STARTUP_GRACE_CATEGORIES