Update health_monitor.py

Author: MacRimi
Date:   2026-03-23 18:08:22 +01:00
Parent: 168726c131
Commit: 4ac71381da


@@ -26,6 +26,18 @@ try:
except ImportError:
    PROXMOX_STORAGE_AVAILABLE = False
# ============================================================================
# PERFORMANCE DEBUG FLAG - Set to True to log timing of each health check
# To analyze: grep "\[PERF\]" /var/log/proxmenux-monitor.log | sort -t'=' -k2 -n
# Set to False or remove this section after debugging
# ============================================================================
DEBUG_PERF = True
def _perf_log(section: str, elapsed_ms: float):
"""Log performance timing for a section. Only logs if DEBUG_PERF is True."""
if DEBUG_PERF:
print(f"[PERF] {section} = {elapsed_ms:.1f}ms")
class HealthMonitor:
    """
    Monitors system health across multiple components with minimal impact.
@@ -434,9 +446,12 @@ class HealthMonitor:
        info_issues = [] # Added info_issues to track INFO separately
        # --- Priority Order of Checks ---
        _t_total = time.time() # [PERF] Total health check timing
        # Priority 1: Critical PVE Services
        _t = time.time()
        services_status = self._check_pve_services()
        _perf_log("services", (time.time() - _t) * 1000)
        details['services'] = services_status
        if services_status['status'] == 'CRITICAL':
            critical_issues.append(f"PVE Services: {services_status.get('reason', 'Service failure')}")
@@ -444,7 +459,9 @@ class HealthMonitor:
warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}") warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}")
# Priority 1.5: Proxmox Storage Check (External Module) # Priority 1.5: Proxmox Storage Check (External Module)
_t = time.time()
proxmox_storage_result = self._check_proxmox_storage() proxmox_storage_result = self._check_proxmox_storage()
_perf_log("proxmox_storage", (time.time() - _t) * 1000)
if proxmox_storage_result: # Only process if the check ran (module available) if proxmox_storage_result: # Only process if the check ran (module available)
details['storage'] = proxmox_storage_result details['storage'] = proxmox_storage_result
if proxmox_storage_result.get('status') == 'CRITICAL': if proxmox_storage_result.get('status') == 'CRITICAL':
@@ -459,7 +476,9 @@ class HealthMonitor:
            self.capabilities['has_lvm'] = any(t in ('lvm', 'lvmthin') for t in storage_types)
        # Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors)
        _t = time.time()
        storage_status = self._check_storage_optimized()
        _perf_log("storage_optimized", (time.time() - _t) * 1000)
        details['disks'] = storage_status # Use 'disks' for filesystem/disk specific issues
        if storage_status.get('status') == 'CRITICAL':
            critical_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage failure')}")
@@ -467,7 +486,9 @@ class HealthMonitor:
warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}") warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}")
# Priority 3: VMs/CTs Status (with persistence) # Priority 3: VMs/CTs Status (with persistence)
_t = time.time()
vms_status = self._check_vms_cts_with_persistence() vms_status = self._check_vms_cts_with_persistence()
_perf_log("vms_cts", (time.time() - _t) * 1000)
details['vms'] = vms_status details['vms'] = vms_status
if vms_status.get('status') == 'CRITICAL': if vms_status.get('status') == 'CRITICAL':
critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}") critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}")
@@ -475,7 +496,9 @@ class HealthMonitor:
warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}") warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}")
# Priority 4: Network Connectivity # Priority 4: Network Connectivity
_t = time.time()
network_status = self._check_network_optimized() network_status = self._check_network_optimized()
_perf_log("network", (time.time() - _t) * 1000)
details['network'] = network_status details['network'] = network_status
if network_status.get('status') == 'CRITICAL': if network_status.get('status') == 'CRITICAL':
critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}") critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}")
@@ -483,7 +506,9 @@ class HealthMonitor:
warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}") warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}")
# Priority 5: CPU Usage (with hysteresis) # Priority 5: CPU Usage (with hysteresis)
_t = time.time()
cpu_status = self._check_cpu_with_hysteresis() cpu_status = self._check_cpu_with_hysteresis()
_perf_log("cpu", (time.time() - _t) * 1000)
details['cpu'] = cpu_status details['cpu'] = cpu_status
if cpu_status.get('status') == 'CRITICAL': if cpu_status.get('status') == 'CRITICAL':
critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}") critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}")
@@ -491,7 +516,9 @@ class HealthMonitor:
warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}") warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}")
# Priority 6: Memory Usage (RAM and Swap) # Priority 6: Memory Usage (RAM and Swap)
_t = time.time()
memory_status = self._check_memory_comprehensive() memory_status = self._check_memory_comprehensive()
_perf_log("memory", (time.time() - _t) * 1000)
details['memory'] = memory_status details['memory'] = memory_status
if memory_status.get('status') == 'CRITICAL': if memory_status.get('status') == 'CRITICAL':
critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}") critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}")
@@ -499,7 +526,9 @@ class HealthMonitor:
warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}") warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}")
# Priority 7: Log Analysis (with persistence) # Priority 7: Log Analysis (with persistence)
_t = time.time()
logs_status = self._check_logs_with_persistence() logs_status = self._check_logs_with_persistence()
_perf_log("logs", (time.time() - _t) * 1000)
details['logs'] = logs_status details['logs'] = logs_status
if logs_status.get('status') == 'CRITICAL': if logs_status.get('status') == 'CRITICAL':
critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}") critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}")
@@ -507,7 +536,9 @@ class HealthMonitor:
warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}") warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}")
# Priority 8: System Updates # Priority 8: System Updates
_t = time.time()
updates_status = self._check_updates() updates_status = self._check_updates()
_perf_log("updates", (time.time() - _t) * 1000)
details['updates'] = updates_status details['updates'] = updates_status
if updates_status.get('status') == 'CRITICAL': if updates_status.get('status') == 'CRITICAL':
critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}") critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}")
@@ -517,13 +548,18 @@ class HealthMonitor:
info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}") info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}")
# Priority 9: Security Checks # Priority 9: Security Checks
_t = time.time()
security_status = self._check_security() security_status = self._check_security()
_perf_log("security", (time.time() - _t) * 1000)
details['security'] = security_status details['security'] = security_status
if security_status.get('status') == 'WARNING': if security_status.get('status') == 'WARNING':
warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}") warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}")
elif security_status.get('status') == 'INFO': elif security_status.get('status') == 'INFO':
info_issues.append(f"Security: {security_status.get('reason', 'Security information')}") info_issues.append(f"Security: {security_status.get('reason', 'Security information')}")
# Log total time for all checks
_perf_log("TOTAL_HEALTH_CHECK", (time.time() - _t_total) * 1000)
# --- Track UNKNOWN counts and persist if >= 3 consecutive cycles --- # --- Track UNKNOWN counts and persist if >= 3 consecutive cycles ---
unknown_issues = [] unknown_issues = []
for cat_key, cat_data in details.items(): for cat_key, cat_data in details.items():
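
The grep/sort pipeline in the header comment already ranks the [PERF] lines by duration. The small sketch below does the same thing offline in Python, assuming only the log path named in that comment and the "[PERF] section = N.Nms" line format produced by _perf_log; everything else is illustrative:

    import re

    # Collect (section, elapsed_ms) pairs from lines like "[PERF] services = 12.3ms".
    pattern = re.compile(r"\[PERF\] (\S+) = ([0-9.]+)ms")
    timings = []
    with open("/var/log/proxmenux-monitor.log") as log:
        for line in log:
            match = pattern.search(line)
            if match:
                timings.append((match.group(1), float(match.group(2))))

    # Slowest sections first; TOTAL_HEALTH_CHECK should roughly match the sum of the others.
    for section, ms in sorted(timings, key=lambda item: item[1], reverse=True):
        print(f"{section:>22} {ms:8.1f} ms")
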