mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-18 18:12:15 +00:00
Update health_monitor.py
This commit is contained in:
@@ -26,6 +26,18 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
PROXMOX_STORAGE_AVAILABLE = False
|
PROXMOX_STORAGE_AVAILABLE = False
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PERFORMANCE DEBUG FLAG - Set to True to log timing of each health check
|
||||||
|
# To analyze: grep "\[PERF\]" /var/log/proxmenux-monitor.log | sort -t'=' -k2 -n
|
||||||
|
# Set to False or remove this section after debugging
|
||||||
|
# ============================================================================
|
||||||
|
DEBUG_PERF = True
|
||||||
|
|
||||||
|
def _perf_log(section: str, elapsed_ms: float):
|
||||||
|
"""Log performance timing for a section. Only logs if DEBUG_PERF is True."""
|
||||||
|
if DEBUG_PERF:
|
||||||
|
print(f"[PERF] {section} = {elapsed_ms:.1f}ms")
|
||||||
|
|
||||||
class HealthMonitor:
|
class HealthMonitor:
|
||||||
"""
|
"""
|
||||||
Monitors system health across multiple components with minimal impact.
|
Monitors system health across multiple components with minimal impact.
|
||||||
@@ -434,9 +446,12 @@ class HealthMonitor:
|
|||||||
info_issues = [] # Added info_issues to track INFO separately
|
info_issues = [] # Added info_issues to track INFO separately
|
||||||
|
|
||||||
# --- Priority Order of Checks ---
|
# --- Priority Order of Checks ---
|
||||||
|
_t_total = time.time() # [PERF] Total health check timing
|
||||||
|
|
||||||
# Priority 1: Critical PVE Services
|
# Priority 1: Critical PVE Services
|
||||||
|
_t = time.time()
|
||||||
services_status = self._check_pve_services()
|
services_status = self._check_pve_services()
|
||||||
|
_perf_log("services", (time.time() - _t) * 1000)
|
||||||
details['services'] = services_status
|
details['services'] = services_status
|
||||||
if services_status['status'] == 'CRITICAL':
|
if services_status['status'] == 'CRITICAL':
|
||||||
critical_issues.append(f"PVE Services: {services_status.get('reason', 'Service failure')}")
|
critical_issues.append(f"PVE Services: {services_status.get('reason', 'Service failure')}")
|
||||||
@@ -444,7 +459,9 @@ class HealthMonitor:
|
|||||||
warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}")
|
warning_issues.append(f"PVE Services: {services_status.get('reason', 'Service issue')}")
|
||||||
|
|
||||||
# Priority 1.5: Proxmox Storage Check (External Module)
|
# Priority 1.5: Proxmox Storage Check (External Module)
|
||||||
|
_t = time.time()
|
||||||
proxmox_storage_result = self._check_proxmox_storage()
|
proxmox_storage_result = self._check_proxmox_storage()
|
||||||
|
_perf_log("proxmox_storage", (time.time() - _t) * 1000)
|
||||||
if proxmox_storage_result: # Only process if the check ran (module available)
|
if proxmox_storage_result: # Only process if the check ran (module available)
|
||||||
details['storage'] = proxmox_storage_result
|
details['storage'] = proxmox_storage_result
|
||||||
if proxmox_storage_result.get('status') == 'CRITICAL':
|
if proxmox_storage_result.get('status') == 'CRITICAL':
|
||||||
@@ -459,7 +476,9 @@ class HealthMonitor:
|
|||||||
self.capabilities['has_lvm'] = any(t in ('lvm', 'lvmthin') for t in storage_types)
|
self.capabilities['has_lvm'] = any(t in ('lvm', 'lvmthin') for t in storage_types)
|
||||||
|
|
||||||
# Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors)
|
# Priority 2: Disk/Filesystem Health (Internal checks: usage, ZFS, SMART, IO errors)
|
||||||
|
_t = time.time()
|
||||||
storage_status = self._check_storage_optimized()
|
storage_status = self._check_storage_optimized()
|
||||||
|
_perf_log("storage_optimized", (time.time() - _t) * 1000)
|
||||||
details['disks'] = storage_status # Use 'disks' for filesystem/disk specific issues
|
details['disks'] = storage_status # Use 'disks' for filesystem/disk specific issues
|
||||||
if storage_status.get('status') == 'CRITICAL':
|
if storage_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage failure')}")
|
critical_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage failure')}")
|
||||||
@@ -467,7 +486,9 @@ class HealthMonitor:
|
|||||||
warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}")
|
warning_issues.append(f"Storage/Disks: {storage_status.get('reason', 'Disk/Storage issue')}")
|
||||||
|
|
||||||
# Priority 3: VMs/CTs Status (with persistence)
|
# Priority 3: VMs/CTs Status (with persistence)
|
||||||
|
_t = time.time()
|
||||||
vms_status = self._check_vms_cts_with_persistence()
|
vms_status = self._check_vms_cts_with_persistence()
|
||||||
|
_perf_log("vms_cts", (time.time() - _t) * 1000)
|
||||||
details['vms'] = vms_status
|
details['vms'] = vms_status
|
||||||
if vms_status.get('status') == 'CRITICAL':
|
if vms_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}")
|
critical_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT failure')}")
|
||||||
@@ -475,7 +496,9 @@ class HealthMonitor:
|
|||||||
warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}")
|
warning_issues.append(f"VMs/CTs: {vms_status.get('reason', 'VM/CT issue')}")
|
||||||
|
|
||||||
# Priority 4: Network Connectivity
|
# Priority 4: Network Connectivity
|
||||||
|
_t = time.time()
|
||||||
network_status = self._check_network_optimized()
|
network_status = self._check_network_optimized()
|
||||||
|
_perf_log("network", (time.time() - _t) * 1000)
|
||||||
details['network'] = network_status
|
details['network'] = network_status
|
||||||
if network_status.get('status') == 'CRITICAL':
|
if network_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}")
|
critical_issues.append(f"Network: {network_status.get('reason', 'Network failure')}")
|
||||||
@@ -483,7 +506,9 @@ class HealthMonitor:
|
|||||||
warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}")
|
warning_issues.append(f"Network: {network_status.get('reason', 'Network issue')}")
|
||||||
|
|
||||||
# Priority 5: CPU Usage (with hysteresis)
|
# Priority 5: CPU Usage (with hysteresis)
|
||||||
|
_t = time.time()
|
||||||
cpu_status = self._check_cpu_with_hysteresis()
|
cpu_status = self._check_cpu_with_hysteresis()
|
||||||
|
_perf_log("cpu", (time.time() - _t) * 1000)
|
||||||
details['cpu'] = cpu_status
|
details['cpu'] = cpu_status
|
||||||
if cpu_status.get('status') == 'CRITICAL':
|
if cpu_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}")
|
critical_issues.append(f"CPU: {cpu_status.get('reason', 'CPU critical')}")
|
||||||
@@ -491,7 +516,9 @@ class HealthMonitor:
|
|||||||
warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}")
|
warning_issues.append(f"CPU: {cpu_status.get('reason', 'CPU high')}")
|
||||||
|
|
||||||
# Priority 6: Memory Usage (RAM and Swap)
|
# Priority 6: Memory Usage (RAM and Swap)
|
||||||
|
_t = time.time()
|
||||||
memory_status = self._check_memory_comprehensive()
|
memory_status = self._check_memory_comprehensive()
|
||||||
|
_perf_log("memory", (time.time() - _t) * 1000)
|
||||||
details['memory'] = memory_status
|
details['memory'] = memory_status
|
||||||
if memory_status.get('status') == 'CRITICAL':
|
if memory_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}")
|
critical_issues.append(f"Memory: {memory_status.get('reason', 'Memory critical')}")
|
||||||
@@ -499,7 +526,9 @@ class HealthMonitor:
|
|||||||
warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}")
|
warning_issues.append(f"Memory: {memory_status.get('reason', 'Memory high')}")
|
||||||
|
|
||||||
# Priority 7: Log Analysis (with persistence)
|
# Priority 7: Log Analysis (with persistence)
|
||||||
|
_t = time.time()
|
||||||
logs_status = self._check_logs_with_persistence()
|
logs_status = self._check_logs_with_persistence()
|
||||||
|
_perf_log("logs", (time.time() - _t) * 1000)
|
||||||
details['logs'] = logs_status
|
details['logs'] = logs_status
|
||||||
if logs_status.get('status') == 'CRITICAL':
|
if logs_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}")
|
critical_issues.append(f"Logs: {logs_status.get('reason', 'Critical log errors')}")
|
||||||
@@ -507,7 +536,9 @@ class HealthMonitor:
|
|||||||
warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}")
|
warning_issues.append(f"Logs: {logs_status.get('reason', 'Log warnings')}")
|
||||||
|
|
||||||
# Priority 8: System Updates
|
# Priority 8: System Updates
|
||||||
|
_t = time.time()
|
||||||
updates_status = self._check_updates()
|
updates_status = self._check_updates()
|
||||||
|
_perf_log("updates", (time.time() - _t) * 1000)
|
||||||
details['updates'] = updates_status
|
details['updates'] = updates_status
|
||||||
if updates_status.get('status') == 'CRITICAL':
|
if updates_status.get('status') == 'CRITICAL':
|
||||||
critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}")
|
critical_issues.append(f"Updates: {updates_status.get('reason', 'System not updated')}")
|
||||||
@@ -517,13 +548,18 @@ class HealthMonitor:
|
|||||||
info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}")
|
info_issues.append(f"Updates: {updates_status.get('reason', 'Informational update notice')}")
|
||||||
|
|
||||||
# Priority 9: Security Checks
|
# Priority 9: Security Checks
|
||||||
|
_t = time.time()
|
||||||
security_status = self._check_security()
|
security_status = self._check_security()
|
||||||
|
_perf_log("security", (time.time() - _t) * 1000)
|
||||||
details['security'] = security_status
|
details['security'] = security_status
|
||||||
if security_status.get('status') == 'WARNING':
|
if security_status.get('status') == 'WARNING':
|
||||||
warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}")
|
warning_issues.append(f"Security: {security_status.get('reason', 'Security issue')}")
|
||||||
elif security_status.get('status') == 'INFO':
|
elif security_status.get('status') == 'INFO':
|
||||||
info_issues.append(f"Security: {security_status.get('reason', 'Security information')}")
|
info_issues.append(f"Security: {security_status.get('reason', 'Security information')}")
|
||||||
|
|
||||||
|
# Log total time for all checks
|
||||||
|
_perf_log("TOTAL_HEALTH_CHECK", (time.time() - _t_total) * 1000)
|
||||||
|
|
||||||
# --- Track UNKNOWN counts and persist if >= 3 consecutive cycles ---
|
# --- Track UNKNOWN counts and persist if >= 3 consecutive cycles ---
|
||||||
unknown_issues = []
|
unknown_issues = []
|
||||||
for cat_key, cat_data in details.items():
|
for cat_key, cat_data in details.items():
|
||||||
|
|||||||
Reference in New Issue
Block a user