mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-23 20:10:39 +00:00
update health_monitor.py
This commit is contained in:
@@ -1088,43 +1088,50 @@ def _health_collector_loop():
|
||||
|
||||
|
||||
def _vital_signs_sampler():
|
||||
"""Dedicated thread for rapid CPU & temperature sampling.
|
||||
|
||||
"""Dedicated thread for rapid CPU, memory & temperature sampling.
|
||||
|
||||
Runs independently of the 5-min health collector loop.
|
||||
- CPU usage: sampled every 30s (3 samples in 1.5 min for hysteresis)
|
||||
- CPU usage: sampled every 30s (10 samples in 5 min for sustained detection)
|
||||
- Memory: sampled every 30s (10 samples in 5 min for sustained detection)
|
||||
- Temperature: sampled every 15s (12 samples in 3 min for temporal logic)
|
||||
Uses time.monotonic() to avoid drift.
|
||||
|
||||
Staggered intervals: CPU at offset 0, Temp at offset 7s to avoid collision.
|
||||
|
||||
Staggered intervals to avoid collision: CPU at 0, Temp at +7s, Mem at +15s.
|
||||
"""
|
||||
from health_monitor import health_monitor
|
||||
|
||||
|
||||
# Wait 15s after startup for sensors to be ready
|
||||
time.sleep(15)
|
||||
|
||||
|
||||
TEMP_INTERVAL = 15 # seconds (was 10s - reduced frequency by 33%)
|
||||
CPU_INTERVAL = 30 # seconds
|
||||
|
||||
# Stagger: CPU starts immediately, Temp starts after 7s offset
|
||||
MEM_INTERVAL = 30 # seconds (aligned with CPU for sustained-RAM detection)
|
||||
|
||||
# Stagger: CPU starts immediately, Temp after 7s, Mem after 15s
|
||||
next_cpu = time.monotonic()
|
||||
next_temp = time.monotonic() + 7
|
||||
|
||||
print("[ProxMenux] Vital signs sampler started (CPU: 30s, Temp: 10s)")
|
||||
|
||||
next_mem = time.monotonic() + 15
|
||||
|
||||
print("[ProxMenux] Vital signs sampler started (CPU: 30s, Mem: 30s, Temp: 15s)")
|
||||
|
||||
while True:
|
||||
try:
|
||||
now = time.monotonic()
|
||||
|
||||
|
||||
if now >= next_temp:
|
||||
health_monitor._sample_cpu_temperature()
|
||||
next_temp = now + TEMP_INTERVAL
|
||||
|
||||
|
||||
if now >= next_cpu:
|
||||
health_monitor._sample_cpu_usage()
|
||||
next_cpu = now + CPU_INTERVAL
|
||||
|
||||
|
||||
if now >= next_mem:
|
||||
health_monitor._sample_memory_usage()
|
||||
next_mem = now + MEM_INTERVAL
|
||||
|
||||
# Sleep until the next earliest event (with 0.5s min to avoid busy-loop)
|
||||
sleep_until = min(next_temp, next_cpu) - time.monotonic()
|
||||
sleep_until = min(next_temp, next_cpu, next_mem) - time.monotonic()
|
||||
time.sleep(max(sleep_until, 0.5))
|
||||
except Exception as e:
|
||||
print(f"[ProxMenux] Vital signs sampler error: {e}")
|
||||
@@ -1160,7 +1167,7 @@ _pvesh_cache = {
|
||||
'storage_list': None,
|
||||
'storage_list_time': 0,
|
||||
}
|
||||
_PVESH_CACHE_TTL = 30 # 30 seconds - balances freshness with performance
|
||||
_PVESH_CACHE_TTL = 5 # 5 seconds - near real-time for active UI; pvesh local cost is ~200-400ms
|
||||
|
||||
# Cache for sensors output (temperature readings)
|
||||
_sensors_cache = {
|
||||
@@ -1169,6 +1176,15 @@ _sensors_cache = {
|
||||
}
|
||||
_SENSORS_CACHE_TTL = 10 # 10 seconds - temperature changes slowly
|
||||
|
||||
# Cache for ipmitool sensor output (shared between fans, power supplies, power meter)
|
||||
# ipmitool is slow (1-3s per call) and was called twice per /api/hardware hit.
|
||||
_ipmi_cache = {
|
||||
'output': None,
|
||||
'time': 0,
|
||||
'unavailable': False, # set True if ipmitool is missing, avoid retrying
|
||||
}
|
||||
_IPMI_CACHE_TTL = 10 # 10 seconds
|
||||
|
||||
# Cache for hardware info (lspci, dmidecode, lsblk)
|
||||
_hardware_cache = {
|
||||
'lspci': None,
|
||||
@@ -3820,13 +3836,42 @@ def get_proxmox_vms():
|
||||
# Return empty array instead of error object - frontend expects array
|
||||
return []
|
||||
|
||||
def get_ipmi_fans():
|
||||
"""Get fan information from IPMI"""
|
||||
fans = []
|
||||
def get_cached_ipmi_sensors():
    """Return the raw text of ``ipmitool sensor``, cached for a short time.

    The output is shared by the IPMI fan and power parsers so that a single
    request does not run the slow ``ipmitool`` binary more than once.

    Both successful and failed refreshes are remembered for
    ``_IPMI_CACHE_TTL`` seconds. Without remembering failures, every poll
    would re-run a command that can block for the full 10s timeout while
    the BMC is unresponsive.

    Returns:
        str: The ``ipmitool sensor`` stdout. An empty string is returned when
        ipmitool is not installed or has never produced a successful
        reading. If a refresh fails, the last successful output (if any) is
        returned instead.
    """
    # A missing binary is remembered permanently so we never retry it.
    if _ipmi_cache['unavailable']:
        return ''

    cached = _ipmi_cache['output']
    if cached is not None and time.time() - _ipmi_cache['time'] < _IPMI_CACHE_TTL:
        return cached

    result = None
    try:
        result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10)
    except FileNotFoundError:
        _ipmi_cache['unavailable'] = True
        return ''
    except Exception:
        # Timeout, permission error, etc.: handled by the failure path below.
        pass

    # Stamp after the call: a timed-out run can take as long as the TTL
    # itself, so a timestamp taken before it would already be expired.
    _ipmi_cache['time'] = time.time()

    if result is not None and result.returncode == 0:
        _ipmi_cache['output'] = result.stdout
        return result.stdout

    # Failed refresh: keep serving the previous output (or '' if there is
    # none) until the TTL elapses, then try again.
    if cached is None:
        _ipmi_cache['output'] = ''
    return _ipmi_cache['output']
|
||||
|
||||
|
||||
def get_ipmi_fans():
|
||||
"""Get fan information from IPMI (uses cached sensor output)."""
|
||||
fans = []
|
||||
try:
|
||||
output = get_cached_ipmi_sensors()
|
||||
if output:
|
||||
for line in output.split('\n'):
|
||||
if 'fan' in line.lower() and '|' in line:
|
||||
parts = [p.strip() for p in line.split('|')]
|
||||
if len(parts) >= 3:
|
||||
@@ -3862,14 +3907,14 @@ def get_ipmi_fans():
|
||||
return fans
|
||||
|
||||
def get_ipmi_power():
|
||||
"""Get power supply information from IPMI"""
|
||||
"""Get power supply information from IPMI (uses cached sensor output)."""
|
||||
power_supplies = []
|
||||
power_meter = None
|
||||
|
||||
|
||||
try:
|
||||
result = subprocess.run(['ipmitool', 'sensor'], capture_output=True, text=True, timeout=10)
|
||||
if result.returncode == 0:
|
||||
for line in result.stdout.split('\n'):
|
||||
output = get_cached_ipmi_sensors()
|
||||
if output:
|
||||
for line in output.split('\n'):
|
||||
if ('power supply' in line.lower() or 'power meter' in line.lower()) and '|' in line:
|
||||
parts = [p.strip() for p in line.split('|')]
|
||||
if len(parts) >= 3:
|
||||
@@ -4202,7 +4247,97 @@ def identify_fan(sensor_name, adapter, chip_name=None):
|
||||
return sensor_name
|
||||
|
||||
# Default: return original name
|
||||
return sensor_name
|
||||
return sensor_name
|
||||
|
||||
|
||||
def _parse_sensor_fans(sensors_output):
|
||||
"""Parse fan entries from `sensors` output. Extracted for reuse between
|
||||
get_hardware_info (static full payload) and get_hardware_live_info (live endpoint)."""
|
||||
fans = []
|
||||
if not sensors_output:
|
||||
return fans
|
||||
current_adapter = None
|
||||
current_chip = None
|
||||
for line in sensors_output.split('\n'):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
if not ':' in line and not line.startswith(' ') and not line.startswith('Adapter'):
|
||||
current_chip = line
|
||||
continue
|
||||
if line.startswith('Adapter:'):
|
||||
current_adapter = line.replace('Adapter:', '').strip()
|
||||
continue
|
||||
if ':' in line and not line.startswith(' '):
|
||||
parts = line.split(':', 1)
|
||||
sensor_name = parts[0].strip()
|
||||
value_part = parts[1].strip()
|
||||
if 'RPM' in value_part:
|
||||
rpm_match = re.search(r'([\d.]+)\s*RPM', value_part)
|
||||
if rpm_match:
|
||||
fan_speed = int(float(rpm_match.group(1)))
|
||||
identified_name = identify_fan(sensor_name, current_adapter, current_chip)
|
||||
fans.append({
|
||||
'name': identified_name,
|
||||
'original_name': sensor_name,
|
||||
'speed': fan_speed,
|
||||
'unit': 'RPM',
|
||||
'adapter': current_adapter
|
||||
})
|
||||
return fans
|
||||
|
||||
|
||||
def get_hardware_live_info():
    """Collect only the dynamic hardware readings served by /api/hardware/live.

    Gathers temperatures, fans, power meter, power supplies and UPS data.
    Each source is queried independently and on a best-effort basis: if one
    collector raises, its field keeps the default value and the remaining
    collectors still run.

    Returns:
        dict: Keys 'temperatures' (list), 'fans' (list), 'power_meter'
        (value or None), 'power_supplies' (list) and 'ups' (value or None).
    """
    live = {
        'temperatures': [],
        'fans': [],
        'power_meter': None,
        'power_supplies': [],
        'ups': None,
    }

    # Temperatures, plus the power meter when the sensors data provides one.
    try:
        temps = get_temperature_info()
        live['temperatures'] = temps.get('temperatures', [])
        live['power_meter'] = temps.get('power_meter')
    except Exception:
        pass

    # Fans come from two sources: the `sensors` output and IPMI.
    try:
        fans_from_sensors = _parse_sensor_fans(get_cached_sensors_output())
    except Exception:
        fans_from_sensors = []

    try:
        fans_from_ipmi = get_ipmi_fans()
    except Exception:
        fans_from_ipmi = []

    live['fans'] = fans_from_sensors + fans_from_ipmi

    # Power supplies from IPMI; its power meter is only a fallback.
    try:
        ipmi_power = get_ipmi_power()
        if ipmi_power:
            live['power_supplies'] = ipmi_power.get('power_supplies', [])
            if live['power_meter'] is None:
                if ipmi_power.get('power_meter'):
                    live['power_meter'] = ipmi_power['power_meter']
    except Exception:
        pass

    # UPS data, when any is reported.
    try:
        ups = get_ups_info()
        if ups:
            live['ups'] = ups
    except Exception:
        pass

    return live
|
||||
|
||||
|
||||
def get_temperature_info():
|
||||
@@ -6102,52 +6237,8 @@ def get_hardware_info():
|
||||
pass
|
||||
|
||||
try:
|
||||
sensors_output = get_cached_sensors_output()
|
||||
if sensors_output:
|
||||
current_adapter = None
|
||||
current_chip = None # Add chip name tracking
|
||||
fans = []
|
||||
|
||||
for line in sensors_output.split('\n'):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Chip names don't have ":" and are not indented
|
||||
if not ':' in line and not line.startswith(' ') and not line.startswith('Adapter'):
|
||||
current_chip = line
|
||||
continue
|
||||
|
||||
# Detect adapter line
|
||||
if line.startswith('Adapter:'):
|
||||
current_adapter = line.replace('Adapter:', '').strip()
|
||||
continue
|
||||
|
||||
# Parse fan sensors
|
||||
if ':' in line and not line.startswith(' '):
|
||||
parts = line.split(':', 1)
|
||||
sensor_name = parts[0].strip()
|
||||
value_part = parts[1].strip()
|
||||
|
||||
# Look for fan sensors (RPM)
|
||||
if 'RPM' in value_part:
|
||||
rpm_match = re.search(r'([\d.]+)\s*RPM', value_part)
|
||||
if rpm_match:
|
||||
fan_speed = int(float(rpm_match.group(1)))
|
||||
|
||||
identified_name = identify_fan(sensor_name, current_adapter, current_chip)
|
||||
|
||||
fans.append({
|
||||
'name': identified_name,
|
||||
'original_name': sensor_name,
|
||||
'speed': fan_speed,
|
||||
'unit': 'RPM',
|
||||
'adapter': current_adapter
|
||||
})
|
||||
|
||||
hardware_data['sensors']['fans'] = fans
|
||||
except Exception as e:
|
||||
# print(f"[v0] Error getting fan sensors: {e}")
|
||||
hardware_data['sensors']['fans'] = _parse_sensor_fans(get_cached_sensors_output())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Power Supply / UPS
|
||||
@@ -6226,7 +6317,9 @@ def get_hardware_info():
|
||||
def api_system():
|
||||
"""Get system information including CPU, memory, and temperature"""
|
||||
try:
|
||||
cpu_usage = psutil.cpu_percent(interval=0.5)
|
||||
# Non-blocking: returns %CPU since the last psutil call (sampler or prior API hit).
|
||||
# The background vital-signs sampler keeps psutil's internal state primed.
|
||||
cpu_usage = psutil.cpu_percent(interval=0)
|
||||
|
||||
memory = psutil.virtual_memory()
|
||||
memory_used_gb = memory.used / (1024 ** 3)
|
||||
@@ -9286,6 +9379,23 @@ def api_hardware():
|
||||
traceback.print_exc()
|
||||
return jsonify({'error': str(e)}), 500
|
||||
|
||||
@app.route('/api/hardware/live', methods=['GET'])
@require_auth
def api_hardware_live():
    """Serve the dynamic hardware readings (temps, fans, power, UPS) as JSON.

    Intended for frequent polling by the Hardware page: it returns only what
    get_hardware_live_info() collects, not the full static hardware payload.

    Returns:
        The JSON payload on success, or a JSON ``{'error': ...}`` body with
        HTTP status 500 if collection raises.
    """
    try:
        payload = get_hardware_live_info()
        return jsonify(payload)
    except Exception as exc:
        import traceback
        # Print the full traceback server-side; the client only gets the message.
        traceback.print_exc()
        error_body = {'error': str(exc)}
        return jsonify(error_body), 500
|
||||
|
||||
|
||||
@app.route('/api/gpu/<slot>/realtime', methods=['GET'])
|
||||
@require_auth
|
||||
def api_gpu_realtime(slot):
|
||||
@@ -9526,8 +9636,11 @@ def api_vm_control(vmid):
|
||||
control_result = subprocess.run(
|
||||
['pvesh', 'create', f'/nodes/{node}/{vm_type}/{vmid}/status/{action}'],
|
||||
capture_output=True, text=True, timeout=30)
|
||||
|
||||
|
||||
if control_result.returncode == 0:
|
||||
# Invalidate VM resources cache so the next /api/vms call
|
||||
# returns fresh status instead of the pre-action snapshot.
|
||||
_pvesh_cache['cluster_resources_vm_time'] = 0
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'vmid': vmid,
|
||||
|
||||
@@ -67,7 +67,7 @@ class HealthMonitor:
|
||||
# Memory Thresholds
|
||||
MEMORY_WARNING = 85
|
||||
MEMORY_CRITICAL = 95
|
||||
MEMORY_DURATION = 60
|
||||
MEMORY_DURATION = 300 # 5 minutes sustained (aligned with CPU)
|
||||
SWAP_WARNING_DURATION = 300
|
||||
SWAP_CRITICAL_PERCENT = 5
|
||||
SWAP_CRITICAL_DURATION = 120
|
||||
@@ -402,6 +402,30 @@ class HealthMonitor:
|
||||
except Exception:
|
||||
pass # Sampling must never crash the thread
|
||||
|
||||
def _sample_memory_usage(self):
    """Record one RAM/swap sample in ``self.state_history['memory_usage']``.

    Each sample holds the RAM usage percent, the swap usage percent (0 when
    no swap is configured), swap used as a percentage of total RAM, and the
    wall-clock time of the sample. Samples older than 10 minutes are dropped
    after the new one is added. Any error is swallowed so that a failed
    sample cannot stop the sampler thread.
    """
    try:
        ram = psutil.virtual_memory()
        swap_info = psutil.swap_memory()
        now = time.time()

        sample = {
            'mem_percent': ram.percent,
            'swap_percent': swap_info.percent if swap_info.total > 0 else 0,
            'swap_vs_ram': (swap_info.used / ram.total * 100) if ram.total > 0 else 0,
            'time': now
        }

        history_key = 'memory_usage'
        self.state_history[history_key].append(sample)

        # Keep only the samples taken within the last 10 minutes.
        max_age = 600
        self.state_history[history_key] = [
            item for item in self.state_history[history_key]
            if now - item['time'] < max_age
        ]
    except Exception:
        pass  # Sampling must never crash the thread
|
||||
|
||||
def _sample_cpu_temperature(self):
|
||||
"""Lightweight temperature sample: read sensor and append to history. ~50ms cost."""
|
||||
try:
|
||||
@@ -1050,34 +1074,46 @@ class HealthMonitor:
|
||||
if current_time - entry['time'] < 600
|
||||
]
|
||||
|
||||
mem_critical = sum(
|
||||
1 for entry in self.state_history[state_key]
|
||||
mem_critical_samples = [
|
||||
entry for entry in self.state_history[state_key]
|
||||
if entry['mem_percent'] >= 90 and
|
||||
current_time - entry['time'] <= self.MEMORY_DURATION
|
||||
)
|
||||
|
||||
mem_warning = sum(
|
||||
1 for entry in self.state_history[state_key]
|
||||
]
|
||||
|
||||
mem_warning_samples = [
|
||||
entry for entry in self.state_history[state_key]
|
||||
if entry['mem_percent'] >= self.MEMORY_WARNING and
|
||||
current_time - entry['time'] <= self.MEMORY_DURATION
|
||||
)
|
||||
|
||||
]
|
||||
|
||||
swap_critical = sum(
|
||||
1 for entry in self.state_history[state_key]
|
||||
if entry['swap_vs_ram'] > 20 and
|
||||
current_time - entry['time'] <= self.SWAP_CRITICAL_DURATION
|
||||
)
|
||||
|
||||
|
||||
if mem_critical >= 2:
|
||||
|
||||
# Require sustained high usage across most of the 300s window.
|
||||
# With ~30s sampling: 300s = ~10 samples, so 8 ≈ 240s sustained.
|
||||
# Mirrors CPU's ~83% coverage threshold (25/30).
|
||||
MEM_CRITICAL_MIN_SAMPLES = 8
|
||||
MEM_WARNING_MIN_SAMPLES = 8
|
||||
|
||||
mem_critical_count = len(mem_critical_samples)
|
||||
mem_warning_count = len(mem_warning_samples)
|
||||
|
||||
if mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES:
|
||||
oldest = min(s['time'] for s in mem_critical_samples)
|
||||
actual_duration = int(current_time - oldest)
|
||||
status = 'CRITICAL'
|
||||
reason = f'RAM >90% for {self.MEMORY_DURATION}s'
|
||||
reason = f'RAM >90% sustained for {actual_duration}s'
|
||||
elif swap_critical >= 2:
|
||||
status = 'CRITICAL'
|
||||
reason = f'Swap >20% of RAM ({swap_vs_ram:.1f}%)'
|
||||
elif mem_warning >= 2:
|
||||
elif mem_warning_count >= MEM_WARNING_MIN_SAMPLES:
|
||||
oldest = min(s['time'] for s in mem_warning_samples)
|
||||
actual_duration = int(current_time - oldest)
|
||||
status = 'WARNING'
|
||||
reason = f'RAM >{self.MEMORY_WARNING}% for {self.MEMORY_DURATION}s'
|
||||
reason = f'RAM >{self.MEMORY_WARNING}% sustained for {actual_duration}s'
|
||||
else:
|
||||
status = 'OK'
|
||||
reason = None
|
||||
@@ -1088,7 +1124,7 @@ class HealthMonitor:
|
||||
swap_total_gb = round(swap.total / (1024**3), 2)
|
||||
|
||||
# Determine per-sub-check status
|
||||
ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical >= 2 else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning >= 2 else 'OK')
|
||||
ram_status = 'CRITICAL' if mem_percent >= 90 and mem_critical_count >= MEM_CRITICAL_MIN_SAMPLES else ('WARNING' if mem_percent >= self.MEMORY_WARNING and mem_warning_count >= MEM_WARNING_MIN_SAMPLES else 'OK')
|
||||
swap_status = 'CRITICAL' if swap_critical >= 2 else 'OK'
|
||||
|
||||
result = {
|
||||
|
||||
Reference in New Issue
Block a user