mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-05 20:03:48 +00:00
update health_monitor.py
This commit is contained in:
@@ -150,7 +150,7 @@ class HealthMonitor:
|
|||||||
r'zfs.*scrub (started|finished|in progress)',
|
r'zfs.*scrub (started|finished|in progress)',
|
||||||
r'zpool.*resilver',
|
r'zpool.*resilver',
|
||||||
|
|
||||||
# ── LXC/Container normal operations ──
|
# ── LXC/Container normal operations ──
|
||||||
r'lxc.*monitor',
|
r'lxc.*monitor',
|
||||||
r'systemd\[1\]: (started|stopped) .*\.scope',
|
r'systemd\[1\]: (started|stopped) .*\.scope',
|
||||||
|
|
||||||
@@ -837,90 +837,95 @@ class HealthMonitor:
|
|||||||
return self.cached_results.get(cache_key)
|
return self.cached_results.get(cache_key)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use shared journalctl cache to avoid duplicate calls
|
# Read temperature directly from sensors command (not journalctl)
|
||||||
journalctl_output = self._get_journalctl_10min_warnings()
|
result = subprocess.run(
|
||||||
|
['sensors', '-u'],
|
||||||
|
capture_output=True, text=True, timeout=3
|
||||||
|
)
|
||||||
|
|
||||||
if journalctl_output:
|
temps = []
|
||||||
temps = []
|
if result.returncode == 0 and result.stdout:
|
||||||
for line in journalctl_output.split('\n'):
|
for line in result.stdout.split('\n'):
|
||||||
if 'temp' in line.lower() and '_input' in line:
|
# Look for temperature input lines like "temp1_input: 42.000"
|
||||||
|
if '_input' in line and 'temp' in line.lower():
|
||||||
try:
|
try:
|
||||||
temp = float(line.split(':')[1].strip())
|
temp = float(line.split(':')[1].strip())
|
||||||
temps.append(temp)
|
if 0 < temp < 150: # Sanity check for valid temp range
|
||||||
|
temps.append(temp)
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if temps:
|
||||||
|
max_temp = max(temps)
|
||||||
|
|
||||||
if temps:
|
state_key = 'cpu_temp_history'
|
||||||
max_temp = max(temps)
|
# Add this reading (supplements the sampler thread)
|
||||||
|
self.state_history[state_key].append({
|
||||||
|
'value': max_temp,
|
||||||
|
'time': current_time
|
||||||
|
})
|
||||||
|
|
||||||
|
# Snapshot for thread-safe reading, then atomic prune
|
||||||
|
temp_snapshot = list(self.state_history[state_key])
|
||||||
|
self.state_history[state_key] = [
|
||||||
|
entry for entry in temp_snapshot
|
||||||
|
if current_time - entry['time'] < 240
|
||||||
|
]
|
||||||
|
|
||||||
|
# Check if temperature >80°C for more than 3 minutes (180 seconds)
|
||||||
|
high_temp_samples = [
|
||||||
|
entry for entry in self.state_history[state_key]
|
||||||
|
if entry['value'] > 80 and current_time - entry['time'] <= 180
|
||||||
|
]
|
||||||
|
|
||||||
|
# Check if temperature ≤80°C for last 30 seconds (recovery)
|
||||||
|
recovery_samples = [
|
||||||
|
entry for entry in self.state_history[state_key]
|
||||||
|
if entry['value'] <= 80 and current_time - entry['time'] <= 30
|
||||||
|
]
|
||||||
|
|
||||||
|
# Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
|
||||||
|
if len(high_temp_samples) >= 18:
|
||||||
|
# Temperature has been >80°C for >3 minutes
|
||||||
|
status = 'WARNING'
|
||||||
|
reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
|
||||||
|
|
||||||
state_key = 'cpu_temp_history'
|
# Record non-dismissable error
|
||||||
# Add this reading (supplements the sampler thread)
|
health_persistence.record_error(
|
||||||
self.state_history[state_key].append({
|
error_key='cpu_temperature',
|
||||||
'value': max_temp,
|
category='temperature',
|
||||||
'time': current_time
|
severity='WARNING',
|
||||||
})
|
reason=reason,
|
||||||
|
details={'temperature': max_temp, 'dismissable': False}
|
||||||
# Snapshot for thread-safe reading, then atomic prune
|
)
|
||||||
temp_snapshot = list(self.state_history[state_key])
|
elif len(recovery_samples) >= 3:
|
||||||
self.state_history[state_key] = [
|
# Temperature has been ≤80°C for 30 seconds - clear the error
|
||||||
entry for entry in temp_snapshot
|
status = 'OK'
|
||||||
if current_time - entry['time'] < 240
|
reason = None
|
||||||
]
|
health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
|
||||||
|
else:
|
||||||
# Check if temperature >80°C for more than 3 minutes (180 seconds)
|
# Temperature is elevated but not long enough, or recovering but not yet cleared
|
||||||
high_temp_samples = [
|
# Check if we already have an active error
|
||||||
entry for entry in self.state_history[state_key]
|
if health_persistence.is_error_active('cpu_temperature', category='temperature'):
|
||||||
if entry['value'] > 80 and current_time - entry['time'] <= 180
|
# Keep the warning active
|
||||||
]
|
|
||||||
|
|
||||||
# Check if temperature ≤80°C for last 30 seconds (recovery)
|
|
||||||
recovery_samples = [
|
|
||||||
entry for entry in self.state_history[state_key]
|
|
||||||
if entry['value'] <= 80 and current_time - entry['time'] <= 30
|
|
||||||
]
|
|
||||||
|
|
||||||
# Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
|
|
||||||
if len(high_temp_samples) >= 18:
|
|
||||||
# Temperature has been >80°C for >3 minutes
|
|
||||||
status = 'WARNING'
|
status = 'WARNING'
|
||||||
reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
|
reason = f'CPU temperature {max_temp}°C still elevated'
|
||||||
|
else:
|
||||||
# Record non-dismissable error
|
# No active warning yet
|
||||||
health_persistence.record_error(
|
|
||||||
error_key='cpu_temperature',
|
|
||||||
category='temperature',
|
|
||||||
severity='WARNING',
|
|
||||||
reason=reason,
|
|
||||||
details={'temperature': max_temp, 'dismissable': False}
|
|
||||||
)
|
|
||||||
elif len(recovery_samples) >= 3:
|
|
||||||
# Temperature has been ≤80°C for 30 seconds - clear the error
|
|
||||||
status = 'OK'
|
status = 'OK'
|
||||||
reason = None
|
reason = None
|
||||||
health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
|
|
||||||
else:
|
temp_result = {
|
||||||
# Temperature is elevated but not long enough, or recovering but not yet cleared
|
'status': status,
|
||||||
# Check if we already have an active error
|
'value': round(max_temp, 1),
|
||||||
if health_persistence.is_error_active('cpu_temperature', category='temperature'):
|
'unit': '°C'
|
||||||
# Keep the warning active
|
}
|
||||||
status = 'WARNING'
|
if reason:
|
||||||
reason = f'CPU temperature {max_temp}°C still elevated'
|
temp_result['reason'] = reason
|
||||||
else:
|
|
||||||
# No active warning yet
|
self.cached_results[cache_key] = temp_result
|
||||||
status = 'OK'
|
self.last_check_times[cache_key] = current_time
|
||||||
reason = None
|
return temp_result
|
||||||
|
|
||||||
temp_result = {
|
|
||||||
'status': status,
|
|
||||||
'value': round(max_temp, 1),
|
|
||||||
'unit': '°C'
|
|
||||||
}
|
|
||||||
if reason:
|
|
||||||
temp_result['reason'] = reason
|
|
||||||
|
|
||||||
self.cached_results[cache_key] = temp_result
|
|
||||||
self.last_check_times[cache_key] = current_time
|
|
||||||
return temp_result
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -967,23 +967,21 @@ class HealthPersistence:
|
|||||||
cutoff_events = (now - timedelta(days=30)).isoformat()
|
cutoff_events = (now - timedelta(days=30)).isoformat()
|
||||||
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
|
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
|
||||||
|
|
||||||
# ── Auto-resolve transient log errors after system reboot ──
|
# ── Auto-resolve transient errors after system stabilizes ──
|
||||||
# OOM, service failures, timeouts are transient - a reboot resolves them.
|
# Transient errors (OOM, high CPU, service failures) resolve themselves.
|
||||||
# If the system has been up for >1 hour and these errors haven't recurred,
|
# If the system has been up for >10 minutes and these errors haven't recurred,
|
||||||
# they are from a previous boot and should be auto-resolved.
|
# they are stale and should be auto-resolved.
|
||||||
#
|
|
||||||
# Logic: If uptime > 1 hour AND error.last_seen is not within the last 30 minutes,
|
|
||||||
# the error is stale (from before the current stable state) and should be resolved.
|
|
||||||
try:
|
try:
|
||||||
|
import psutil
|
||||||
# Get system uptime
|
# Get system uptime
|
||||||
with open('/proc/uptime', 'r') as f:
|
with open('/proc/uptime', 'r') as f:
|
||||||
uptime_seconds = float(f.read().split()[0])
|
uptime_seconds = float(f.read().split()[0])
|
||||||
|
|
||||||
# Only auto-resolve if system has been stable for at least 1 hour
|
# Only auto-resolve if system has been stable for at least 10 minutes
|
||||||
if uptime_seconds > 3600: # 1 hour
|
if uptime_seconds > 600: # 10 minutes
|
||||||
# Resolve transient log errors that haven't been seen in the last 30 minutes
|
stale_cutoff = (now - timedelta(minutes=10)).isoformat()
|
||||||
# If they were real current issues, journalctl -b 0 would have detected them recently
|
|
||||||
stale_cutoff = (now - timedelta(minutes=30)).isoformat()
|
# 1. Resolve transient log errors (OOM, service failures)
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
UPDATE errors
|
UPDATE errors
|
||||||
SET resolved_at = ?
|
SET resolved_at = ?
|
||||||
@@ -999,6 +997,42 @@ class HealthPersistence:
|
|||||||
OR reason LIKE '%timeout%'
|
OR reason LIKE '%timeout%'
|
||||||
OR reason LIKE '%critical error%')
|
OR reason LIKE '%critical error%')
|
||||||
''', (now_iso, stale_cutoff))
|
''', (now_iso, stale_cutoff))
|
||||||
|
|
||||||
|
# 2. Auto-resolve CPU errors if current CPU is normal (<75%)
|
||||||
|
try:
|
||||||
|
current_cpu = psutil.cpu_percent(interval=0.1)
|
||||||
|
if current_cpu < 75:
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET resolved_at = ?
|
||||||
|
WHERE category = 'temperature'
|
||||||
|
AND resolved_at IS NULL
|
||||||
|
AND acknowledged = 0
|
||||||
|
AND last_seen < ?
|
||||||
|
AND (error_key = 'cpu_usage'
|
||||||
|
OR reason LIKE '%CPU >%sustained%'
|
||||||
|
OR reason LIKE '%Sustained high CPU%')
|
||||||
|
''', (now_iso, stale_cutoff))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 3. Auto-resolve memory errors if current memory is normal (<80%)
|
||||||
|
try:
|
||||||
|
current_mem = psutil.virtual_memory().percent
|
||||||
|
if current_mem < 80:
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE errors
|
||||||
|
SET resolved_at = ?
|
||||||
|
WHERE category = 'memory'
|
||||||
|
AND resolved_at IS NULL
|
||||||
|
AND acknowledged = 0
|
||||||
|
AND last_seen < ?
|
||||||
|
AND (reason LIKE '%Memory >%'
|
||||||
|
OR reason LIKE '%RAM usage%')
|
||||||
|
''', (now_iso, stale_cutoff))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
pass # If we can't read uptime, skip this cleanup
|
pass # If we can't read uptime, skip this cleanup
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user