update health_monitor.py

This commit is contained in:
MacRimi
2026-04-05 12:17:42 +02:00
parent 4c72d0b3ef
commit 95e876b37f
2 changed files with 126 additions and 87 deletions

View File

@@ -150,7 +150,7 @@ class HealthMonitor:
r'zfs.*scrub (started|finished|in progress)', r'zfs.*scrub (started|finished|in progress)',
r'zpool.*resilver', r'zpool.*resilver',
# ─ LXC/Container normal operations ── # ── LXC/Container normal operations ──
r'lxc.*monitor', r'lxc.*monitor',
r'systemd\[1\]: (started|stopped) .*\.scope', r'systemd\[1\]: (started|stopped) .*\.scope',
@@ -837,90 +837,95 @@ class HealthMonitor:
return self.cached_results.get(cache_key) return self.cached_results.get(cache_key)
try: try:
# Use shared journalctl cache to avoid duplicate calls # Read temperature directly from sensors command (not journalctl)
journalctl_output = self._get_journalctl_10min_warnings() result = subprocess.run(
['sensors', '-u'],
capture_output=True, text=True, timeout=3
)
if journalctl_output: temps = []
temps = [] if result.returncode == 0 and result.stdout:
for line in journalctl_output.split('\n'): for line in result.stdout.split('\n'):
if 'temp' in line.lower() and '_input' in line: # Look for temperature input lines like "temp1_input: 42.000"
if '_input' in line and 'temp' in line.lower():
try: try:
temp = float(line.split(':')[1].strip()) temp = float(line.split(':')[1].strip())
temps.append(temp) if 0 < temp < 150: # Sanity check for valid temp range
temps.append(temp)
except: except:
continue continue
if temps:
max_temp = max(temps)
if temps: state_key = 'cpu_temp_history'
max_temp = max(temps) # Add this reading (supplements the sampler thread)
self.state_history[state_key].append({
'value': max_temp,
'time': current_time
})
# Snapshot for thread-safe reading, then atomic prune
temp_snapshot = list(self.state_history[state_key])
self.state_history[state_key] = [
entry for entry in temp_snapshot
if current_time - entry['time'] < 240
]
# Check if temperature >80°C for more than 3 minutes (180 seconds)
high_temp_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] > 80 and current_time - entry['time'] <= 180
]
# Check if temperature ≤80°C for last 30 seconds (recovery)
recovery_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] <= 80 and current_time - entry['time'] <= 30
]
# Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
if len(high_temp_samples) >= 18:
# Temperature has been >80°C for >3 minutes
status = 'WARNING'
reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
state_key = 'cpu_temp_history' # Record non-dismissable error
# Add this reading (supplements the sampler thread) health_persistence.record_error(
self.state_history[state_key].append({ error_key='cpu_temperature',
'value': max_temp, category='temperature',
'time': current_time severity='WARNING',
}) reason=reason,
details={'temperature': max_temp, 'dismissable': False}
# Snapshot for thread-safe reading, then atomic prune )
temp_snapshot = list(self.state_history[state_key]) elif len(recovery_samples) >= 3:
self.state_history[state_key] = [ # Temperature has been ≤80°C for 30 seconds - clear the error
entry for entry in temp_snapshot status = 'OK'
if current_time - entry['time'] < 240 reason = None
] health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
else:
# Check if temperature >80°C for more than 3 minutes (180 seconds) # Temperature is elevated but not long enough, or recovering but not yet cleared
high_temp_samples = [ # Check if we already have an active error
entry for entry in self.state_history[state_key] if health_persistence.is_error_active('cpu_temperature', category='temperature'):
if entry['value'] > 80 and current_time - entry['time'] <= 180 # Keep the warning active
]
# Check if temperature ≤80°C for last 30 seconds (recovery)
recovery_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] <= 80 and current_time - entry['time'] <= 30
]
# Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
if len(high_temp_samples) >= 18:
# Temperature has been >80°C for >3 minutes
status = 'WARNING' status = 'WARNING'
reason = f'CPU temperature {max_temp}°C >80°C sustained >3min' reason = f'CPU temperature {max_temp}°C still elevated'
else:
# Record non-dismissable error # No active warning yet
health_persistence.record_error(
error_key='cpu_temperature',
category='temperature',
severity='WARNING',
reason=reason,
details={'temperature': max_temp, 'dismissable': False}
)
elif len(recovery_samples) >= 3:
# Temperature has been ≤80°C for 30 seconds - clear the error
status = 'OK' status = 'OK'
reason = None reason = None
health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
else: temp_result = {
# Temperature is elevated but not long enough, or recovering but not yet cleared 'status': status,
# Check if we already have an active error 'value': round(max_temp, 1),
if health_persistence.is_error_active('cpu_temperature', category='temperature'): 'unit': '°C'
# Keep the warning active }
status = 'WARNING' if reason:
reason = f'CPU temperature {max_temp}°C still elevated' temp_result['reason'] = reason
else:
# No active warning yet self.cached_results[cache_key] = temp_result
status = 'OK' self.last_check_times[cache_key] = current_time
reason = None return temp_result
temp_result = {
'status': status,
'value': round(max_temp, 1),
'unit': '°C'
}
if reason:
temp_result['reason'] = reason
self.cached_results[cache_key] = temp_result
self.last_check_times[cache_key] = current_time
return temp_result
return None return None

View File

@@ -967,23 +967,21 @@ class HealthPersistence:
cutoff_events = (now - timedelta(days=30)).isoformat() cutoff_events = (now - timedelta(days=30)).isoformat()
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,)) cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
# ── Auto-resolve transient log errors after system reboot ── # ── Auto-resolve transient errors after system stabilizes ──
# OOM, service failures, timeouts are transient - a reboot resolves them. # Transient errors (OOM, high CPU, service failures) resolve themselves.
# If the system has been up for >1 hour and these errors haven't recurred, # If the system has been up for >10 minutes and these errors haven't recurred,
# they are from a previous boot and should be auto-resolved. # they are stale and should be auto-resolved.
#
# Logic: If uptime > 1 hour AND error.last_seen is not within the last 30 minutes,
# the error is stale (from before the current stable state) and should be resolved.
try: try:
import psutil
# Get system uptime # Get system uptime
with open('/proc/uptime', 'r') as f: with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.read().split()[0]) uptime_seconds = float(f.read().split()[0])
# Only auto-resolve if system has been stable for at least 1 hour # Only auto-resolve if system has been stable for at least 10 minutes
if uptime_seconds > 3600: # 1 hour if uptime_seconds > 600: # 10 minutes
# Resolve transient log errors that haven't been seen in the last 30 minutes stale_cutoff = (now - timedelta(minutes=10)).isoformat()
# If they were real current issues, journalctl -b 0 would have detected them recently
stale_cutoff = (now - timedelta(minutes=30)).isoformat() # 1. Resolve transient log errors (OOM, service failures)
cursor.execute(''' cursor.execute('''
UPDATE errors UPDATE errors
SET resolved_at = ? SET resolved_at = ?
@@ -999,6 +997,42 @@ class HealthPersistence:
OR reason LIKE '%timeout%' OR reason LIKE '%timeout%'
OR reason LIKE '%critical error%') OR reason LIKE '%critical error%')
''', (now_iso, stale_cutoff)) ''', (now_iso, stale_cutoff))
# 2. Auto-resolve CPU errors if current CPU is normal (<75%)
try:
current_cpu = psutil.cpu_percent(interval=0.1)
if current_cpu < 75:
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE category = 'temperature'
AND resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ?
AND (error_key = 'cpu_usage'
OR reason LIKE '%CPU >%sustained%'
OR reason LIKE '%Sustained high CPU%')
''', (now_iso, stale_cutoff))
except Exception:
pass
# 3. Auto-resolve memory errors if current memory is normal (<80%)
try:
current_mem = psutil.virtual_memory().percent
if current_mem < 80:
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE category = 'memory'
AND resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ?
AND (reason LIKE '%Memory >%'
OR reason LIKE '%RAM usage%')
''', (now_iso, stale_cutoff))
except Exception:
pass
except Exception: except Exception:
pass # If we can't read uptime, skip this cleanup pass # If we can't read uptime, skip this cleanup