update health_persistence.py

This commit is contained in:
MacRimi
2026-04-04 01:31:37 +02:00
parent ce69c0ba1f
commit e0e732dd2c
2 changed files with 41 additions and 31 deletions
+9 -4
View File
@@ -279,9 +279,10 @@ class HealthMonitor:
return cache['output'] return cache['output']
# Execute journalctl and cache result # Execute journalctl and cache result
# Use -b 0 to only include logs from the current boot
try: try:
result = subprocess.run( result = subprocess.run(
['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'], ['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=20 timeout=20
@@ -311,9 +312,10 @@ class HealthMonitor:
return cache['output'] return cache['output']
# Execute journalctl and cache result # Execute journalctl and cache result
# Use -b 0 to only include logs from the current boot
try: try:
result = subprocess.run( result = subprocess.run(
['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning', ['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
'--output=short-precise'], '--output=short-precise'],
capture_output=True, capture_output=True,
text=True, text=True,
@@ -3284,16 +3286,19 @@ class HealthMonitor:
try: try:
# Fetch logs from the last 3 minutes for immediate issue detection # Fetch logs from the last 3 minutes for immediate issue detection
# Use -b 0 to only include logs from the CURRENT boot (not previous boots)
# This prevents OOM/crash errors from before a reboot from persisting
result_recent = subprocess.run( result_recent = subprocess.run(
['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'], ['journalctl', '-b', '0', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=20 timeout=20
) )
# Fetch logs from the previous 3-minute interval to detect spikes/cascades # Fetch logs from the previous 3-minute interval to detect spikes/cascades
# Also limited to current boot only
result_previous = subprocess.run( result_previous = subprocess.run(
['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'], ['journalctl', '-b', '0', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=20 timeout=20
+22 -17
View File
@@ -967,20 +967,23 @@ class HealthPersistence:
cutoff_events = (now - timedelta(days=30)).isoformat() cutoff_events = (now - timedelta(days=30)).isoformat()
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,)) cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
# ── Auto-resolve log errors that occurred before the last system reboot ── # ── Auto-resolve transient log errors after system reboot ──
# After a reboot, transient errors like OOM, service failures, etc. are resolved. # OOM, service failures, timeouts are transient - a reboot resolves them.
# Only resolve log errors (not disk errors which may persist across reboots). # If the system has been up for >1 hour and these errors haven't recurred,
# they are treated as stale (left over from a previous boot, or no longer recurring in the current one) and are auto-resolved.
#
# Logic: If uptime > 1 hour AND error.last_seen is not within the last 30 minutes,
# the error is stale (from before the current stable state) and should be resolved.
try: try:
import os # Get system uptime
# Get system boot time from /proc/stat with open('/proc/uptime', 'r') as f:
with open('/proc/stat', 'r') as f: uptime_seconds = float(f.read().split()[0])
for line in f:
if line.startswith('btime '): # Only auto-resolve if system has been stable for at least 1 hour
boot_timestamp = int(line.split()[1]) if uptime_seconds > 3600: # 1 hour
boot_time = datetime.fromtimestamp(boot_timestamp) # Resolve transient log errors that haven't been seen in the last 30 minutes
# Resolve log errors that were last seen BEFORE the boot time # If they were real current issues, journalctl -b 0 would have detected them recently
# These are transient errors (OOM, service crashes) that a reboot fixes stale_cutoff = (now - timedelta(minutes=30)).isoformat()
boot_time_iso = boot_time.isoformat()
cursor.execute(''' cursor.execute('''
UPDATE errors UPDATE errors
SET resolved_at = ? SET resolved_at = ?
@@ -989,13 +992,15 @@ class HealthPersistence:
AND acknowledged = 0 AND acknowledged = 0
AND last_seen < ? AND last_seen < ?
AND (error_key LIKE 'log_critical_%' AND (error_key LIKE 'log_critical_%'
OR error_key LIKE 'log_persistent_%'
OR reason LIKE '%Out of memory%' OR reason LIKE '%Out of memory%'
OR reason LIKE '%Recurring error%'
OR reason LIKE '%service%Failed%' OR reason LIKE '%service%Failed%'
OR reason LIKE '%timeout%') OR reason LIKE '%timeout%'
''', (now_iso, boot_time_iso)) OR reason LIKE '%critical error%')
break ''', (now_iso, stale_cutoff))
except Exception: except Exception:
pass # If we can't read boot time, skip this cleanup pass # If we can't read uptime, skip this cleanup
conn.commit() conn.commit()
conn.close() conn.close()