mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-05 20:03:48 +00:00
update health_persistence.py
This commit is contained in:
@@ -279,9 +279,10 @@ class HealthMonitor:
|
||||
return cache['output']
|
||||
|
||||
# Execute journalctl and cache result
|
||||
# Use -b 0 to only include logs from the current boot
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
|
||||
['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=20
|
||||
@@ -311,9 +312,10 @@ class HealthMonitor:
|
||||
return cache['output']
|
||||
|
||||
# Execute journalctl and cache result
|
||||
# Use -b 0 to only include logs from the current boot
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
|
||||
['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
|
||||
'--output=short-precise'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -3284,16 +3286,19 @@ class HealthMonitor:
|
||||
|
||||
try:
|
||||
# Fetch logs from the last 3 minutes for immediate issue detection
|
||||
# Use -b 0 to only include logs from the CURRENT boot (not previous boots)
|
||||
# This prevents OOM/crash errors from before a reboot from persisting
|
||||
result_recent = subprocess.run(
|
||||
['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
|
||||
['journalctl', '-b', '0', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=20
|
||||
)
|
||||
|
||||
# Fetch logs from the previous 3-minute interval to detect spikes/cascades
|
||||
# Also limited to current boot only
|
||||
result_previous = subprocess.run(
|
||||
['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
|
||||
['journalctl', '-b', '0', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=20
|
||||
|
||||
@@ -967,35 +967,40 @@ class HealthPersistence:
|
||||
cutoff_events = (now - timedelta(days=30)).isoformat()
|
||||
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
|
||||
|
||||
# ── Auto-resolve log errors that occurred before the last system reboot ──
|
||||
# After a reboot, transient errors like OOM, service failures, etc. are resolved.
|
||||
# Only resolve log errors (not disk errors which may persist across reboots).
|
||||
# ── Auto-resolve transient log errors after system reboot ──
|
||||
# OOM, service failures, timeouts are transient - a reboot resolves them.
|
||||
# If the system has been up for >1 hour and these errors haven't recurred,
|
||||
# they are presumed to be from a previous boot (or otherwise stale) and are auto-resolved.
|
||||
#
|
||||
# Logic: If uptime > 1 hour AND error.last_seen is not within the last 30 minutes,
|
||||
# the error is stale (from before the current stable state) and should be resolved.
|
||||
try:
|
||||
import os
|
||||
# Get system boot time from /proc/stat
|
||||
with open('/proc/stat', 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith('btime '):
|
||||
boot_timestamp = int(line.split()[1])
|
||||
boot_time = datetime.fromtimestamp(boot_timestamp)
|
||||
# Resolve log errors that were last seen BEFORE the boot time
|
||||
# These are transient errors (OOM, service crashes) that a reboot fixes
|
||||
boot_time_iso = boot_time.isoformat()
|
||||
cursor.execute('''
|
||||
UPDATE errors
|
||||
SET resolved_at = ?
|
||||
WHERE category = 'logs'
|
||||
AND resolved_at IS NULL
|
||||
AND acknowledged = 0
|
||||
AND last_seen < ?
|
||||
AND (error_key LIKE 'log_critical_%'
|
||||
OR reason LIKE '%Out of memory%'
|
||||
OR reason LIKE '%service%Failed%'
|
||||
OR reason LIKE '%timeout%')
|
||||
''', (now_iso, boot_time_iso))
|
||||
break
|
||||
# Get system uptime
|
||||
with open('/proc/uptime', 'r') as f:
|
||||
uptime_seconds = float(f.read().split()[0])
|
||||
|
||||
# Only auto-resolve if the system has been up for more than 1 hour
|
||||
if uptime_seconds > 3600: # 1 hour
|
||||
# Resolve transient log errors that haven't been seen in the last 30 minutes
|
||||
# If they were real current issues, journalctl -b 0 would have detected them recently
|
||||
stale_cutoff = (now - timedelta(minutes=30)).isoformat()
|
||||
cursor.execute('''
|
||||
UPDATE errors
|
||||
SET resolved_at = ?
|
||||
WHERE category = 'logs'
|
||||
AND resolved_at IS NULL
|
||||
AND acknowledged = 0
|
||||
AND last_seen < ?
|
||||
AND (error_key LIKE 'log_critical_%'
|
||||
OR error_key LIKE 'log_persistent_%'
|
||||
OR reason LIKE '%Out of memory%'
|
||||
OR reason LIKE '%Recurring error%'
|
||||
OR reason LIKE '%service%Failed%'
|
||||
OR reason LIKE '%timeout%'
|
||||
OR reason LIKE '%critical error%')
|
||||
''', (now_iso, stale_cutoff))
|
||||
except Exception:
|
||||
pass # If we can't read boot time, skip this cleanup
|
||||
pass # If we can't read uptime, skip this cleanup
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
Reference in New Issue
Block a user