update health_persistence.py

2026-05-21 08:05:03 +00:00 · 2026-04-04 01:31:37 +02:00
parent ce69c0ba1f
commit e0e732dd2c
2 changed files with 41 additions and 31 deletions
@@ -279,9 +279,10 @@ class HealthMonitor:
            return cache['output']
        
        # Execute journalctl and cache result
+        # Use -b 0 to only include logs from the current boot
        try:
            result = subprocess.run(
-                ['journalctl', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
+                ['journalctl', '-b', '0', '--since', '10 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=20
@@ -311,9 +312,10 @@ class HealthMonitor:
            return cache['output']
        
        # Execute journalctl and cache result
+        # Use -b 0 to only include logs from the current boot
        try:
            result = subprocess.run(
-                ['journalctl', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
+                ['journalctl', '-b', '0', '--since', '1 hour ago', '--no-pager', '-p', 'warning',
                 '--output=short-precise'],
                capture_output=True,
                text=True,
@@ -3284,16 +3286,19 @@ class HealthMonitor:
        
        try:
            # Fetch logs from the last 3 minutes for immediate issue detection
+            # Use -b 0 to only include logs from the CURRENT boot (not previous boots)
+            # This prevents OOM/crash errors from before a reboot from persisting
            result_recent = subprocess.run(
-                ['journalctl', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
+                ['journalctl', '-b', '0', '--since', '3 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=20
            )
            
            # Fetch logs from the previous 3-minute interval to detect spikes/cascades
+            # Also limited to current boot only
            result_previous = subprocess.run(
-                ['journalctl', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
+                ['journalctl', '-b', '0', '--since', '6 minutes ago', '--until', '3 minutes ago', '--no-pager', '-p', 'warning'],
                capture_output=True,
                text=True,
                timeout=20
@@ -967,35 +967,40 @@ class HealthPersistence:
        cutoff_events = (now - timedelta(days=30)).isoformat()
        cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
        
-        # ── Auto-resolve log errors that occurred before the last system reboot ──
-        # After a reboot, transient errors like OOM, service failures, etc. are resolved.
-        # Only resolve log errors (not disk errors which may persist across reboots).
+        # ── Auto-resolve transient log errors after system reboot ──
+        # OOM, service failures, timeouts are transient - a reboot resolves them.
+        # If the system has been up for >1 hour and these errors haven't recurred,
+        # they are from a previous boot and should be auto-resolved.
+        # 
+        # Logic: If uptime > 1 hour AND error.last_seen is not within the last 30 minutes,
+        # the error is stale (from before the current stable state) and should be resolved.
        try:
-            import os
-            # Get system boot time from /proc/stat
-            with open('/proc/stat', 'r') as f:
-                for line in f:
-                    if line.startswith('btime '):
-                        boot_timestamp = int(line.split()[1])
-                        boot_time = datetime.fromtimestamp(boot_timestamp)
-                        # Resolve log errors that were last seen BEFORE the boot time
-                        # These are transient errors (OOM, service crashes) that a reboot fixes
-                        boot_time_iso = boot_time.isoformat()
-                        cursor.execute('''
-                            UPDATE errors 
-                            SET resolved_at = ?
-                            WHERE category = 'logs'
-                              AND resolved_at IS NULL 
-                              AND acknowledged = 0
-                              AND last_seen < ?
-                              AND (error_key LIKE 'log_critical_%' 
-                                   OR reason LIKE '%Out of memory%'
-                                   OR reason LIKE '%service%Failed%'
-                                   OR reason LIKE '%timeout%')
-                        ''', (now_iso, boot_time_iso))
-                        break
+            # Get system uptime
+            with open('/proc/uptime', 'r') as f:
+                uptime_seconds = float(f.read().split()[0])
+            
+            # Only auto-resolve if system has been stable for at least 1 hour
+            if uptime_seconds > 3600:  # 1 hour
+                # Resolve transient log errors that haven't been seen in the last 30 minutes
+                # If they were real current issues, journalctl -b 0 would have detected them recently
+                stale_cutoff = (now - timedelta(minutes=30)).isoformat()
+                cursor.execute('''
+                    UPDATE errors 
+                    SET resolved_at = ?
+                    WHERE category = 'logs'
+                      AND resolved_at IS NULL 
+                      AND acknowledged = 0
+                      AND last_seen < ?
+                      AND (error_key LIKE 'log_critical_%' 
+                           OR error_key LIKE 'log_persistent_%'
+                           OR reason LIKE '%Out of memory%'
+                           OR reason LIKE '%Recurring error%'
+                           OR reason LIKE '%service%Failed%'
+                           OR reason LIKE '%timeout%'
+                           OR reason LIKE '%critical error%')
+                ''', (now_iso, stale_cutoff))
        except Exception:
-            pass  # If we can't read boot time, skip this cleanup
+            pass  # If we can't read uptime, skip this cleanup
        
        conn.commit()
        conn.close()