update health_monitor.py

This commit is contained in:
MacRimi
2026-04-05 12:17:42 +02:00
parent 4c72d0b3ef
commit 95e876b37f
2 changed files with 126 additions and 87 deletions

View File

@@ -150,7 +150,7 @@ class HealthMonitor:
r'zfs.*scrub (started|finished|in progress)',
r'zpool.*resilver',
# ─ LXC/Container normal operations ──
# ── LXC/Container normal operations ──
r'lxc.*monitor',
r'systemd\[1\]: (started|stopped) .*\.scope',
@@ -837,90 +837,95 @@ class HealthMonitor:
return self.cached_results.get(cache_key)
try:
# Use shared journalctl cache to avoid duplicate calls
journalctl_output = self._get_journalctl_10min_warnings()
# Read temperature directly from sensors command (not journalctl)
result = subprocess.run(
['sensors', '-u'],
capture_output=True, text=True, timeout=3
)
if journalctl_output:
temps = []
for line in journalctl_output.split('\n'):
if 'temp' in line.lower() and '_input' in line:
temps = []
if result.returncode == 0 and result.stdout:
for line in result.stdout.split('\n'):
# Look for temperature input lines like "temp1_input: 42.000"
if '_input' in line and 'temp' in line.lower():
try:
temp = float(line.split(':')[1].strip())
temps.append(temp)
if 0 < temp < 150: # Sanity check for valid temp range
temps.append(temp)
except:
continue
if temps:
max_temp = max(temps)
if temps:
max_temp = max(temps)
state_key = 'cpu_temp_history'
# Add this reading (supplements the sampler thread)
self.state_history[state_key].append({
'value': max_temp,
'time': current_time
})
# Snapshot for thread-safe reading, then atomic prune
temp_snapshot = list(self.state_history[state_key])
self.state_history[state_key] = [
entry for entry in temp_snapshot
if current_time - entry['time'] < 240
]
# Check if temperature >80°C for more than 3 minutes (180 seconds)
high_temp_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] > 80 and current_time - entry['time'] <= 180
]
# Check if temperature ≤80°C for last 30 seconds (recovery)
recovery_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] <= 80 and current_time - entry['time'] <= 30
]
# Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
if len(high_temp_samples) >= 18:
# Temperature has been >80°C for >3 minutes
status = 'WARNING'
reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
state_key = 'cpu_temp_history'
# Add this reading (supplements the sampler thread)
self.state_history[state_key].append({
'value': max_temp,
'time': current_time
})
# Snapshot for thread-safe reading, then atomic prune
temp_snapshot = list(self.state_history[state_key])
self.state_history[state_key] = [
entry for entry in temp_snapshot
if current_time - entry['time'] < 240
]
# Check if temperature >80°C for more than 3 minutes (180 seconds)
high_temp_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] > 80 and current_time - entry['time'] <= 180
]
# Check if temperature ≤80°C for last 30 seconds (recovery)
recovery_samples = [
entry for entry in self.state_history[state_key]
if entry['value'] <= 80 and current_time - entry['time'] <= 30
]
# Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert
if len(high_temp_samples) >= 18:
# Temperature has been >80°C for >3 minutes
# Record non-dismissable error
health_persistence.record_error(
error_key='cpu_temperature',
category='temperature',
severity='WARNING',
reason=reason,
details={'temperature': max_temp, 'dismissable': False}
)
elif len(recovery_samples) >= 3:
# Temperature has been ≤80°C for 30 seconds - clear the error
status = 'OK'
reason = None
health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
else:
# Temperature is elevated but not long enough, or recovering but not yet cleared
# Check if we already have an active error
if health_persistence.is_error_active('cpu_temperature', category='temperature'):
# Keep the warning active
status = 'WARNING'
reason = f'CPU temperature {max_temp}°C >80°C sustained >3min'
# Record non-dismissable error
health_persistence.record_error(
error_key='cpu_temperature',
category='temperature',
severity='WARNING',
reason=reason,
details={'temperature': max_temp, 'dismissable': False}
)
elif len(recovery_samples) >= 3:
# Temperature has been ≤80°C for 30 seconds - clear the error
reason = f'CPU temperature {max_temp}°C still elevated'
else:
# No active warning yet
status = 'OK'
reason = None
health_persistence.resolve_error('cpu_temperature', 'Temperature recovered')
else:
# Temperature is elevated but not long enough, or recovering but not yet cleared
# Check if we already have an active error
if health_persistence.is_error_active('cpu_temperature', category='temperature'):
# Keep the warning active
status = 'WARNING'
reason = f'CPU temperature {max_temp}°C still elevated'
else:
# No active warning yet
status = 'OK'
reason = None
temp_result = {
'status': status,
'value': round(max_temp, 1),
'unit': '°C'
}
if reason:
temp_result['reason'] = reason
self.cached_results[cache_key] = temp_result
self.last_check_times[cache_key] = current_time
return temp_result
temp_result = {
'status': status,
'value': round(max_temp, 1),
'unit': '°C'
}
if reason:
temp_result['reason'] = reason
self.cached_results[cache_key] = temp_result
self.last_check_times[cache_key] = current_time
return temp_result
return None

View File

@@ -967,23 +967,21 @@ class HealthPersistence:
cutoff_events = (now - timedelta(days=30)).isoformat()
cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,))
# ── Auto-resolve transient log errors after system reboot ──
# OOM, service failures, timeouts are transient - a reboot resolves them.
# If the system has been up for >1 hour and these errors haven't recurred,
# they are from a previous boot and should be auto-resolved.
#
# Logic: If uptime > 1 hour AND error.last_seen is not within the last 30 minutes,
# the error is stale (from before the current stable state) and should be resolved.
# ── Auto-resolve transient errors after system stabilizes ──
# Transient errors (OOM, high CPU, service failures) resolve themselves.
# If the system has been up for >10 minutes and these errors haven't recurred,
# they are stale and should be auto-resolved.
try:
import psutil
# Get system uptime
with open('/proc/uptime', 'r') as f:
uptime_seconds = float(f.read().split()[0])
# Only auto-resolve if system has been stable for at least 1 hour
if uptime_seconds > 3600: # 1 hour
# Resolve transient log errors that haven't been seen in the last 30 minutes
# If they were real current issues, journalctl -b 0 would have detected them recently
stale_cutoff = (now - timedelta(minutes=30)).isoformat()
# Only auto-resolve if system has been stable for at least 10 minutes
if uptime_seconds > 600: # 10 minutes
stale_cutoff = (now - timedelta(minutes=10)).isoformat()
# 1. Resolve transient log errors (OOM, service failures)
cursor.execute('''
UPDATE errors
SET resolved_at = ?
@@ -999,6 +997,42 @@ class HealthPersistence:
OR reason LIKE '%timeout%'
OR reason LIKE '%critical error%')
''', (now_iso, stale_cutoff))
# 2. Auto-resolve CPU errors if current CPU is normal (<75%)
try:
current_cpu = psutil.cpu_percent(interval=0.1)
if current_cpu < 75:
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE category = 'temperature'
AND resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ?
AND (error_key = 'cpu_usage'
OR reason LIKE '%CPU >%sustained%'
OR reason LIKE '%Sustained high CPU%')
''', (now_iso, stale_cutoff))
except Exception:
pass
# 3. Auto-resolve memory errors if current memory is normal (<80%)
try:
current_mem = psutil.virtual_memory().percent
if current_mem < 80:
cursor.execute('''
UPDATE errors
SET resolved_at = ?
WHERE category = 'memory'
AND resolved_at IS NULL
AND acknowledged = 0
AND last_seen < ?
AND (reason LIKE '%Memory >%'
OR reason LIKE '%RAM usage%')
''', (now_iso, stale_cutoff))
except Exception:
pass
except Exception:
pass # If we can't read uptime, skip this cleanup