diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index 5fb70149..231687ca 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -150,7 +150,7 @@ class HealthMonitor: r'zfs.*scrub (started|finished|in progress)', r'zpool.*resilver', - # ── LXC/Container normal operations ── + # ── LXC/Container normal operations ── r'lxc.*monitor', r'systemd\[1\]: (started|stopped) .*\.scope', @@ -837,90 +837,95 @@ class HealthMonitor: return self.cached_results.get(cache_key) try: - # Use shared journalctl cache to avoid duplicate calls - journalctl_output = self._get_journalctl_10min_warnings() + # Read temperature directly from sensors command (not journalctl) + result = subprocess.run( + ['sensors', '-u'], + capture_output=True, text=True, timeout=3 + ) - if journalctl_output: - temps = [] - for line in journalctl_output.split('\n'): - if 'temp' in line.lower() and '_input' in line: + temps = [] + if result.returncode == 0 and result.stdout: + for line in result.stdout.split('\n'): + # Look for temperature input lines like "temp1_input: 42.000" + if '_input' in line and 'temp' in line.lower(): try: temp = float(line.split(':')[1].strip()) - temps.append(temp) + if 0 < temp < 150: # Sanity check for valid temp range + temps.append(temp) except: continue + + if temps: + max_temp = max(temps) - if temps: - max_temp = max(temps) + state_key = 'cpu_temp_history' + # Add this reading (supplements the sampler thread) + self.state_history[state_key].append({ + 'value': max_temp, + 'time': current_time + }) + + # Snapshot for thread-safe reading, then atomic prune + temp_snapshot = list(self.state_history[state_key]) + self.state_history[state_key] = [ + entry for entry in temp_snapshot + if current_time - entry['time'] < 240 + ] + + # Check if temperature >80°C for more than 3 minutes (180 seconds) + high_temp_samples = [ + entry for entry in self.state_history[state_key] + if entry['value'] > 80 and 
current_time - entry['time'] <= 180 + ] + + # Check if temperature ≤80°C for last 30 seconds (recovery) + recovery_samples = [ + entry for entry in self.state_history[state_key] + if entry['value'] <= 80 and current_time - entry['time'] <= 30 + ] + + # Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert + if len(high_temp_samples) >= 18: + # Temperature has been >80°C for >3 minutes + status = 'WARNING' + reason = f'CPU temperature {max_temp}°C >80°C sustained >3min' - state_key = 'cpu_temp_history' - # Add this reading (supplements the sampler thread) - self.state_history[state_key].append({ - 'value': max_temp, - 'time': current_time - }) - - # Snapshot for thread-safe reading, then atomic prune - temp_snapshot = list(self.state_history[state_key]) - self.state_history[state_key] = [ - entry for entry in temp_snapshot - if current_time - entry['time'] < 240 - ] - - # Check if temperature >80°C for more than 3 minutes (180 seconds) - high_temp_samples = [ - entry for entry in self.state_history[state_key] - if entry['value'] > 80 and current_time - entry['time'] <= 180 - ] - - # Check if temperature ≤80°C for last 30 seconds (recovery) - recovery_samples = [ - entry for entry in self.state_history[state_key] - if entry['value'] <= 80 and current_time - entry['time'] <= 30 - ] - - # Require at least 18 samples over 3 minutes (one every 10 seconds) to trigger alert - if len(high_temp_samples) >= 18: - # Temperature has been >80°C for >3 minutes + # Record non-dismissable error + health_persistence.record_error( + error_key='cpu_temperature', + category='temperature', + severity='WARNING', + reason=reason, + details={'temperature': max_temp, 'dismissable': False} + ) + elif len(recovery_samples) >= 3: + # Temperature has been ≤80°C for 30 seconds - clear the error + status = 'OK' + reason = None + health_persistence.resolve_error('cpu_temperature', 'Temperature recovered') + else: + # Temperature is elevated but not long enough, or 
recovering but not yet cleared + # Check if we already have an active error + if health_persistence.is_error_active('cpu_temperature', category='temperature'): + # Keep the warning active status = 'WARNING' - reason = f'CPU temperature {max_temp}°C >80°C sustained >3min' - - # Record non-dismissable error - health_persistence.record_error( - error_key='cpu_temperature', - category='temperature', - severity='WARNING', - reason=reason, - details={'temperature': max_temp, 'dismissable': False} - ) - elif len(recovery_samples) >= 3: - # Temperature has been ≤80°C for 30 seconds - clear the error + reason = f'CPU temperature {max_temp}°C still elevated' + else: + # No active warning yet status = 'OK' reason = None - health_persistence.resolve_error('cpu_temperature', 'Temperature recovered') - else: - # Temperature is elevated but not long enough, or recovering but not yet cleared - # Check if we already have an active error - if health_persistence.is_error_active('cpu_temperature', category='temperature'): - # Keep the warning active - status = 'WARNING' - reason = f'CPU temperature {max_temp}°C still elevated' - else: - # No active warning yet - status = 'OK' - reason = None - - temp_result = { - 'status': status, - 'value': round(max_temp, 1), - 'unit': '°C' - } - if reason: - temp_result['reason'] = reason - - self.cached_results[cache_key] = temp_result - self.last_check_times[cache_key] = current_time - return temp_result + + temp_result = { + 'status': status, + 'value': round(max_temp, 1), + 'unit': '°C' + } + if reason: + temp_result['reason'] = reason + + self.cached_results[cache_key] = temp_result + self.last_check_times[cache_key] = current_time + return temp_result return None diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index ab8fa1f5..c95db4bf 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -967,23 +967,21 @@ class HealthPersistence: cutoff_events = (now - 
timedelta(days=30)).isoformat() cursor.execute('DELETE FROM events WHERE timestamp < ?', (cutoff_events,)) - # ── Auto-resolve transient log errors after system reboot ── - # OOM, service failures, timeouts are transient - a reboot resolves them. - # If the system has been up for >1 hour and these errors haven't recurred, - # they are from a previous boot and should be auto-resolved. - # - # Logic: If uptime > 1 hour AND error.last_seen is not within the last 30 minutes, - # the error is stale (from before the current stable state) and should be resolved. + # ── Auto-resolve transient errors after system stabilizes ── + # Transient errors (OOM, high CPU, service failures) resolve themselves. + # If the system has been up for >10 minutes and these errors haven't recurred, + # they are stale and should be auto-resolved. try: + import psutil # Get system uptime with open('/proc/uptime', 'r') as f: uptime_seconds = float(f.read().split()[0]) - # Only auto-resolve if system has been stable for at least 1 hour - if uptime_seconds > 3600: # 1 hour - # Resolve transient log errors that haven't been seen in the last 30 minutes - # If they were real current issues, journalctl -b 0 would have detected them recently - stale_cutoff = (now - timedelta(minutes=30)).isoformat() + # Only auto-resolve if system has been stable for at least 10 minutes + if uptime_seconds > 600: # 10 minutes + stale_cutoff = (now - timedelta(minutes=10)).isoformat() + + # 1. Resolve transient log errors (OOM, service failures) cursor.execute(''' UPDATE errors SET resolved_at = ? @@ -999,6 +997,42 @@ class HealthPersistence: OR reason LIKE '%timeout%' OR reason LIKE '%critical error%') ''', (now_iso, stale_cutoff)) + + # 2. Auto-resolve CPU errors if current CPU is normal (<75%) + try: + current_cpu = psutil.cpu_percent(interval=0.1) + if current_cpu < 75: + cursor.execute(''' + UPDATE errors + SET resolved_at = ? 
+ WHERE category = 'temperature' + AND resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND (error_key = 'cpu_usage' + OR reason LIKE '%CPU >%sustained%' + OR reason LIKE '%Sustained high CPU%') + ''', (now_iso, stale_cutoff)) + except Exception: + pass + + # 3. Auto-resolve memory errors if current memory is normal (<80%) + try: + current_mem = psutil.virtual_memory().percent + if current_mem < 80: + cursor.execute(''' + UPDATE errors + SET resolved_at = ? + WHERE category = 'memory' + AND resolved_at IS NULL + AND acknowledged = 0 + AND last_seen < ? + AND (reason LIKE '%Memory >%' + OR reason LIKE '%RAM usage%') + ''', (now_iso, stale_cutoff)) + except Exception: + pass + except Exception: pass # If we can't read uptime, skip this cleanup