diff --git a/AppImage/components/health-status-modal.tsx b/AppImage/components/health-status-modal.tsx index a05efa98..93018fc1 100644 --- a/AppImage/components/health-status-modal.tsx +++ b/AppImage/components/health-status-modal.tsx @@ -214,7 +214,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu const refreshInterval = setInterval(fetchHealthDetails, 300000) return () => clearInterval(refreshInterval) } - }, [open]) + }, [open, fetchHealthDetails]) // Auto-expand non-OK categories when data loads useEffect(() => { @@ -506,13 +506,13 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu size="sm" variant="outline" className="h-5 px-1 sm:px-1.5 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent text-[10px]" - disabled={dismissingKey === checkKey} + disabled={dismissingKey === (checkData.error_key || checkKey)} onClick={(e) => { e.stopPropagation() handleAcknowledge(checkData.error_key || checkKey, e) }} > - {dismissingKey === checkKey ? ( + {dismissingKey === (checkData.error_key || checkKey) ? ( ) : ( <> diff --git a/AppImage/scripts/health_monitor.py b/AppImage/scripts/health_monitor.py index be7189d4..b939d8f4 100644 --- a/AppImage/scripts/health_monitor.py +++ b/AppImage/scripts/health_monitor.py @@ -189,6 +189,20 @@ class HealthMonitor: # PVE Critical Services PVE_SERVICES = ['pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster'] + # P2 fix: Pre-compiled regex patterns for performance (avoid re-compiling on every line) + _BENIGN_RE = None + _CRITICAL_RE = None + _WARNING_RE = None + + @classmethod + def _get_compiled_patterns(cls): + """Lazily compile regex patterns once""" + if cls._BENIGN_RE is None: + cls._BENIGN_RE = re.compile("|".join(cls.BENIGN_ERROR_PATTERNS), re.IGNORECASE) + cls._CRITICAL_RE = re.compile("|".join(cls.CRITICAL_LOG_KEYWORDS), re.IGNORECASE) + cls._WARNING_RE = re.compile("|".join(cls.WARNING_LOG_KEYWORDS), re.IGNORECASE) + return cls._BENIGN_RE, cls._CRITICAL_RE, cls._WARNING_RE + def __init__(self): """Initialize health monitor with state tracking""" self.state_history = defaultdict(list) @@ -199,6 +213,7 @@ class HealthMonitor: self.failed_vm_history = set() # Track VMs that failed to start self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0}) self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category + self._last_cleanup_time = 0 # Throttle cleanup_old_errors calls # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5) # SMART detection still uses filesystem check on init (lightweight) @@ -380,12 +395,15 @@ class HealthMonitor: Returns JSON structure with ALL 10 categories always present. Now includes persistent error tracking. """ - # Run cleanup on every status check so stale errors are auto-resolved + # Run cleanup with throttle (every 5 min) so stale errors are auto-resolved # using the user-configured Suppression Duration (single source of truth). - try: - health_persistence.cleanup_old_errors() - except Exception: - pass + current_time = time.time() + if current_time - self._last_cleanup_time > 300: # 5 minutes + try: + health_persistence.cleanup_old_errors() + self._last_cleanup_time = current_time + except Exception: + pass active_errors = health_persistence.get_active_errors() # No need to create persistent_issues dict here, it's implicitly handled by the checks @@ -1319,7 +1337,7 @@ class HealthMonitor: 'device': f'/dev/{device_name}', 'serial': serial, 'model': model, - 'error_key': error_info.get('error_key') or f'disk_{device_name}', + 'error_key': error_info.get('error_key') or f'disk_smart_{device_name}', 'dismissable': error_info.get('dismissable', True), 'is_disk_entry': True, } @@ -2843,12 +2861,9 @@ class HealthMonitor: } def _is_benign_error(self, line: str) -> bool: - """Check if log line matches benign error patterns""" - line_lower = line.lower() - for pattern in self.BENIGN_ERROR_PATTERNS: - if re.search(pattern, line_lower): - return True - return False + """Check if log line matches benign error patterns (uses pre-compiled regex)""" + benign_re, _, _ = self._get_compiled_patterns() + return bool(benign_re.search(line.lower())) def _enrich_critical_log_reason(self, line: str) -> str: """ @@ -2969,7 +2984,7 @@ class HealthMonitor: # Generic classification -- very conservative to avoid false positives. # Only escalate if the line explicitly uses severity-level keywords # from the kernel or systemd (not just any line containing "error"). - if 'kernel panic' in line_lower or 'fatal' in line_lower and 'non-fatal' not in line_lower: + if 'kernel panic' in line_lower or ('fatal' in line_lower and 'non-fatal' not in line_lower): return 'CRITICAL' # Lines from priority "err" that don't match any keyword above are @@ -3164,7 +3179,7 @@ class HealthMonitor: device_name=base_device, serial=obs_serial, error_type='filesystem_error', - error_signature=f'fs_error_{fs_device}_{pattern_key}', + error_signature=f'fs_error_{fs_device}_{pattern_hash}', raw_message=enriched_reason[:500], severity=fs_severity.lower(), ) @@ -3253,12 +3268,25 @@ class HealthMonitor: 'dismissable': True, 'occurrences': data['count']} ) - patterns_to_remove = [ - p for p, data in self.persistent_log_patterns.items() - if current_time - data['last_seen'] > 1800 - ] - for pattern in patterns_to_remove: - del self.persistent_log_patterns[pattern] + patterns_to_remove = [ + p for p, data in self.persistent_log_patterns.items() + if current_time - data['last_seen'] > 1800 + ] + for pattern in patterns_to_remove: + del self.persistent_log_patterns[pattern] + + # B5 fix: Cap size to prevent unbounded memory growth under high error load + MAX_LOG_PATTERNS = 500 + if len(self.persistent_log_patterns) > MAX_LOG_PATTERNS: + sorted_patterns = sorted( + self.persistent_log_patterns.items(), + key=lambda x: x[1]['last_seen'], + reverse=True + ) + self.persistent_log_patterns = defaultdict( + lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0}, + dict(sorted_patterns[:MAX_LOG_PATTERNS]) + ) unique_critical_count = len(critical_errors_found) cascade_count = len(cascading_errors) @@ -3870,12 +3898,14 @@ class HealthMonitor: # Sub-check 3: Failed login attempts (brute force detection) try: - result = subprocess.run( - ['journalctl', '--since', '24 hours ago', '--no-pager'], - capture_output=True, - text=True, - timeout=3 - ) + result = subprocess.run( + ['journalctl', '--since', '24 hours ago', '--no-pager', + '-g', 'authentication failure|failed password|invalid user', + '--output=cat', '-n', '5000'], + capture_output=True, + text=True, + timeout=5 + ) failed_logins = 0 if result.returncode == 0: diff --git a/AppImage/scripts/health_persistence.py b/AppImage/scripts/health_persistence.py index d3c3c291..188206cc 100644 --- a/AppImage/scripts/health_persistence.py +++ b/AppImage/scripts/health_persistence.py @@ -18,6 +18,7 @@ import sqlite3 import json import os import threading +from contextlib import contextmanager from datetime import datetime, timedelta from typing import Dict, List, Any, Optional from pathlib import Path @@ -59,6 +60,24 @@ class HealthPersistence: conn.execute('PRAGMA busy_timeout=5000') return conn + @contextmanager + def _db_connection(self, row_factory: bool = False): + """Context manager for safe database connections (B4 fix). + + Ensures connections are always closed, even if exceptions occur. + Usage: + with self._db_connection() as conn: + cursor = conn.cursor() + ... + """ + conn = self._get_conn() + if row_factory: + conn.row_factory = sqlite3.Row + try: + yield conn + finally: + conn.close() + def _init_database(self): """Initialize SQLite database with required tables""" conn = self._get_conn() @@ -345,7 +364,8 @@ class HealthPersistence: if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'): setting_key = self.CATEGORY_SETTING_MAP.get(category, '') if setting_key: - stored = self.get_setting(setting_key) + # P4 fix: use _get_setting_impl with existing connection to avoid deadlock + stored = self._get_setting_impl(conn, setting_key) if stored is not None: configured_hours = int(stored) if configured_hours != self.DEFAULT_SUPPRESSION_HOURS: @@ -411,54 +431,54 @@ class HealthPersistence: - Error is active (unresolved and not acknowledged), OR - Error is dismissed but still within its suppression period """ - conn = self._get_conn() - cursor = conn.cursor() - - # First check: is the error active (unresolved and not acknowledged)? - if category: - cursor.execute(''' - SELECT COUNT(*) FROM errors - WHERE error_key = ? AND category = ? - AND resolved_at IS NULL AND acknowledged = 0 - ''', (error_key, category)) - else: - cursor.execute(''' - SELECT COUNT(*) FROM errors - WHERE error_key = ? - AND resolved_at IS NULL AND acknowledged = 0 - ''', (error_key,)) - - active_count = cursor.fetchone()[0] - if active_count > 0: - conn.close() - return True - - # Second check: is the error dismissed but still within suppression period? - # This prevents re-recording dismissed errors before their suppression expires - if category: - cursor.execute(''' - SELECT resolved_at, suppression_hours FROM errors - WHERE error_key = ? AND category = ? - AND acknowledged = 1 AND resolved_at IS NOT NULL - ORDER BY resolved_at DESC LIMIT 1 - ''', (error_key, category)) - else: - cursor.execute(''' - SELECT resolved_at, suppression_hours FROM errors - WHERE error_key = ? - AND acknowledged = 1 AND resolved_at IS NOT NULL - ORDER BY resolved_at DESC LIMIT 1 - ''', (error_key,)) - - row = cursor.fetchone() - conn.close() + with self._db_connection() as conn: + cursor = conn.cursor() + + # First check: is the error active (unresolved and not acknowledged)? + if category: + cursor.execute(''' + SELECT COUNT(*) FROM errors + WHERE error_key = ? AND category = ? + AND resolved_at IS NULL AND acknowledged = 0 + ''', (error_key, category)) + else: + cursor.execute(''' + SELECT COUNT(*) FROM errors + WHERE error_key = ? + AND resolved_at IS NULL AND acknowledged = 0 + ''', (error_key,)) + + active_count = cursor.fetchone()[0] + if active_count > 0: + return True + + # Second check: is the error dismissed but still within suppression period? + # This prevents re-recording dismissed errors before their suppression expires + # Note: acknowledged errors may have resolved_at NULL (dismissed but error still exists) + # or resolved_at set (error was dismissed AND condition resolved) + if category: + cursor.execute(''' + SELECT acknowledged_at, suppression_hours FROM errors + WHERE error_key = ? AND category = ? + AND acknowledged = 1 + ORDER BY acknowledged_at DESC LIMIT 1 + ''', (error_key, category)) + else: + cursor.execute(''' + SELECT acknowledged_at, suppression_hours FROM errors + WHERE error_key = ? + AND acknowledged = 1 + ORDER BY acknowledged_at DESC LIMIT 1 + ''', (error_key,)) + + row = cursor.fetchone() if row: - resolved_at_str, suppression_hours = row - if resolved_at_str and suppression_hours: + acknowledged_at_str, suppression_hours = row + if acknowledged_at_str and suppression_hours: try: - resolved_at = datetime.fromisoformat(resolved_at_str) - suppression_end = resolved_at + timedelta(hours=suppression_hours) + acknowledged_at = datetime.fromisoformat(acknowledged_at_str) + suppression_end = acknowledged_at + timedelta(hours=suppression_hours) if datetime.now() < suppression_end: # Still within suppression period - treat as "active" to prevent re-recording return True @@ -542,17 +562,25 @@ class HealthPersistence: ('updates', 'pending_updates'), ('updates', 'kernel_pve'), ('security', 'security_'), ('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'), - ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'), + ('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'), ('logs', 'log_'), ('network', 'net_'), ('temperature', 'temp_')]: if error_key == prefix or error_key.startswith(prefix): category = cat break + # Fallback: if no category matched, try to infer from common patterns + if not category: + if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key: + category = 'disks' + else: + category = 'general' # Use 'general' as ultimate fallback instead of empty string + setting_key = self.CATEGORY_SETTING_MAP.get(category, '') sup_hours = self.DEFAULT_SUPPRESSION_HOURS if setting_key: - stored = self.get_setting(setting_key) + # P4 fix: use _get_setting_impl with existing connection + stored = self._get_setting_impl(conn, setting_key) if stored is not None: try: sup_hours = int(stored) @@ -593,7 +621,8 @@ class HealthPersistence: setting_key = self.CATEGORY_SETTING_MAP.get(category, '') sup_hours = self.DEFAULT_SUPPRESSION_HOURS if setting_key: - stored = self.get_setting(setting_key) + # P4 fix: use _get_setting_impl with existing connection + stored = self._get_setting_impl(conn, setting_key) if stored is not None: try: sup_hours = int(stored) @@ -648,55 +677,63 @@ class HealthPersistence: return result def is_error_acknowledged(self, error_key: str) -> bool: - """Check if an error_key has been acknowledged and is still within suppression window.""" + """Check if an error_key has been acknowledged and is still within suppression window. + + Uses acknowledged_at (not resolved_at) to calculate suppression expiration, + since dismissed errors may have resolved_at = NULL. + """ try: - conn = self._get_conn() - conn.row_factory = sqlite3.Row - cursor = conn.cursor() - cursor.execute( - 'SELECT acknowledged, resolved_at, suppression_hours FROM errors WHERE error_key = ?', - (error_key,)) - row = cursor.fetchone() - conn.close() - if not row: - return False - if not row['acknowledged']: - return False - # Check if still within suppression window - resolved_at = row['resolved_at'] - sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS - if resolved_at: - try: - resolved_dt = datetime.fromisoformat(resolved_at) - if datetime.now() > resolved_dt + timedelta(hours=sup_hours): - return False # Suppression expired - except Exception: - pass - return True + with self._db_connection(row_factory=True) as conn: + cursor = conn.cursor() + cursor.execute( + 'SELECT acknowledged, acknowledged_at, suppression_hours FROM errors WHERE error_key = ?', + (error_key,)) + row = cursor.fetchone() + if not row: + return False + if not row['acknowledged']: + return False + # Check if still within suppression window using acknowledged_at + acknowledged_at = row['acknowledged_at'] + sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS + + # -1 means permanently suppressed + if sup_hours < 0: + return True + + if acknowledged_at: + try: + acknowledged_dt = datetime.fromisoformat(acknowledged_at) + if datetime.now() > acknowledged_dt + timedelta(hours=sup_hours): + return False # Suppression expired + except Exception: + pass + return True except Exception: return False def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]: - """Get all active (unresolved) errors, optionally filtered by category""" - conn = self._get_conn() - conn.row_factory = sqlite3.Row - cursor = conn.cursor() + """Get all active (unresolved AND not acknowledged) errors, optionally filtered by category. - if category: - cursor.execute(''' - SELECT * FROM errors - WHERE resolved_at IS NULL AND category = ? - ORDER BY severity DESC, last_seen DESC - ''', (category,)) - else: - cursor.execute(''' - SELECT * FROM errors - WHERE resolved_at IS NULL - ORDER BY severity DESC, last_seen DESC - ''') - - rows = cursor.fetchall() - conn.close() + Acknowledged errors are excluded since they have been dismissed by the user. + """ + with self._db_connection(row_factory=True) as conn: + cursor = conn.cursor() + + if category: + cursor.execute(''' + SELECT * FROM errors + WHERE resolved_at IS NULL AND acknowledged = 0 AND category = ? + ORDER BY severity DESC, last_seen DESC + ''', (category,)) + else: + cursor.execute(''' + SELECT * FROM errors + WHERE resolved_at IS NULL AND acknowledged = 0 + ORDER BY severity DESC, last_seen DESC + ''') + + rows = cursor.fetchall() errors = [] for row in rows: @@ -850,11 +887,11 @@ class HealthPersistence: conn.row_factory = sqlite3.Row cursor = conn.cursor() - cursor.execute(''' - SELECT * FROM errors - WHERE acknowledged = 1 AND resolved_at IS NOT NULL - ORDER BY resolved_at DESC - ''') + cursor.execute(''' + SELECT * FROM errors + WHERE acknowledged = 1 + ORDER BY acknowledged_at DESC + ''') rows = cursor.fetchall() conn.close() @@ -871,8 +908,12 @@ class HealthPersistence: pass # Check if still within suppression period using per-record hours + # Use acknowledged_at as reference (resolved_at may be NULL for dismissed but active errors) try: - resolved_dt = datetime.fromisoformat(error_dict['resolved_at']) + ref_time_str = error_dict.get('acknowledged_at') or error_dict.get('resolved_at') + if not ref_time_str: + continue + ref_dt = datetime.fromisoformat(ref_time_str) sup_hours = error_dict.get('suppression_hours') if sup_hours is None: sup_hours = self.DEFAULT_SUPPRESSION_HOURS @@ -885,7 +926,7 @@ class HealthPersistence: error_dict['permanent'] = True dismissed.append(error_dict) else: - elapsed_seconds = (now - resolved_dt).total_seconds() + elapsed_seconds = (now - ref_dt).total_seconds() suppression_seconds = sup_hours * 3600 if elapsed_seconds < suppression_seconds: @@ -971,12 +1012,14 @@ class HealthPersistence: conn = self._get_conn() cursor = conn.cursor() - for event_id in event_ids: - cursor.execute(''' - UPDATE events - SET data = json_set(COALESCE(data, '{}'), '$.needs_notification', 0, '$.notified_at', ?) - WHERE id = ? - ''', (datetime.now().isoformat(), event_id)) + # Use single UPDATE with IN clause instead of N individual updates + now = datetime.now().isoformat() + placeholders = ','.join('?' * len(event_ids)) + cursor.execute(f''' + UPDATE events + SET data = json_set(COALESCE(data, '{{}}'), '$.needs_notification', 0, '$.notified_at', ?) + WHERE id IN ({placeholders}) + ''', [now] + event_ids) conn.commit() conn.close() @@ -1074,13 +1117,16 @@ class HealthPersistence: def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]: """Get a user setting value by key.""" - conn = self._get_conn() + with self._db_connection() as conn: + return self._get_setting_impl(conn, key, default) + + def _get_setting_impl(self, conn, key: str, default: Optional[str] = None) -> Optional[str]: + """Internal: get setting using existing connection (P4 fix - avoids nested connections).""" cursor = conn.cursor() cursor.execute( 'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,) ) row = cursor.fetchone() - conn.close() return row[0] if row else default def set_setting(self, key: str, value: str):