Update health monitor: throttle stale-error cleanup, pre-compile log regex patterns, add a DB connection context manager, base dismissal suppression on acknowledged_at, and batch event notification updates

This commit is contained in:
MacRimi
2026-03-15 10:03:35 +01:00
parent 26c75e8309
commit e169200f40
3 changed files with 212 additions and 136 deletions

View File

@@ -214,7 +214,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
const refreshInterval = setInterval(fetchHealthDetails, 300000) const refreshInterval = setInterval(fetchHealthDetails, 300000)
return () => clearInterval(refreshInterval) return () => clearInterval(refreshInterval)
} }
}, [open]) }, [open, fetchHealthDetails])
// Auto-expand non-OK categories when data loads // Auto-expand non-OK categories when data loads
useEffect(() => { useEffect(() => {
@@ -506,13 +506,13 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
size="sm" size="sm"
variant="outline" variant="outline"
className="h-5 px-1 sm:px-1.5 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent text-[10px]" className="h-5 px-1 sm:px-1.5 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent text-[10px]"
disabled={dismissingKey === checkKey} disabled={dismissingKey === (checkData.error_key || checkKey)}
onClick={(e) => { onClick={(e) => {
e.stopPropagation() e.stopPropagation()
handleAcknowledge(checkData.error_key || checkKey, e) handleAcknowledge(checkData.error_key || checkKey, e)
}} }}
> >
{dismissingKey === checkKey ? ( {dismissingKey === (checkData.error_key || checkKey) ? (
<Loader2 className="h-3 w-3 animate-spin" /> <Loader2 className="h-3 w-3 animate-spin" />
) : ( ) : (
<> <>

View File

@@ -189,6 +189,20 @@ class HealthMonitor:
# PVE Critical Services # PVE Critical Services
PVE_SERVICES = ['pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster'] PVE_SERVICES = ['pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster']
# P2 fix: Pre-compiled regex patterns for performance (avoid re-compiling on every line)
_BENIGN_RE = None
_CRITICAL_RE = None
_WARNING_RE = None
@classmethod
def _get_compiled_patterns(cls):
"""Lazily compile regex patterns once"""
if cls._BENIGN_RE is None:
cls._BENIGN_RE = re.compile("|".join(cls.BENIGN_ERROR_PATTERNS), re.IGNORECASE)
cls._CRITICAL_RE = re.compile("|".join(cls.CRITICAL_LOG_KEYWORDS), re.IGNORECASE)
cls._WARNING_RE = re.compile("|".join(cls.WARNING_LOG_KEYWORDS), re.IGNORECASE)
return cls._BENIGN_RE, cls._CRITICAL_RE, cls._WARNING_RE
def __init__(self): def __init__(self):
"""Initialize health monitor with state tracking""" """Initialize health monitor with state tracking"""
self.state_history = defaultdict(list) self.state_history = defaultdict(list)
@@ -199,6 +213,7 @@ class HealthMonitor:
self.failed_vm_history = set() # Track VMs that failed to start self.failed_vm_history = set() # Track VMs that failed to start
self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0}) self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})
self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category
self._last_cleanup_time = 0 # Throttle cleanup_old_errors calls
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5) # System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
# SMART detection still uses filesystem check on init (lightweight) # SMART detection still uses filesystem check on init (lightweight)
@@ -380,12 +395,15 @@ class HealthMonitor:
Returns JSON structure with ALL 10 categories always present. Returns JSON structure with ALL 10 categories always present.
Now includes persistent error tracking. Now includes persistent error tracking.
""" """
# Run cleanup on every status check so stale errors are auto-resolved # Run cleanup with throttle (every 5 min) so stale errors are auto-resolved
# using the user-configured Suppression Duration (single source of truth). # using the user-configured Suppression Duration (single source of truth).
try: current_time = time.time()
health_persistence.cleanup_old_errors() if current_time - self._last_cleanup_time > 300: # 5 minutes
except Exception: try:
pass health_persistence.cleanup_old_errors()
self._last_cleanup_time = current_time
except Exception:
pass
active_errors = health_persistence.get_active_errors() active_errors = health_persistence.get_active_errors()
# No need to create persistent_issues dict here, it's implicitly handled by the checks # No need to create persistent_issues dict here, it's implicitly handled by the checks
@@ -1319,7 +1337,7 @@ class HealthMonitor:
'device': f'/dev/{device_name}', 'device': f'/dev/{device_name}',
'serial': serial, 'serial': serial,
'model': model, 'model': model,
'error_key': error_info.get('error_key') or f'disk_{device_name}', 'error_key': error_info.get('error_key') or f'disk_smart_{device_name}',
'dismissable': error_info.get('dismissable', True), 'dismissable': error_info.get('dismissable', True),
'is_disk_entry': True, 'is_disk_entry': True,
} }
@@ -2843,12 +2861,9 @@ class HealthMonitor:
} }
def _is_benign_error(self, line: str) -> bool: def _is_benign_error(self, line: str) -> bool:
"""Check if log line matches benign error patterns""" """Check if log line matches benign error patterns (uses pre-compiled regex)"""
line_lower = line.lower() benign_re, _, _ = self._get_compiled_patterns()
for pattern in self.BENIGN_ERROR_PATTERNS: return bool(benign_re.search(line.lower()))
if re.search(pattern, line_lower):
return True
return False
def _enrich_critical_log_reason(self, line: str) -> str: def _enrich_critical_log_reason(self, line: str) -> str:
""" """
@@ -2969,7 +2984,7 @@ class HealthMonitor:
# Generic classification -- very conservative to avoid false positives. # Generic classification -- very conservative to avoid false positives.
# Only escalate if the line explicitly uses severity-level keywords # Only escalate if the line explicitly uses severity-level keywords
# from the kernel or systemd (not just any line containing "error"). # from the kernel or systemd (not just any line containing "error").
if 'kernel panic' in line_lower or 'fatal' in line_lower and 'non-fatal' not in line_lower: if 'kernel panic' in line_lower or ('fatal' in line_lower and 'non-fatal' not in line_lower):
return 'CRITICAL' return 'CRITICAL'
# Lines from priority "err" that don't match any keyword above are # Lines from priority "err" that don't match any keyword above are
@@ -3164,7 +3179,7 @@ class HealthMonitor:
device_name=base_device, device_name=base_device,
serial=obs_serial, serial=obs_serial,
error_type='filesystem_error', error_type='filesystem_error',
error_signature=f'fs_error_{fs_device}_{pattern_key}', error_signature=f'fs_error_{fs_device}_{pattern_hash}',
raw_message=enriched_reason[:500], raw_message=enriched_reason[:500],
severity=fs_severity.lower(), severity=fs_severity.lower(),
) )
@@ -3253,12 +3268,25 @@ class HealthMonitor:
'dismissable': True, 'occurrences': data['count']} 'dismissable': True, 'occurrences': data['count']}
) )
patterns_to_remove = [ patterns_to_remove = [
p for p, data in self.persistent_log_patterns.items() p for p, data in self.persistent_log_patterns.items()
if current_time - data['last_seen'] > 1800 if current_time - data['last_seen'] > 1800
] ]
for pattern in patterns_to_remove: for pattern in patterns_to_remove:
del self.persistent_log_patterns[pattern] del self.persistent_log_patterns[pattern]
# B5 fix: Cap size to prevent unbounded memory growth under high error load
MAX_LOG_PATTERNS = 500
if len(self.persistent_log_patterns) > MAX_LOG_PATTERNS:
sorted_patterns = sorted(
self.persistent_log_patterns.items(),
key=lambda x: x[1]['last_seen'],
reverse=True
)
self.persistent_log_patterns = defaultdict(
lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0},
dict(sorted_patterns[:MAX_LOG_PATTERNS])
)
unique_critical_count = len(critical_errors_found) unique_critical_count = len(critical_errors_found)
cascade_count = len(cascading_errors) cascade_count = len(cascading_errors)
@@ -3870,12 +3898,14 @@ class HealthMonitor:
# Sub-check 3: Failed login attempts (brute force detection) # Sub-check 3: Failed login attempts (brute force detection)
try: try:
result = subprocess.run( result = subprocess.run(
['journalctl', '--since', '24 hours ago', '--no-pager'], ['journalctl', '--since', '24 hours ago', '--no-pager',
capture_output=True, '-g', 'authentication failure|failed password|invalid user',
text=True, '--output=cat', '-n', '5000'],
timeout=3 capture_output=True,
) text=True,
timeout=5
)
failed_logins = 0 failed_logins = 0
if result.returncode == 0: if result.returncode == 0:

View File

@@ -18,6 +18,7 @@ import sqlite3
import json import json
import os import os
import threading import threading
from contextlib import contextmanager
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional from typing import Dict, List, Any, Optional
from pathlib import Path from pathlib import Path
@@ -59,6 +60,24 @@ class HealthPersistence:
conn.execute('PRAGMA busy_timeout=5000') conn.execute('PRAGMA busy_timeout=5000')
return conn return conn
@contextmanager
def _db_connection(self, row_factory: bool = False):
"""Context manager for safe database connections (B4 fix).
Ensures connections are always closed, even if exceptions occur.
Usage:
with self._db_connection() as conn:
cursor = conn.cursor()
...
"""
conn = self._get_conn()
if row_factory:
conn.row_factory = sqlite3.Row
try:
yield conn
finally:
conn.close()
def _init_database(self): def _init_database(self):
"""Initialize SQLite database with required tables""" """Initialize SQLite database with required tables"""
conn = self._get_conn() conn = self._get_conn()
@@ -345,7 +364,8 @@ class HealthPersistence:
if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'): if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
setting_key = self.CATEGORY_SETTING_MAP.get(category, '') setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
if setting_key: if setting_key:
stored = self.get_setting(setting_key) # P4 fix: use _get_setting_impl with existing connection to avoid deadlock
stored = self._get_setting_impl(conn, setting_key)
if stored is not None: if stored is not None:
configured_hours = int(stored) configured_hours = int(stored)
if configured_hours != self.DEFAULT_SUPPRESSION_HOURS: if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
@@ -411,54 +431,54 @@ class HealthPersistence:
- Error is active (unresolved and not acknowledged), OR - Error is active (unresolved and not acknowledged), OR
- Error is dismissed but still within its suppression period - Error is dismissed but still within its suppression period
""" """
conn = self._get_conn() with self._db_connection() as conn:
cursor = conn.cursor() cursor = conn.cursor()
# First check: is the error active (unresolved and not acknowledged)? # First check: is the error active (unresolved and not acknowledged)?
if category: if category:
cursor.execute(''' cursor.execute('''
SELECT COUNT(*) FROM errors SELECT COUNT(*) FROM errors
WHERE error_key = ? AND category = ? WHERE error_key = ? AND category = ?
AND resolved_at IS NULL AND acknowledged = 0 AND resolved_at IS NULL AND acknowledged = 0
''', (error_key, category)) ''', (error_key, category))
else: else:
cursor.execute(''' cursor.execute('''
SELECT COUNT(*) FROM errors SELECT COUNT(*) FROM errors
WHERE error_key = ? WHERE error_key = ?
AND resolved_at IS NULL AND acknowledged = 0 AND resolved_at IS NULL AND acknowledged = 0
''', (error_key,)) ''', (error_key,))
active_count = cursor.fetchone()[0] active_count = cursor.fetchone()[0]
if active_count > 0: if active_count > 0:
conn.close() return True
return True
# Second check: is the error dismissed but still within suppression period? # Second check: is the error dismissed but still within suppression period?
# This prevents re-recording dismissed errors before their suppression expires # This prevents re-recording dismissed errors before their suppression expires
if category: # Note: acknowledged errors may have resolved_at NULL (dismissed but error still exists)
cursor.execute(''' # or resolved_at set (error was dismissed AND condition resolved)
SELECT resolved_at, suppression_hours FROM errors if category:
WHERE error_key = ? AND category = ? cursor.execute('''
AND acknowledged = 1 AND resolved_at IS NOT NULL SELECT acknowledged_at, suppression_hours FROM errors
ORDER BY resolved_at DESC LIMIT 1 WHERE error_key = ? AND category = ?
''', (error_key, category)) AND acknowledged = 1
else: ORDER BY acknowledged_at DESC LIMIT 1
cursor.execute(''' ''', (error_key, category))
SELECT resolved_at, suppression_hours FROM errors else:
WHERE error_key = ? cursor.execute('''
AND acknowledged = 1 AND resolved_at IS NOT NULL SELECT acknowledged_at, suppression_hours FROM errors
ORDER BY resolved_at DESC LIMIT 1 WHERE error_key = ?
''', (error_key,)) AND acknowledged = 1
ORDER BY acknowledged_at DESC LIMIT 1
''', (error_key,))
row = cursor.fetchone() row = cursor.fetchone()
conn.close()
if row: if row:
resolved_at_str, suppression_hours = row acknowledged_at_str, suppression_hours = row
if resolved_at_str and suppression_hours: if acknowledged_at_str and suppression_hours:
try: try:
resolved_at = datetime.fromisoformat(resolved_at_str) acknowledged_at = datetime.fromisoformat(acknowledged_at_str)
suppression_end = resolved_at + timedelta(hours=suppression_hours) suppression_end = acknowledged_at + timedelta(hours=suppression_hours)
if datetime.now() < suppression_end: if datetime.now() < suppression_end:
# Still within suppression period - treat as "active" to prevent re-recording # Still within suppression period - treat as "active" to prevent re-recording
return True return True
@@ -542,17 +562,25 @@ class HealthPersistence:
('updates', 'pending_updates'), ('updates', 'kernel_pve'), ('updates', 'pending_updates'), ('updates', 'kernel_pve'),
('security', 'security_'), ('security', 'security_'),
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'), ('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'), ('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
('logs', 'log_'), ('network', 'net_'), ('logs', 'log_'), ('network', 'net_'),
('temperature', 'temp_')]: ('temperature', 'temp_')]:
if error_key == prefix or error_key.startswith(prefix): if error_key == prefix or error_key.startswith(prefix):
category = cat category = cat
break break
# Fallback: if no category matched, try to infer from common patterns
if not category:
if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key:
category = 'disks'
else:
category = 'general' # Use 'general' as ultimate fallback instead of empty string
setting_key = self.CATEGORY_SETTING_MAP.get(category, '') setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
sup_hours = self.DEFAULT_SUPPRESSION_HOURS sup_hours = self.DEFAULT_SUPPRESSION_HOURS
if setting_key: if setting_key:
stored = self.get_setting(setting_key) # P4 fix: use _get_setting_impl with existing connection
stored = self._get_setting_impl(conn, setting_key)
if stored is not None: if stored is not None:
try: try:
sup_hours = int(stored) sup_hours = int(stored)
@@ -593,7 +621,8 @@ class HealthPersistence:
setting_key = self.CATEGORY_SETTING_MAP.get(category, '') setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
sup_hours = self.DEFAULT_SUPPRESSION_HOURS sup_hours = self.DEFAULT_SUPPRESSION_HOURS
if setting_key: if setting_key:
stored = self.get_setting(setting_key) # P4 fix: use _get_setting_impl with existing connection
stored = self._get_setting_impl(conn, setting_key)
if stored is not None: if stored is not None:
try: try:
sup_hours = int(stored) sup_hours = int(stored)
@@ -648,55 +677,63 @@ class HealthPersistence:
return result return result
def is_error_acknowledged(self, error_key: str) -> bool: def is_error_acknowledged(self, error_key: str) -> bool:
"""Check if an error_key has been acknowledged and is still within suppression window.""" """Check if an error_key has been acknowledged and is still within suppression window.
Uses acknowledged_at (not resolved_at) to calculate suppression expiration,
since dismissed errors may have resolved_at = NULL.
"""
try: try:
conn = self._get_conn() with self._db_connection(row_factory=True) as conn:
conn.row_factory = sqlite3.Row cursor = conn.cursor()
cursor = conn.cursor() cursor.execute(
cursor.execute( 'SELECT acknowledged, acknowledged_at, suppression_hours FROM errors WHERE error_key = ?',
'SELECT acknowledged, resolved_at, suppression_hours FROM errors WHERE error_key = ?', (error_key,))
(error_key,)) row = cursor.fetchone()
row = cursor.fetchone() if not row:
conn.close() return False
if not row: if not row['acknowledged']:
return False return False
if not row['acknowledged']: # Check if still within suppression window using acknowledged_at
return False acknowledged_at = row['acknowledged_at']
# Check if still within suppression window sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS
resolved_at = row['resolved_at']
sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS # -1 means permanently suppressed
if resolved_at: if sup_hours < 0:
try: return True
resolved_dt = datetime.fromisoformat(resolved_at)
if datetime.now() > resolved_dt + timedelta(hours=sup_hours): if acknowledged_at:
return False # Suppression expired try:
except Exception: acknowledged_dt = datetime.fromisoformat(acknowledged_at)
pass if datetime.now() > acknowledged_dt + timedelta(hours=sup_hours):
return True return False # Suppression expired
except Exception:
pass
return True
except Exception: except Exception:
return False return False
def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]: def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get all active (unresolved) errors, optionally filtered by category""" """Get all active (unresolved AND not acknowledged) errors, optionally filtered by category.
conn = self._get_conn()
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
if category: Acknowledged errors are excluded since they have been dismissed by the user.
cursor.execute(''' """
SELECT * FROM errors with self._db_connection(row_factory=True) as conn:
WHERE resolved_at IS NULL AND category = ? cursor = conn.cursor()
ORDER BY severity DESC, last_seen DESC
''', (category,))
else:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL
ORDER BY severity DESC, last_seen DESC
''')
rows = cursor.fetchall() if category:
conn.close() cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL AND acknowledged = 0 AND category = ?
ORDER BY severity DESC, last_seen DESC
''', (category,))
else:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL AND acknowledged = 0
ORDER BY severity DESC, last_seen DESC
''')
rows = cursor.fetchall()
errors = [] errors = []
for row in rows: for row in rows:
@@ -850,11 +887,11 @@ class HealthPersistence:
conn.row_factory = sqlite3.Row conn.row_factory = sqlite3.Row
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute(''' cursor.execute('''
SELECT * FROM errors SELECT * FROM errors
WHERE acknowledged = 1 AND resolved_at IS NOT NULL WHERE acknowledged = 1
ORDER BY resolved_at DESC ORDER BY acknowledged_at DESC
''') ''')
rows = cursor.fetchall() rows = cursor.fetchall()
conn.close() conn.close()
@@ -871,8 +908,12 @@ class HealthPersistence:
pass pass
# Check if still within suppression period using per-record hours # Check if still within suppression period using per-record hours
# Use acknowledged_at as reference (resolved_at may be NULL for dismissed but active errors)
try: try:
resolved_dt = datetime.fromisoformat(error_dict['resolved_at']) ref_time_str = error_dict.get('acknowledged_at') or error_dict.get('resolved_at')
if not ref_time_str:
continue
ref_dt = datetime.fromisoformat(ref_time_str)
sup_hours = error_dict.get('suppression_hours') sup_hours = error_dict.get('suppression_hours')
if sup_hours is None: if sup_hours is None:
sup_hours = self.DEFAULT_SUPPRESSION_HOURS sup_hours = self.DEFAULT_SUPPRESSION_HOURS
@@ -885,7 +926,7 @@ class HealthPersistence:
error_dict['permanent'] = True error_dict['permanent'] = True
dismissed.append(error_dict) dismissed.append(error_dict)
else: else:
elapsed_seconds = (now - resolved_dt).total_seconds() elapsed_seconds = (now - ref_dt).total_seconds()
suppression_seconds = sup_hours * 3600 suppression_seconds = sup_hours * 3600
if elapsed_seconds < suppression_seconds: if elapsed_seconds < suppression_seconds:
@@ -971,12 +1012,14 @@ class HealthPersistence:
conn = self._get_conn() conn = self._get_conn()
cursor = conn.cursor() cursor = conn.cursor()
for event_id in event_ids: # Use single UPDATE with IN clause instead of N individual updates
cursor.execute(''' now = datetime.now().isoformat()
UPDATE events placeholders = ','.join('?' * len(event_ids))
SET data = json_set(COALESCE(data, '{}'), '$.needs_notification', 0, '$.notified_at', ?) cursor.execute(f'''
WHERE id = ? UPDATE events
''', (datetime.now().isoformat(), event_id)) SET data = json_set(COALESCE(data, '{{}}'), '$.needs_notification', 0, '$.notified_at', ?)
WHERE id IN ({placeholders})
''', [now] + event_ids)
conn.commit() conn.commit()
conn.close() conn.close()
@@ -1074,13 +1117,16 @@ class HealthPersistence:
def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]: def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
"""Get a user setting value by key.""" """Get a user setting value by key."""
conn = self._get_conn() with self._db_connection() as conn:
return self._get_setting_impl(conn, key, default)
def _get_setting_impl(self, conn, key: str, default: Optional[str] = None) -> Optional[str]:
"""Internal: get setting using existing connection (P4 fix - avoids nested connections)."""
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute( cursor.execute(
'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,) 'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,)
) )
row = cursor.fetchone() row = cursor.fetchone()
conn.close()
return row[0] if row else default return row[0] if row else default
def set_setting(self, key: str, value: str): def set_setting(self, key: str, value: str):