Update health monitor

This commit is contained in:
MacRimi
2026-03-15 10:03:35 +01:00
parent 26c75e8309
commit e169200f40
3 changed files with 212 additions and 136 deletions

View File

@@ -214,7 +214,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
const refreshInterval = setInterval(fetchHealthDetails, 300000)
return () => clearInterval(refreshInterval)
}
}, [open])
}, [open, fetchHealthDetails])
// Auto-expand non-OK categories when data loads
useEffect(() => {
@@ -506,13 +506,13 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
size="sm"
variant="outline"
className="h-5 px-1 sm:px-1.5 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent text-[10px]"
disabled={dismissingKey === checkKey}
disabled={dismissingKey === (checkData.error_key || checkKey)}
onClick={(e) => {
e.stopPropagation()
handleAcknowledge(checkData.error_key || checkKey, e)
}}
>
{dismissingKey === checkKey ? (
{dismissingKey === (checkData.error_key || checkKey) ? (
<Loader2 className="h-3 w-3 animate-spin" />
) : (
<>

View File

@@ -189,6 +189,20 @@ class HealthMonitor:
# PVE Critical Services
PVE_SERVICES = ['pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster']
# P2 fix: Pre-compiled regex patterns for performance (avoid re-compiling on every line)
_BENIGN_RE = None
_CRITICAL_RE = None
_WARNING_RE = None
@classmethod
def _get_compiled_patterns(cls):
    """Lazily compile and cache the three log-classification regexes.

    Compiled once per class on first use and cached in _BENIGN_RE,
    _CRITICAL_RE and _WARNING_RE; subsequent calls return the cached
    compiled patterns (P2 fix: avoids re-compiling on every log line).

    Each source pattern is wrapped in a non-capturing group ``(?:...)``
    before being joined with ``|`` so that alternations or anchors inside
    an individual pattern cannot bleed into neighbouring patterns and
    alter the combined regex's meaning.

    Returns:
        tuple: (benign_re, critical_re, warning_re) — case-insensitive
        compiled ``re.Pattern`` objects built from BENIGN_ERROR_PATTERNS,
        CRITICAL_LOG_KEYWORDS and WARNING_LOG_KEYWORDS respectively.
    """
    if cls._BENIGN_RE is None:
        def _union(patterns):
            # Isolate each alternative in a non-capturing group so the
            # top-level '|' join cannot change per-pattern precedence.
            return re.compile(
                "|".join(f"(?:{p})" for p in patterns), re.IGNORECASE
            )

        cls._BENIGN_RE = _union(cls.BENIGN_ERROR_PATTERNS)
        cls._CRITICAL_RE = _union(cls.CRITICAL_LOG_KEYWORDS)
        cls._WARNING_RE = _union(cls.WARNING_LOG_KEYWORDS)
    return cls._BENIGN_RE, cls._CRITICAL_RE, cls._WARNING_RE
def __init__(self):
"""Initialize health monitor with state tracking"""
self.state_history = defaultdict(list)
@@ -199,6 +213,7 @@ class HealthMonitor:
self.failed_vm_history = set() # Track VMs that failed to start
self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})
self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category
self._last_cleanup_time = 0 # Throttle cleanup_old_errors calls
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
# SMART detection still uses filesystem check on init (lightweight)
@@ -380,12 +395,15 @@ class HealthMonitor:
Returns JSON structure with ALL 10 categories always present.
Now includes persistent error tracking.
"""
# Run cleanup on every status check so stale errors are auto-resolved
# Run cleanup with throttle (every 5 min) so stale errors are auto-resolved
# using the user-configured Suppression Duration (single source of truth).
try:
health_persistence.cleanup_old_errors()
except Exception:
pass
current_time = time.time()
if current_time - self._last_cleanup_time > 300: # 5 minutes
try:
health_persistence.cleanup_old_errors()
self._last_cleanup_time = current_time
except Exception:
pass
active_errors = health_persistence.get_active_errors()
# No need to create persistent_issues dict here, it's implicitly handled by the checks
@@ -1319,7 +1337,7 @@ class HealthMonitor:
'device': f'/dev/{device_name}',
'serial': serial,
'model': model,
'error_key': error_info.get('error_key') or f'disk_{device_name}',
'error_key': error_info.get('error_key') or f'disk_smart_{device_name}',
'dismissable': error_info.get('dismissable', True),
'is_disk_entry': True,
}
@@ -2843,12 +2861,9 @@ class HealthMonitor:
}
def _is_benign_error(self, line: str) -> bool:
"""Check if log line matches benign error patterns"""
line_lower = line.lower()
for pattern in self.BENIGN_ERROR_PATTERNS:
if re.search(pattern, line_lower):
return True
return False
"""Check if log line matches benign error patterns (uses pre-compiled regex)"""
benign_re, _, _ = self._get_compiled_patterns()
return bool(benign_re.search(line.lower()))
def _enrich_critical_log_reason(self, line: str) -> str:
"""
@@ -2969,7 +2984,7 @@ class HealthMonitor:
# Generic classification -- very conservative to avoid false positives.
# Only escalate if the line explicitly uses severity-level keywords
# from the kernel or systemd (not just any line containing "error").
if 'kernel panic' in line_lower or 'fatal' in line_lower and 'non-fatal' not in line_lower:
if 'kernel panic' in line_lower or ('fatal' in line_lower and 'non-fatal' not in line_lower):
return 'CRITICAL'
# Lines from priority "err" that don't match any keyword above are
@@ -3164,7 +3179,7 @@ class HealthMonitor:
device_name=base_device,
serial=obs_serial,
error_type='filesystem_error',
error_signature=f'fs_error_{fs_device}_{pattern_key}',
error_signature=f'fs_error_{fs_device}_{pattern_hash}',
raw_message=enriched_reason[:500],
severity=fs_severity.lower(),
)
@@ -3253,12 +3268,25 @@ class HealthMonitor:
'dismissable': True, 'occurrences': data['count']}
)
patterns_to_remove = [
p for p, data in self.persistent_log_patterns.items()
if current_time - data['last_seen'] > 1800
]
for pattern in patterns_to_remove:
del self.persistent_log_patterns[pattern]
patterns_to_remove = [
p for p, data in self.persistent_log_patterns.items()
if current_time - data['last_seen'] > 1800
]
for pattern in patterns_to_remove:
del self.persistent_log_patterns[pattern]
# B5 fix: Cap size to prevent unbounded memory growth under high error load
MAX_LOG_PATTERNS = 500
if len(self.persistent_log_patterns) > MAX_LOG_PATTERNS:
sorted_patterns = sorted(
self.persistent_log_patterns.items(),
key=lambda x: x[1]['last_seen'],
reverse=True
)
self.persistent_log_patterns = defaultdict(
lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0},
dict(sorted_patterns[:MAX_LOG_PATTERNS])
)
unique_critical_count = len(critical_errors_found)
cascade_count = len(cascading_errors)
@@ -3870,12 +3898,14 @@ class HealthMonitor:
# Sub-check 3: Failed login attempts (brute force detection)
try:
result = subprocess.run(
['journalctl', '--since', '24 hours ago', '--no-pager'],
capture_output=True,
text=True,
timeout=3
)
result = subprocess.run(
['journalctl', '--since', '24 hours ago', '--no-pager',
'-g', 'authentication failure|failed password|invalid user',
'--output=cat', '-n', '5000'],
capture_output=True,
text=True,
timeout=5
)
failed_logins = 0
if result.returncode == 0:

View File

@@ -18,6 +18,7 @@ import sqlite3
import json
import os
import threading
from contextlib import contextmanager
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from pathlib import Path
@@ -59,6 +60,24 @@ class HealthPersistence:
conn.execute('PRAGMA busy_timeout=5000')
return conn
@contextmanager
def _db_connection(self, row_factory: bool = False):
    """Yield a SQLite connection that is always closed afterwards (B4 fix).

    Guarantees the connection obtained from ``_get_conn`` is released even
    when the body raises.

    Args:
        row_factory: when True, set ``sqlite3.Row`` on the connection so
            fetched rows support name-based column access.

    Usage:
        with self._db_connection() as conn:
            cursor = conn.cursor()
            ...
    """
    connection = self._get_conn()
    if row_factory:
        connection.row_factory = sqlite3.Row
    try:
        yield connection
    finally:
        connection.close()
def _init_database(self):
"""Initialize SQLite database with required tables"""
conn = self._get_conn()
@@ -345,7 +364,8 @@ class HealthPersistence:
if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
if setting_key:
stored = self.get_setting(setting_key)
# P4 fix: use _get_setting_impl with existing connection to avoid deadlock
stored = self._get_setting_impl(conn, setting_key)
if stored is not None:
configured_hours = int(stored)
if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
@@ -411,54 +431,54 @@ class HealthPersistence:
- Error is active (unresolved and not acknowledged), OR
- Error is dismissed but still within its suppression period
"""
conn = self._get_conn()
cursor = conn.cursor()
# First check: is the error active (unresolved and not acknowledged)?
if category:
cursor.execute('''
SELECT COUNT(*) FROM errors
WHERE error_key = ? AND category = ?
AND resolved_at IS NULL AND acknowledged = 0
''', (error_key, category))
else:
cursor.execute('''
SELECT COUNT(*) FROM errors
WHERE error_key = ?
AND resolved_at IS NULL AND acknowledged = 0
''', (error_key,))
active_count = cursor.fetchone()[0]
if active_count > 0:
conn.close()
return True
# Second check: is the error dismissed but still within suppression period?
# This prevents re-recording dismissed errors before their suppression expires
if category:
cursor.execute('''
SELECT resolved_at, suppression_hours FROM errors
WHERE error_key = ? AND category = ?
AND acknowledged = 1 AND resolved_at IS NOT NULL
ORDER BY resolved_at DESC LIMIT 1
''', (error_key, category))
else:
cursor.execute('''
SELECT resolved_at, suppression_hours FROM errors
WHERE error_key = ?
AND acknowledged = 1 AND resolved_at IS NOT NULL
ORDER BY resolved_at DESC LIMIT 1
''', (error_key,))
row = cursor.fetchone()
conn.close()
with self._db_connection() as conn:
cursor = conn.cursor()
# First check: is the error active (unresolved and not acknowledged)?
if category:
cursor.execute('''
SELECT COUNT(*) FROM errors
WHERE error_key = ? AND category = ?
AND resolved_at IS NULL AND acknowledged = 0
''', (error_key, category))
else:
cursor.execute('''
SELECT COUNT(*) FROM errors
WHERE error_key = ?
AND resolved_at IS NULL AND acknowledged = 0
''', (error_key,))
active_count = cursor.fetchone()[0]
if active_count > 0:
return True
# Second check: is the error dismissed but still within suppression period?
# This prevents re-recording dismissed errors before their suppression expires
# Note: acknowledged errors may have resolved_at NULL (dismissed but error still exists)
# or resolved_at set (error was dismissed AND condition resolved)
if category:
cursor.execute('''
SELECT acknowledged_at, suppression_hours FROM errors
WHERE error_key = ? AND category = ?
AND acknowledged = 1
ORDER BY acknowledged_at DESC LIMIT 1
''', (error_key, category))
else:
cursor.execute('''
SELECT acknowledged_at, suppression_hours FROM errors
WHERE error_key = ?
AND acknowledged = 1
ORDER BY acknowledged_at DESC LIMIT 1
''', (error_key,))
row = cursor.fetchone()
if row:
resolved_at_str, suppression_hours = row
if resolved_at_str and suppression_hours:
acknowledged_at_str, suppression_hours = row
if acknowledged_at_str and suppression_hours:
try:
resolved_at = datetime.fromisoformat(resolved_at_str)
suppression_end = resolved_at + timedelta(hours=suppression_hours)
acknowledged_at = datetime.fromisoformat(acknowledged_at_str)
suppression_end = acknowledged_at + timedelta(hours=suppression_hours)
if datetime.now() < suppression_end:
# Still within suppression period - treat as "active" to prevent re-recording
return True
@@ -542,17 +562,25 @@ class HealthPersistence:
('updates', 'pending_updates'), ('updates', 'kernel_pve'),
('security', 'security_'),
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
('logs', 'log_'), ('network', 'net_'),
('temperature', 'temp_')]:
if error_key == prefix or error_key.startswith(prefix):
category = cat
break
# Fallback: if no category matched, try to infer from common patterns
if not category:
if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key:
category = 'disks'
else:
category = 'general' # Use 'general' as ultimate fallback instead of empty string
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
if setting_key:
stored = self.get_setting(setting_key)
# P4 fix: use _get_setting_impl with existing connection
stored = self._get_setting_impl(conn, setting_key)
if stored is not None:
try:
sup_hours = int(stored)
@@ -593,7 +621,8 @@ class HealthPersistence:
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
if setting_key:
stored = self.get_setting(setting_key)
# P4 fix: use _get_setting_impl with existing connection
stored = self._get_setting_impl(conn, setting_key)
if stored is not None:
try:
sup_hours = int(stored)
@@ -648,55 +677,63 @@ class HealthPersistence:
return result
def is_error_acknowledged(self, error_key: str) -> bool:
"""Check if an error_key has been acknowledged and is still within suppression window."""
"""Check if an error_key has been acknowledged and is still within suppression window.
Uses acknowledged_at (not resolved_at) to calculate suppression expiration,
since dismissed errors may have resolved_at = NULL.
"""
try:
conn = self._get_conn()
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
cursor.execute(
'SELECT acknowledged, resolved_at, suppression_hours FROM errors WHERE error_key = ?',
(error_key,))
row = cursor.fetchone()
conn.close()
if not row:
return False
if not row['acknowledged']:
return False
# Check if still within suppression window
resolved_at = row['resolved_at']
sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS
if resolved_at:
try:
resolved_dt = datetime.fromisoformat(resolved_at)
if datetime.now() > resolved_dt + timedelta(hours=sup_hours):
return False # Suppression expired
except Exception:
pass
return True
with self._db_connection(row_factory=True) as conn:
cursor = conn.cursor()
cursor.execute(
'SELECT acknowledged, acknowledged_at, suppression_hours FROM errors WHERE error_key = ?',
(error_key,))
row = cursor.fetchone()
if not row:
return False
if not row['acknowledged']:
return False
# Check if still within suppression window using acknowledged_at
acknowledged_at = row['acknowledged_at']
sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS
# -1 means permanently suppressed
if sup_hours < 0:
return True
if acknowledged_at:
try:
acknowledged_dt = datetime.fromisoformat(acknowledged_at)
if datetime.now() > acknowledged_dt + timedelta(hours=sup_hours):
return False # Suppression expired
except Exception:
pass
return True
except Exception:
return False
def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get all active (unresolved) errors, optionally filtered by category"""
conn = self._get_conn()
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
"""Get all active (unresolved AND not acknowledged) errors, optionally filtered by category.
if category:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL AND category = ?
ORDER BY severity DESC, last_seen DESC
''', (category,))
else:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL
ORDER BY severity DESC, last_seen DESC
''')
rows = cursor.fetchall()
conn.close()
Acknowledged errors are excluded since they have been dismissed by the user.
"""
with self._db_connection(row_factory=True) as conn:
cursor = conn.cursor()
if category:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL AND acknowledged = 0 AND category = ?
ORDER BY severity DESC, last_seen DESC
''', (category,))
else:
cursor.execute('''
SELECT * FROM errors
WHERE resolved_at IS NULL AND acknowledged = 0
ORDER BY severity DESC, last_seen DESC
''')
rows = cursor.fetchall()
errors = []
for row in rows:
@@ -850,11 +887,11 @@ class HealthPersistence:
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
cursor.execute('''
SELECT * FROM errors
WHERE acknowledged = 1 AND resolved_at IS NOT NULL
ORDER BY resolved_at DESC
''')
cursor.execute('''
SELECT * FROM errors
WHERE acknowledged = 1
ORDER BY acknowledged_at DESC
''')
rows = cursor.fetchall()
conn.close()
@@ -871,8 +908,12 @@ class HealthPersistence:
pass
# Check if still within suppression period using per-record hours
# Use acknowledged_at as reference (resolved_at may be NULL for dismissed but active errors)
try:
resolved_dt = datetime.fromisoformat(error_dict['resolved_at'])
ref_time_str = error_dict.get('acknowledged_at') or error_dict.get('resolved_at')
if not ref_time_str:
continue
ref_dt = datetime.fromisoformat(ref_time_str)
sup_hours = error_dict.get('suppression_hours')
if sup_hours is None:
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
@@ -885,7 +926,7 @@ class HealthPersistence:
error_dict['permanent'] = True
dismissed.append(error_dict)
else:
elapsed_seconds = (now - resolved_dt).total_seconds()
elapsed_seconds = (now - ref_dt).total_seconds()
suppression_seconds = sup_hours * 3600
if elapsed_seconds < suppression_seconds:
@@ -971,12 +1012,14 @@ class HealthPersistence:
conn = self._get_conn()
cursor = conn.cursor()
for event_id in event_ids:
cursor.execute('''
UPDATE events
SET data = json_set(COALESCE(data, '{}'), '$.needs_notification', 0, '$.notified_at', ?)
WHERE id = ?
''', (datetime.now().isoformat(), event_id))
# Use single UPDATE with IN clause instead of N individual updates
now = datetime.now().isoformat()
placeholders = ','.join('?' * len(event_ids))
cursor.execute(f'''
UPDATE events
SET data = json_set(COALESCE(data, '{{}}'), '$.needs_notification', 0, '$.notified_at', ?)
WHERE id IN ({placeholders})
''', [now] + event_ids)
conn.commit()
conn.close()
@@ -1074,13 +1117,16 @@ class HealthPersistence:
def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
"""Get a user setting value by key."""
conn = self._get_conn()
with self._db_connection() as conn:
return self._get_setting_impl(conn, key, default)
def _get_setting_impl(self, conn, key: str, default: Optional[str] = None) -> Optional[str]:
"""Internal: get setting using existing connection (P4 fix - avoids nested connections)."""
cursor = conn.cursor()
cursor.execute(
'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,)
)
row = cursor.fetchone()
conn.close()
return row[0] if row else default
def set_setting(self, key: str, value: str):