mirror of
https://github.com/MacRimi/ProxMenux.git
synced 2026-04-20 11:02:15 +00:00
Update health monitor
This commit is contained in:
@@ -214,7 +214,7 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
const refreshInterval = setInterval(fetchHealthDetails, 300000)
|
const refreshInterval = setInterval(fetchHealthDetails, 300000)
|
||||||
return () => clearInterval(refreshInterval)
|
return () => clearInterval(refreshInterval)
|
||||||
}
|
}
|
||||||
}, [open])
|
}, [open, fetchHealthDetails])
|
||||||
|
|
||||||
// Auto-expand non-OK categories when data loads
|
// Auto-expand non-OK categories when data loads
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -506,13 +506,13 @@ export function HealthStatusModal({ open, onOpenChange, getApiUrl }: HealthStatu
|
|||||||
size="sm"
|
size="sm"
|
||||||
variant="outline"
|
variant="outline"
|
||||||
className="h-5 px-1 sm:px-1.5 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent text-[10px]"
|
className="h-5 px-1 sm:px-1.5 shrink-0 hover:bg-red-500/10 hover:border-red-500/50 bg-transparent text-[10px]"
|
||||||
disabled={dismissingKey === checkKey}
|
disabled={dismissingKey === (checkData.error_key || checkKey)}
|
||||||
onClick={(e) => {
|
onClick={(e) => {
|
||||||
e.stopPropagation()
|
e.stopPropagation()
|
||||||
handleAcknowledge(checkData.error_key || checkKey, e)
|
handleAcknowledge(checkData.error_key || checkKey, e)
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
{dismissingKey === checkKey ? (
|
{dismissingKey === (checkData.error_key || checkKey) ? (
|
||||||
<Loader2 className="h-3 w-3 animate-spin" />
|
<Loader2 className="h-3 w-3 animate-spin" />
|
||||||
) : (
|
) : (
|
||||||
<>
|
<>
|
||||||
|
|||||||
@@ -189,6 +189,20 @@ class HealthMonitor:
|
|||||||
# PVE Critical Services
|
# PVE Critical Services
|
||||||
PVE_SERVICES = ['pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster']
|
PVE_SERVICES = ['pveproxy', 'pvedaemon', 'pvestatd', 'pve-cluster']
|
||||||
|
|
||||||
|
# P2 fix: Pre-compiled regex patterns for performance (avoid re-compiling on every line)
|
||||||
|
_BENIGN_RE = None
|
||||||
|
_CRITICAL_RE = None
|
||||||
|
_WARNING_RE = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _get_compiled_patterns(cls):
|
||||||
|
"""Lazily compile regex patterns once"""
|
||||||
|
if cls._BENIGN_RE is None:
|
||||||
|
cls._BENIGN_RE = re.compile("|".join(cls.BENIGN_ERROR_PATTERNS), re.IGNORECASE)
|
||||||
|
cls._CRITICAL_RE = re.compile("|".join(cls.CRITICAL_LOG_KEYWORDS), re.IGNORECASE)
|
||||||
|
cls._WARNING_RE = re.compile("|".join(cls.WARNING_LOG_KEYWORDS), re.IGNORECASE)
|
||||||
|
return cls._BENIGN_RE, cls._CRITICAL_RE, cls._WARNING_RE
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""Initialize health monitor with state tracking"""
|
"""Initialize health monitor with state tracking"""
|
||||||
self.state_history = defaultdict(list)
|
self.state_history = defaultdict(list)
|
||||||
@@ -199,6 +213,7 @@ class HealthMonitor:
|
|||||||
self.failed_vm_history = set() # Track VMs that failed to start
|
self.failed_vm_history = set() # Track VMs that failed to start
|
||||||
self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})
|
self.persistent_log_patterns = defaultdict(lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0})
|
||||||
self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category
|
self._unknown_counts = {} # Track consecutive UNKNOWN cycles per category
|
||||||
|
self._last_cleanup_time = 0 # Throttle cleanup_old_errors calls
|
||||||
|
|
||||||
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
|
# System capabilities - derived from Proxmox storage types at runtime (Priority 1.5)
|
||||||
# SMART detection still uses filesystem check on init (lightweight)
|
# SMART detection still uses filesystem check on init (lightweight)
|
||||||
@@ -380,12 +395,15 @@ class HealthMonitor:
|
|||||||
Returns JSON structure with ALL 10 categories always present.
|
Returns JSON structure with ALL 10 categories always present.
|
||||||
Now includes persistent error tracking.
|
Now includes persistent error tracking.
|
||||||
"""
|
"""
|
||||||
# Run cleanup on every status check so stale errors are auto-resolved
|
# Run cleanup with throttle (every 5 min) so stale errors are auto-resolved
|
||||||
# using the user-configured Suppression Duration (single source of truth).
|
# using the user-configured Suppression Duration (single source of truth).
|
||||||
try:
|
current_time = time.time()
|
||||||
health_persistence.cleanup_old_errors()
|
if current_time - self._last_cleanup_time > 300: # 5 minutes
|
||||||
except Exception:
|
try:
|
||||||
pass
|
health_persistence.cleanup_old_errors()
|
||||||
|
self._last_cleanup_time = current_time
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
active_errors = health_persistence.get_active_errors()
|
active_errors = health_persistence.get_active_errors()
|
||||||
# No need to create persistent_issues dict here, it's implicitly handled by the checks
|
# No need to create persistent_issues dict here, it's implicitly handled by the checks
|
||||||
@@ -1319,7 +1337,7 @@ class HealthMonitor:
|
|||||||
'device': f'/dev/{device_name}',
|
'device': f'/dev/{device_name}',
|
||||||
'serial': serial,
|
'serial': serial,
|
||||||
'model': model,
|
'model': model,
|
||||||
'error_key': error_info.get('error_key') or f'disk_{device_name}',
|
'error_key': error_info.get('error_key') or f'disk_smart_{device_name}',
|
||||||
'dismissable': error_info.get('dismissable', True),
|
'dismissable': error_info.get('dismissable', True),
|
||||||
'is_disk_entry': True,
|
'is_disk_entry': True,
|
||||||
}
|
}
|
||||||
@@ -2843,12 +2861,9 @@ class HealthMonitor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
def _is_benign_error(self, line: str) -> bool:
|
def _is_benign_error(self, line: str) -> bool:
|
||||||
"""Check if log line matches benign error patterns"""
|
"""Check if log line matches benign error patterns (uses pre-compiled regex)"""
|
||||||
line_lower = line.lower()
|
benign_re, _, _ = self._get_compiled_patterns()
|
||||||
for pattern in self.BENIGN_ERROR_PATTERNS:
|
return bool(benign_re.search(line.lower()))
|
||||||
if re.search(pattern, line_lower):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _enrich_critical_log_reason(self, line: str) -> str:
|
def _enrich_critical_log_reason(self, line: str) -> str:
|
||||||
"""
|
"""
|
||||||
@@ -2969,7 +2984,7 @@ class HealthMonitor:
|
|||||||
# Generic classification -- very conservative to avoid false positives.
|
# Generic classification -- very conservative to avoid false positives.
|
||||||
# Only escalate if the line explicitly uses severity-level keywords
|
# Only escalate if the line explicitly uses severity-level keywords
|
||||||
# from the kernel or systemd (not just any line containing "error").
|
# from the kernel or systemd (not just any line containing "error").
|
||||||
if 'kernel panic' in line_lower or 'fatal' in line_lower and 'non-fatal' not in line_lower:
|
if 'kernel panic' in line_lower or ('fatal' in line_lower and 'non-fatal' not in line_lower):
|
||||||
return 'CRITICAL'
|
return 'CRITICAL'
|
||||||
|
|
||||||
# Lines from priority "err" that don't match any keyword above are
|
# Lines from priority "err" that don't match any keyword above are
|
||||||
@@ -3164,7 +3179,7 @@ class HealthMonitor:
|
|||||||
device_name=base_device,
|
device_name=base_device,
|
||||||
serial=obs_serial,
|
serial=obs_serial,
|
||||||
error_type='filesystem_error',
|
error_type='filesystem_error',
|
||||||
error_signature=f'fs_error_{fs_device}_{pattern_key}',
|
error_signature=f'fs_error_{fs_device}_{pattern_hash}',
|
||||||
raw_message=enriched_reason[:500],
|
raw_message=enriched_reason[:500],
|
||||||
severity=fs_severity.lower(),
|
severity=fs_severity.lower(),
|
||||||
)
|
)
|
||||||
@@ -3253,12 +3268,25 @@ class HealthMonitor:
|
|||||||
'dismissable': True, 'occurrences': data['count']}
|
'dismissable': True, 'occurrences': data['count']}
|
||||||
)
|
)
|
||||||
|
|
||||||
patterns_to_remove = [
|
patterns_to_remove = [
|
||||||
p for p, data in self.persistent_log_patterns.items()
|
p for p, data in self.persistent_log_patterns.items()
|
||||||
if current_time - data['last_seen'] > 1800
|
if current_time - data['last_seen'] > 1800
|
||||||
]
|
]
|
||||||
for pattern in patterns_to_remove:
|
for pattern in patterns_to_remove:
|
||||||
del self.persistent_log_patterns[pattern]
|
del self.persistent_log_patterns[pattern]
|
||||||
|
|
||||||
|
# B5 fix: Cap size to prevent unbounded memory growth under high error load
|
||||||
|
MAX_LOG_PATTERNS = 500
|
||||||
|
if len(self.persistent_log_patterns) > MAX_LOG_PATTERNS:
|
||||||
|
sorted_patterns = sorted(
|
||||||
|
self.persistent_log_patterns.items(),
|
||||||
|
key=lambda x: x[1]['last_seen'],
|
||||||
|
reverse=True
|
||||||
|
)
|
||||||
|
self.persistent_log_patterns = defaultdict(
|
||||||
|
lambda: {'count': 0, 'first_seen': 0, 'last_seen': 0},
|
||||||
|
dict(sorted_patterns[:MAX_LOG_PATTERNS])
|
||||||
|
)
|
||||||
|
|
||||||
unique_critical_count = len(critical_errors_found)
|
unique_critical_count = len(critical_errors_found)
|
||||||
cascade_count = len(cascading_errors)
|
cascade_count = len(cascading_errors)
|
||||||
@@ -3870,12 +3898,14 @@ class HealthMonitor:
|
|||||||
|
|
||||||
# Sub-check 3: Failed login attempts (brute force detection)
|
# Sub-check 3: Failed login attempts (brute force detection)
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['journalctl', '--since', '24 hours ago', '--no-pager'],
|
['journalctl', '--since', '24 hours ago', '--no-pager',
|
||||||
capture_output=True,
|
'-g', 'authentication failure|failed password|invalid user',
|
||||||
text=True,
|
'--output=cat', '-n', '5000'],
|
||||||
timeout=3
|
capture_output=True,
|
||||||
)
|
text=True,
|
||||||
|
timeout=5
|
||||||
|
)
|
||||||
|
|
||||||
failed_logins = 0
|
failed_logins = 0
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ import sqlite3
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import threading
|
import threading
|
||||||
|
from contextlib import contextmanager
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import Dict, List, Any, Optional
|
from typing import Dict, List, Any, Optional
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -59,6 +60,24 @@ class HealthPersistence:
|
|||||||
conn.execute('PRAGMA busy_timeout=5000')
|
conn.execute('PRAGMA busy_timeout=5000')
|
||||||
return conn
|
return conn
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _db_connection(self, row_factory: bool = False):
|
||||||
|
"""Context manager for safe database connections (B4 fix).
|
||||||
|
|
||||||
|
Ensures connections are always closed, even if exceptions occur.
|
||||||
|
Usage:
|
||||||
|
with self._db_connection() as conn:
|
||||||
|
cursor = conn.cursor()
|
||||||
|
...
|
||||||
|
"""
|
||||||
|
conn = self._get_conn()
|
||||||
|
if row_factory:
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
try:
|
||||||
|
yield conn
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
def _init_database(self):
|
def _init_database(self):
|
||||||
"""Initialize SQLite database with required tables"""
|
"""Initialize SQLite database with required tables"""
|
||||||
conn = self._get_conn()
|
conn = self._get_conn()
|
||||||
@@ -345,7 +364,8 @@ class HealthPersistence:
|
|||||||
if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
|
if not (error_key == 'cpu_temperature' and severity == 'CRITICAL'):
|
||||||
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
||||||
if setting_key:
|
if setting_key:
|
||||||
stored = self.get_setting(setting_key)
|
# P4 fix: use _get_setting_impl with existing connection to avoid deadlock
|
||||||
|
stored = self._get_setting_impl(conn, setting_key)
|
||||||
if stored is not None:
|
if stored is not None:
|
||||||
configured_hours = int(stored)
|
configured_hours = int(stored)
|
||||||
if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
|
if configured_hours != self.DEFAULT_SUPPRESSION_HOURS:
|
||||||
@@ -411,54 +431,54 @@ class HealthPersistence:
|
|||||||
- Error is active (unresolved and not acknowledged), OR
|
- Error is active (unresolved and not acknowledged), OR
|
||||||
- Error is dismissed but still within its suppression period
|
- Error is dismissed but still within its suppression period
|
||||||
"""
|
"""
|
||||||
conn = self._get_conn()
|
with self._db_connection() as conn:
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
# First check: is the error active (unresolved and not acknowledged)?
|
# First check: is the error active (unresolved and not acknowledged)?
|
||||||
if category:
|
if category:
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT COUNT(*) FROM errors
|
SELECT COUNT(*) FROM errors
|
||||||
WHERE error_key = ? AND category = ?
|
WHERE error_key = ? AND category = ?
|
||||||
AND resolved_at IS NULL AND acknowledged = 0
|
AND resolved_at IS NULL AND acknowledged = 0
|
||||||
''', (error_key, category))
|
''', (error_key, category))
|
||||||
else:
|
else:
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT COUNT(*) FROM errors
|
SELECT COUNT(*) FROM errors
|
||||||
WHERE error_key = ?
|
WHERE error_key = ?
|
||||||
AND resolved_at IS NULL AND acknowledged = 0
|
AND resolved_at IS NULL AND acknowledged = 0
|
||||||
''', (error_key,))
|
''', (error_key,))
|
||||||
|
|
||||||
active_count = cursor.fetchone()[0]
|
active_count = cursor.fetchone()[0]
|
||||||
if active_count > 0:
|
if active_count > 0:
|
||||||
conn.close()
|
return True
|
||||||
return True
|
|
||||||
|
|
||||||
# Second check: is the error dismissed but still within suppression period?
|
# Second check: is the error dismissed but still within suppression period?
|
||||||
# This prevents re-recording dismissed errors before their suppression expires
|
# This prevents re-recording dismissed errors before their suppression expires
|
||||||
if category:
|
# Note: acknowledged errors may have resolved_at NULL (dismissed but error still exists)
|
||||||
cursor.execute('''
|
# or resolved_at set (error was dismissed AND condition resolved)
|
||||||
SELECT resolved_at, suppression_hours FROM errors
|
if category:
|
||||||
WHERE error_key = ? AND category = ?
|
cursor.execute('''
|
||||||
AND acknowledged = 1 AND resolved_at IS NOT NULL
|
SELECT acknowledged_at, suppression_hours FROM errors
|
||||||
ORDER BY resolved_at DESC LIMIT 1
|
WHERE error_key = ? AND category = ?
|
||||||
''', (error_key, category))
|
AND acknowledged = 1
|
||||||
else:
|
ORDER BY acknowledged_at DESC LIMIT 1
|
||||||
cursor.execute('''
|
''', (error_key, category))
|
||||||
SELECT resolved_at, suppression_hours FROM errors
|
else:
|
||||||
WHERE error_key = ?
|
cursor.execute('''
|
||||||
AND acknowledged = 1 AND resolved_at IS NOT NULL
|
SELECT acknowledged_at, suppression_hours FROM errors
|
||||||
ORDER BY resolved_at DESC LIMIT 1
|
WHERE error_key = ?
|
||||||
''', (error_key,))
|
AND acknowledged = 1
|
||||||
|
ORDER BY acknowledged_at DESC LIMIT 1
|
||||||
|
''', (error_key,))
|
||||||
|
|
||||||
row = cursor.fetchone()
|
row = cursor.fetchone()
|
||||||
conn.close()
|
|
||||||
|
|
||||||
if row:
|
if row:
|
||||||
resolved_at_str, suppression_hours = row
|
acknowledged_at_str, suppression_hours = row
|
||||||
if resolved_at_str and suppression_hours:
|
if acknowledged_at_str and suppression_hours:
|
||||||
try:
|
try:
|
||||||
resolved_at = datetime.fromisoformat(resolved_at_str)
|
acknowledged_at = datetime.fromisoformat(acknowledged_at_str)
|
||||||
suppression_end = resolved_at + timedelta(hours=suppression_hours)
|
suppression_end = acknowledged_at + timedelta(hours=suppression_hours)
|
||||||
if datetime.now() < suppression_end:
|
if datetime.now() < suppression_end:
|
||||||
# Still within suppression period - treat as "active" to prevent re-recording
|
# Still within suppression period - treat as "active" to prevent re-recording
|
||||||
return True
|
return True
|
||||||
@@ -542,17 +562,25 @@ class HealthPersistence:
|
|||||||
('updates', 'pending_updates'), ('updates', 'kernel_pve'),
|
('updates', 'pending_updates'), ('updates', 'kernel_pve'),
|
||||||
('security', 'security_'),
|
('security', 'security_'),
|
||||||
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
|
('pve_services', 'pve_service_'), ('vms', 'vmct_'), ('vms', 'vm_'), ('vms', 'ct_'),
|
||||||
('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
|
('disks', 'disk_smart_'), ('disks', 'disk_'), ('disks', 'smart_'), ('disks', 'zfs_pool_'),
|
||||||
('logs', 'log_'), ('network', 'net_'),
|
('logs', 'log_'), ('network', 'net_'),
|
||||||
('temperature', 'temp_')]:
|
('temperature', 'temp_')]:
|
||||||
if error_key == prefix or error_key.startswith(prefix):
|
if error_key == prefix or error_key.startswith(prefix):
|
||||||
category = cat
|
category = cat
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Fallback: if no category matched, try to infer from common patterns
|
||||||
|
if not category:
|
||||||
|
if 'disk' in error_key or 'smart' in error_key or 'sda' in error_key or 'sdb' in error_key or 'nvme' in error_key:
|
||||||
|
category = 'disks'
|
||||||
|
else:
|
||||||
|
category = 'general' # Use 'general' as ultimate fallback instead of empty string
|
||||||
|
|
||||||
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
||||||
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
||||||
if setting_key:
|
if setting_key:
|
||||||
stored = self.get_setting(setting_key)
|
# P4 fix: use _get_setting_impl with existing connection
|
||||||
|
stored = self._get_setting_impl(conn, setting_key)
|
||||||
if stored is not None:
|
if stored is not None:
|
||||||
try:
|
try:
|
||||||
sup_hours = int(stored)
|
sup_hours = int(stored)
|
||||||
@@ -593,7 +621,8 @@ class HealthPersistence:
|
|||||||
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
setting_key = self.CATEGORY_SETTING_MAP.get(category, '')
|
||||||
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
||||||
if setting_key:
|
if setting_key:
|
||||||
stored = self.get_setting(setting_key)
|
# P4 fix: use _get_setting_impl with existing connection
|
||||||
|
stored = self._get_setting_impl(conn, setting_key)
|
||||||
if stored is not None:
|
if stored is not None:
|
||||||
try:
|
try:
|
||||||
sup_hours = int(stored)
|
sup_hours = int(stored)
|
||||||
@@ -648,55 +677,63 @@ class HealthPersistence:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def is_error_acknowledged(self, error_key: str) -> bool:
|
def is_error_acknowledged(self, error_key: str) -> bool:
|
||||||
"""Check if an error_key has been acknowledged and is still within suppression window."""
|
"""Check if an error_key has been acknowledged and is still within suppression window.
|
||||||
|
|
||||||
|
Uses acknowledged_at (not resolved_at) to calculate suppression expiration,
|
||||||
|
since dismissed errors may have resolved_at = NULL.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
conn = self._get_conn()
|
with self._db_connection(row_factory=True) as conn:
|
||||||
conn.row_factory = sqlite3.Row
|
cursor = conn.cursor()
|
||||||
cursor = conn.cursor()
|
cursor.execute(
|
||||||
cursor.execute(
|
'SELECT acknowledged, acknowledged_at, suppression_hours FROM errors WHERE error_key = ?',
|
||||||
'SELECT acknowledged, resolved_at, suppression_hours FROM errors WHERE error_key = ?',
|
(error_key,))
|
||||||
(error_key,))
|
row = cursor.fetchone()
|
||||||
row = cursor.fetchone()
|
if not row:
|
||||||
conn.close()
|
return False
|
||||||
if not row:
|
if not row['acknowledged']:
|
||||||
return False
|
return False
|
||||||
if not row['acknowledged']:
|
# Check if still within suppression window using acknowledged_at
|
||||||
return False
|
acknowledged_at = row['acknowledged_at']
|
||||||
# Check if still within suppression window
|
sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS
|
||||||
resolved_at = row['resolved_at']
|
|
||||||
sup_hours = row['suppression_hours'] or self.DEFAULT_SUPPRESSION_HOURS
|
# -1 means permanently suppressed
|
||||||
if resolved_at:
|
if sup_hours < 0:
|
||||||
try:
|
return True
|
||||||
resolved_dt = datetime.fromisoformat(resolved_at)
|
|
||||||
if datetime.now() > resolved_dt + timedelta(hours=sup_hours):
|
if acknowledged_at:
|
||||||
return False # Suppression expired
|
try:
|
||||||
except Exception:
|
acknowledged_dt = datetime.fromisoformat(acknowledged_at)
|
||||||
pass
|
if datetime.now() > acknowledged_dt + timedelta(hours=sup_hours):
|
||||||
return True
|
return False # Suppression expired
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return True
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
|
def get_active_errors(self, category: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||||
"""Get all active (unresolved) errors, optionally filtered by category"""
|
"""Get all active (unresolved AND not acknowledged) errors, optionally filtered by category.
|
||||||
conn = self._get_conn()
|
|
||||||
conn.row_factory = sqlite3.Row
|
|
||||||
cursor = conn.cursor()
|
|
||||||
|
|
||||||
if category:
|
Acknowledged errors are excluded since they have been dismissed by the user.
|
||||||
cursor.execute('''
|
"""
|
||||||
SELECT * FROM errors
|
with self._db_connection(row_factory=True) as conn:
|
||||||
WHERE resolved_at IS NULL AND category = ?
|
cursor = conn.cursor()
|
||||||
ORDER BY severity DESC, last_seen DESC
|
|
||||||
''', (category,))
|
|
||||||
else:
|
|
||||||
cursor.execute('''
|
|
||||||
SELECT * FROM errors
|
|
||||||
WHERE resolved_at IS NULL
|
|
||||||
ORDER BY severity DESC, last_seen DESC
|
|
||||||
''')
|
|
||||||
|
|
||||||
rows = cursor.fetchall()
|
if category:
|
||||||
conn.close()
|
cursor.execute('''
|
||||||
|
SELECT * FROM errors
|
||||||
|
WHERE resolved_at IS NULL AND acknowledged = 0 AND category = ?
|
||||||
|
ORDER BY severity DESC, last_seen DESC
|
||||||
|
''', (category,))
|
||||||
|
else:
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT * FROM errors
|
||||||
|
WHERE resolved_at IS NULL AND acknowledged = 0
|
||||||
|
ORDER BY severity DESC, last_seen DESC
|
||||||
|
''')
|
||||||
|
|
||||||
|
rows = cursor.fetchall()
|
||||||
|
|
||||||
errors = []
|
errors = []
|
||||||
for row in rows:
|
for row in rows:
|
||||||
@@ -850,11 +887,11 @@ class HealthPersistence:
|
|||||||
conn.row_factory = sqlite3.Row
|
conn.row_factory = sqlite3.Row
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT * FROM errors
|
SELECT * FROM errors
|
||||||
WHERE acknowledged = 1 AND resolved_at IS NOT NULL
|
WHERE acknowledged = 1
|
||||||
ORDER BY resolved_at DESC
|
ORDER BY acknowledged_at DESC
|
||||||
''')
|
''')
|
||||||
|
|
||||||
rows = cursor.fetchall()
|
rows = cursor.fetchall()
|
||||||
conn.close()
|
conn.close()
|
||||||
@@ -871,8 +908,12 @@ class HealthPersistence:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Check if still within suppression period using per-record hours
|
# Check if still within suppression period using per-record hours
|
||||||
|
# Use acknowledged_at as reference (resolved_at may be NULL for dismissed but active errors)
|
||||||
try:
|
try:
|
||||||
resolved_dt = datetime.fromisoformat(error_dict['resolved_at'])
|
ref_time_str = error_dict.get('acknowledged_at') or error_dict.get('resolved_at')
|
||||||
|
if not ref_time_str:
|
||||||
|
continue
|
||||||
|
ref_dt = datetime.fromisoformat(ref_time_str)
|
||||||
sup_hours = error_dict.get('suppression_hours')
|
sup_hours = error_dict.get('suppression_hours')
|
||||||
if sup_hours is None:
|
if sup_hours is None:
|
||||||
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
sup_hours = self.DEFAULT_SUPPRESSION_HOURS
|
||||||
@@ -885,7 +926,7 @@ class HealthPersistence:
|
|||||||
error_dict['permanent'] = True
|
error_dict['permanent'] = True
|
||||||
dismissed.append(error_dict)
|
dismissed.append(error_dict)
|
||||||
else:
|
else:
|
||||||
elapsed_seconds = (now - resolved_dt).total_seconds()
|
elapsed_seconds = (now - ref_dt).total_seconds()
|
||||||
suppression_seconds = sup_hours * 3600
|
suppression_seconds = sup_hours * 3600
|
||||||
|
|
||||||
if elapsed_seconds < suppression_seconds:
|
if elapsed_seconds < suppression_seconds:
|
||||||
@@ -971,12 +1012,14 @@ class HealthPersistence:
|
|||||||
conn = self._get_conn()
|
conn = self._get_conn()
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
for event_id in event_ids:
|
# Use single UPDATE with IN clause instead of N individual updates
|
||||||
cursor.execute('''
|
now = datetime.now().isoformat()
|
||||||
UPDATE events
|
placeholders = ','.join('?' * len(event_ids))
|
||||||
SET data = json_set(COALESCE(data, '{}'), '$.needs_notification', 0, '$.notified_at', ?)
|
cursor.execute(f'''
|
||||||
WHERE id = ?
|
UPDATE events
|
||||||
''', (datetime.now().isoformat(), event_id))
|
SET data = json_set(COALESCE(data, '{{}}'), '$.needs_notification', 0, '$.notified_at', ?)
|
||||||
|
WHERE id IN ({placeholders})
|
||||||
|
''', [now] + event_ids)
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
@@ -1074,13 +1117,16 @@ class HealthPersistence:
|
|||||||
|
|
||||||
def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
|
def get_setting(self, key: str, default: Optional[str] = None) -> Optional[str]:
|
||||||
"""Get a user setting value by key."""
|
"""Get a user setting value by key."""
|
||||||
conn = self._get_conn()
|
with self._db_connection() as conn:
|
||||||
|
return self._get_setting_impl(conn, key, default)
|
||||||
|
|
||||||
|
def _get_setting_impl(self, conn, key: str, default: Optional[str] = None) -> Optional[str]:
|
||||||
|
"""Internal: get setting using existing connection (P4 fix - avoids nested connections)."""
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,)
|
'SELECT setting_value FROM user_settings WHERE setting_key = ?', (key,)
|
||||||
)
|
)
|
||||||
row = cursor.fetchone()
|
row = cursor.fetchone()
|
||||||
conn.close()
|
|
||||||
return row[0] if row else default
|
return row[0] if row else default
|
||||||
|
|
||||||
def set_setting(self, key: str, value: str):
|
def set_setting(self, key: str, value: str):
|
||||||
|
|||||||
Reference in New Issue
Block a user